diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18664 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.998792950832864, + "eval_steps": 500, + "global_step": 12424, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032187977790295325, + "grad_norm": 137.12603459669486, + "learning_rate": 9.99275595621378e-07, + "logits/chosen": 0.90606689453125, + "logits/rejected": 0.8309692144393921, + "logps/chosen": -430.04998779296875, + "logps/rejected": -350.3500061035156, + "loss": 0.6748, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.12717513740062714, + "rewards/margins": 0.054534912109375, + "rewards/rejected": 0.072759248316288, + "step": 10 + }, + { + "epoch": 0.006437595558059065, + "grad_norm": 158.7531922727514, + "learning_rate": 9.984707018673534e-07, + "logits/chosen": 1.0995299816131592, + "logits/rejected": 1.094506859779358, + "logps/chosen": -394.57501220703125, + "logps/rejected": -375.125, + "loss": 0.698, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.2859855592250824, + "rewards/margins": 0.07439880073070526, + "rewards/rejected": 0.21175995469093323, + "step": 20 + }, + { + "epoch": 0.009656393337088598, + "grad_norm": 174.717232383771, + "learning_rate": 9.97665808113329e-07, + "logits/chosen": 0.7853759527206421, + "logits/rejected": 0.8491576910018921, + "logps/chosen": -399.0249938964844, + "logps/rejected": -359.5, + "loss": 0.6351, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.06510619819164276, + "rewards/margins": 0.23707886040210724, + "rewards/rejected": -0.17205505073070526, + "step": 30 + }, + { + "epoch": 0.01287519111611813, + "grad_norm": 136.8139664492458, + "learning_rate": 9.968609143593046e-07, + "logits/chosen": 0.821514904499054, + "logits/rejected": 0.89141845703125, + "logps/chosen": -414.45001220703125, + "logps/rejected": -386.82501220703125, + "loss": 0.6496, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2662109434604645, + "rewards/margins": 0.2652130126953125, + "rewards/rejected": -0.53143310546875, + "step": 40 + }, + { + "epoch": 0.01609398889514766, + "grad_norm": 168.21357769707922, + "learning_rate": 9.9605602060528e-07, + "logits/chosen": 0.7420654296875, + "logits/rejected": 0.703808605670929, + "logps/chosen": -420.26251220703125, + "logps/rejected": -376.54998779296875, + "loss": 0.614, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.30087584257125854, + "rewards/margins": 0.34843140840530396, + "rewards/rejected": -0.6494140625, + "step": 50 + }, + { + "epoch": 0.019312786674177195, + "grad_norm": 88.83026879512722, + "learning_rate": 9.952511268512556e-07, + "logits/chosen": 0.893463134765625, + "logits/rejected": 0.816296398639679, + "logps/chosen": -424.7250061035156, + "logps/rejected": -363.2749938964844, + "loss": 0.6206, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03999938815832138, + "rewards/margins": 0.36439818143844604, + "rewards/rejected": -0.4043945372104645, + "step": 60 + }, + { + "epoch": 0.022531584453206726, + "grad_norm": 141.17623218939528, + "learning_rate": 9.944462330972312e-07, + "logits/chosen": 0.9472290277481079, + "logits/rejected": 0.997802734375, + "logps/chosen": -372.2250061035156, + "logps/rejected": -386.5, + "loss": 0.7379, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08520813286304474, + "rewards/margins": 0.13207702338695526, + "rewards/rejected": -0.04658966138958931, + "step": 70 + }, + { + "epoch": 0.02575038223223626, + "grad_norm": 121.93160933537867, + "learning_rate": 9.936413393432066e-07, + "logits/chosen": 0.9478515386581421, + "logits/rejected": 1.0186767578125, + "logps/chosen": -461.26251220703125, + "logps/rejected": -420.2250061035156, + "loss": 0.5897, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.35172730684280396, + "rewards/margins": 0.480172723531723, + "rewards/rejected": -0.12843552231788635, + "step": 80 + }, + { + "epoch": 0.02896918001126579, + "grad_norm": 191.64138375309489, + "learning_rate": 9.928364455891822e-07, + "logits/chosen": 0.860034167766571, + "logits/rejected": 0.9210449457168579, + "logps/chosen": -441.7250061035156, + "logps/rejected": -384.9750061035156, + "loss": 0.6238, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09489746391773224, + "rewards/margins": 0.38829344511032104, + "rewards/rejected": -0.48321837186813354, + "step": 90 + }, + { + "epoch": 0.03218797779029532, + "grad_norm": 139.81217569379513, + "learning_rate": 9.920315518351577e-07, + "logits/chosen": 0.733569324016571, + "logits/rejected": 0.752941906452179, + "logps/chosen": -381.7250061035156, + "logps/rejected": -353.125, + "loss": 0.7518, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26585692167282104, + "rewards/margins": 0.16349487006664276, + "rewards/rejected": -0.42872315645217896, + "step": 100 + }, + { + "epoch": 0.03540677556932486, + "grad_norm": 161.4173604760746, + "learning_rate": 9.912266580811333e-07, + "logits/chosen": 0.7338012456893921, + "logits/rejected": 0.747174084186554, + "logps/chosen": -426.625, + "logps/rejected": -366.1000061035156, + "loss": 0.6208, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.15743407607078552, + "rewards/margins": 0.39329832792282104, + "rewards/rejected": -0.55059814453125, + "step": 110 + }, + { + "epoch": 0.03862557334835439, + "grad_norm": 130.35767179314732, + "learning_rate": 9.904217643271087e-07, + "logits/chosen": 0.866223156452179, + "logits/rejected": 0.9155944585800171, + "logps/chosen": -414.0249938964844, + "logps/rejected": -365.13751220703125, + "loss": 0.6236, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19197997450828552, + "rewards/margins": 0.4902908205986023, + "rewards/rejected": -0.298483282327652, + "step": 120 + }, + { + "epoch": 0.04184437112738392, + "grad_norm": 130.42831369024495, + "learning_rate": 9.896168705730843e-07, + "logits/chosen": 0.777783215045929, + "logits/rejected": 0.7722533941268921, + "logps/chosen": -417.7749938964844, + "logps/rejected": -345.5249938964844, + "loss": 0.5836, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.27844542264938354, + "rewards/margins": 0.488516241312027, + "rewards/rejected": -0.20972594618797302, + "step": 130 + }, + { + "epoch": 0.04506316890641345, + "grad_norm": 100.96761723306543, + "learning_rate": 9.888119768190599e-07, + "logits/chosen": 1.015112280845642, + "logits/rejected": 1.0008575916290283, + "logps/chosen": -406.48748779296875, + "logps/rejected": -339.17498779296875, + "loss": 0.5593, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.33648985624313354, + "rewards/margins": 0.5818115472793579, + "rewards/rejected": -0.24532166123390198, + "step": 140 + }, + { + "epoch": 0.04828196668544299, + "grad_norm": 148.48882724970096, + "learning_rate": 9.880070830650353e-07, + "logits/chosen": 0.773608386516571, + "logits/rejected": 0.803540050983429, + "logps/chosen": -379.9750061035156, + "logps/rejected": -370.3500061035156, + "loss": 0.5904, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11785431206226349, + "rewards/margins": 0.555102527141571, + "rewards/rejected": -0.43765562772750854, + "step": 150 + }, + { + "epoch": 0.05150076446447252, + "grad_norm": 128.18140582594305, + "learning_rate": 9.872021893110109e-07, + "logits/chosen": 0.8162506222724915, + "logits/rejected": 0.8580383062362671, + "logps/chosen": -423.6625061035156, + "logps/rejected": -345.98748779296875, + "loss": 0.5896, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04999389499425888, + "rewards/margins": 0.5834900140762329, + "rewards/rejected": -0.5336242914199829, + "step": 160 + }, + { + "epoch": 0.05471956224350205, + "grad_norm": 99.59927197387697, + "learning_rate": 9.863972955569864e-07, + "logits/chosen": 0.8972969055175781, + "logits/rejected": 0.983569324016571, + "logps/chosen": -421.54998779296875, + "logps/rejected": -376.20001220703125, + "loss": 0.5977, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.7374267578125, + "rewards/margins": 0.587664783000946, + "rewards/rejected": 0.14959105849266052, + "step": 170 + }, + { + "epoch": 0.05793836002253158, + "grad_norm": 147.67550188845584, + "learning_rate": 9.85592401802962e-07, + "logits/chosen": 1.0340697765350342, + "logits/rejected": 0.94403076171875, + "logps/chosen": -431.42498779296875, + "logps/rejected": -369.04998779296875, + "loss": 0.6375, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.927685558795929, + "rewards/margins": 0.47840577363967896, + "rewards/rejected": 0.44982606172561646, + "step": 180 + }, + { + "epoch": 0.06115715780156112, + "grad_norm": 151.82098604183074, + "learning_rate": 9.847875080489374e-07, + "logits/chosen": 0.9466552734375, + "logits/rejected": 0.8711913824081421, + "logps/chosen": -373.32501220703125, + "logps/rejected": -340.1625061035156, + "loss": 0.6536, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.46101683378219604, + "rewards/margins": 0.4480346739292145, + "rewards/rejected": 0.01320800743997097, + "step": 190 + }, + { + "epoch": 0.06437595558059064, + "grad_norm": 100.42247975036338, + "learning_rate": 9.83982614294913e-07, + "logits/chosen": 0.6869872808456421, + "logits/rejected": 0.642321765422821, + "logps/chosen": -438.4750061035156, + "logps/rejected": -398.63751220703125, + "loss": 0.5971, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05717773362994194, + "rewards/margins": 0.6067870855331421, + "rewards/rejected": -0.6641479730606079, + "step": 200 + }, + { + "epoch": 0.06759475335962019, + "grad_norm": 154.33225039466234, + "learning_rate": 9.831777205408886e-07, + "logits/chosen": 0.41350096464157104, + "logits/rejected": 0.454183965921402, + "logps/chosen": -400.51251220703125, + "logps/rejected": -378.9750061035156, + "loss": 0.5724, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4706664979457855, + "rewards/margins": 0.8174377679824829, + "rewards/rejected": -1.2868163585662842, + "step": 210 + }, + { + "epoch": 0.07081355113864972, + "grad_norm": 103.36184068733668, + "learning_rate": 9.82372826786864e-07, + "logits/chosen": 0.6059631109237671, + "logits/rejected": 0.6541503667831421, + "logps/chosen": -397.4624938964844, + "logps/rejected": -384.375, + "loss": 0.6145, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.08401794731616974, + "rewards/margins": 0.5648162961006165, + "rewards/rejected": -0.6494781374931335, + "step": 220 + }, + { + "epoch": 0.07403234891767925, + "grad_norm": 184.68077887211123, + "learning_rate": 9.815679330328396e-07, + "logits/chosen": 0.668505847454071, + "logits/rejected": 0.626812756061554, + "logps/chosen": -419.3999938964844, + "logps/rejected": -370.75, + "loss": 0.6579, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.17433777451515198, + "rewards/margins": 0.3880676329135895, + "rewards/rejected": -0.21386107802391052, + "step": 230 + }, + { + "epoch": 0.07725114669670878, + "grad_norm": 131.82081870116656, + "learning_rate": 9.807630392788152e-07, + "logits/chosen": 0.812976062297821, + "logits/rejected": 0.714111328125, + "logps/chosen": -408.20001220703125, + "logps/rejected": -354.67498779296875, + "loss": 0.6767, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.007267761044204235, + "rewards/margins": 0.4282073974609375, + "rewards/rejected": -0.4357193112373352, + "step": 240 + }, + { + "epoch": 0.08046994447573831, + "grad_norm": 144.1583937638351, + "learning_rate": 9.799581455247907e-07, + "logits/chosen": 0.6182586550712585, + "logits/rejected": 0.55316162109375, + "logps/chosen": -435.7250061035156, + "logps/rejected": -347.2875061035156, + "loss": 0.5349, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.14282837510108948, + "rewards/margins": 0.744189441204071, + "rewards/rejected": -0.8871002197265625, + "step": 250 + }, + { + "epoch": 0.08368874225476784, + "grad_norm": 113.15104481635116, + "learning_rate": 9.791532517707663e-07, + "logits/chosen": 0.748138427734375, + "logits/rejected": 0.814483642578125, + "logps/chosen": -403.42498779296875, + "logps/rejected": -371.5, + "loss": 0.6654, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.05366821214556694, + "rewards/margins": 0.532885730266571, + "rewards/rejected": -0.4789581298828125, + "step": 260 + }, + { + "epoch": 0.08690754003379737, + "grad_norm": 109.47613294905764, + "learning_rate": 9.783483580167417e-07, + "logits/chosen": 0.698840320110321, + "logits/rejected": 0.677935779094696, + "logps/chosen": -355.48748779296875, + "logps/rejected": -343.9624938964844, + "loss": 0.5577, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.27991944551467896, + "rewards/margins": 0.704296886920929, + "rewards/rejected": -0.42365723848342896, + "step": 270 + }, + { + "epoch": 0.0901263378128269, + "grad_norm": 150.54588270994566, + "learning_rate": 9.775434642627173e-07, + "logits/chosen": 0.637072741985321, + "logits/rejected": 0.67120361328125, + "logps/chosen": -379.82501220703125, + "logps/rejected": -363.9750061035156, + "loss": 0.5779, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.27180176973342896, + "rewards/margins": 0.5686401128768921, + "rewards/rejected": -0.2969711422920227, + "step": 280 + }, + { + "epoch": 0.09334513559185645, + "grad_norm": 137.32112305921768, + "learning_rate": 9.767385705086929e-07, + "logits/chosen": 0.6674270629882812, + "logits/rejected": 0.5724025964736938, + "logps/chosen": -328.9750061035156, + "logps/rejected": -324.82501220703125, + "loss": 0.6329, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1750282347202301, + "rewards/margins": 0.40728759765625, + "rewards/rejected": -0.23234863579273224, + "step": 290 + }, + { + "epoch": 0.09656393337088598, + "grad_norm": 163.8638186208734, + "learning_rate": 9.759336767546683e-07, + "logits/chosen": 0.5502685308456421, + "logits/rejected": 0.534008800983429, + "logps/chosen": -429.8500061035156, + "logps/rejected": -388.3500061035156, + "loss": 0.6514, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.17510680854320526, + "rewards/margins": 0.41749876737594604, + "rewards/rejected": -0.24266967177391052, + "step": 300 + }, + { + "epoch": 0.09978273114991551, + "grad_norm": 100.57487107671191, + "learning_rate": 9.751287830006439e-07, + "logits/chosen": 0.926892101764679, + "logits/rejected": 0.9070800542831421, + "logps/chosen": -406.2250061035156, + "logps/rejected": -354.63751220703125, + "loss": 0.6323, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8114989995956421, + "rewards/margins": 0.5528900027275085, + "rewards/rejected": 0.2575012147426605, + "step": 310 + }, + { + "epoch": 0.10300152892894504, + "grad_norm": 117.40097755770623, + "learning_rate": 9.743238892466194e-07, + "logits/chosen": 0.727459728717804, + "logits/rejected": 0.7076781988143921, + "logps/chosen": -390.7875061035156, + "logps/rejected": -352.2749938964844, + "loss": 0.5843, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.669995129108429, + "rewards/margins": 0.5040038824081421, + "rewards/rejected": 0.16578368842601776, + "step": 320 + }, + { + "epoch": 0.10622032670797457, + "grad_norm": 137.80228171828898, + "learning_rate": 9.73518995492595e-07, + "logits/chosen": 0.6731353998184204, + "logits/rejected": 0.703784167766571, + "logps/chosen": -375.76251220703125, + "logps/rejected": -315.66876220703125, + "loss": 0.6012, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.46072691679000854, + "rewards/margins": 0.5298706293106079, + "rewards/rejected": -0.06953735649585724, + "step": 330 + }, + { + "epoch": 0.1094391244870041, + "grad_norm": 133.19467305075054, + "learning_rate": 9.727141017385704e-07, + "logits/chosen": 0.7291015386581421, + "logits/rejected": 0.7159057855606079, + "logps/chosen": -397.8999938964844, + "logps/rejected": -347.07501220703125, + "loss": 0.5728, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.46173095703125, + "rewards/margins": 0.6433349847793579, + "rewards/rejected": -0.18232421576976776, + "step": 340 + }, + { + "epoch": 0.11265792226603363, + "grad_norm": 147.97547022267656, + "learning_rate": 9.71909207984546e-07, + "logits/chosen": 0.8326416015625, + "logits/rejected": 0.8359130620956421, + "logps/chosen": -381.13751220703125, + "logps/rejected": -359.2124938964844, + "loss": 0.6387, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5150817632675171, + "rewards/margins": 0.4294799864292145, + "rewards/rejected": 0.0859581008553505, + "step": 350 + }, + { + "epoch": 0.11587672004506316, + "grad_norm": 104.79611643544327, + "learning_rate": 9.711043142305216e-07, + "logits/chosen": 0.7639831304550171, + "logits/rejected": 0.822186291217804, + "logps/chosen": -398.70001220703125, + "logps/rejected": -355.6499938964844, + "loss": 0.5643, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2787231504917145, + "rewards/margins": 0.62237548828125, + "rewards/rejected": -0.3434204161167145, + "step": 360 + }, + { + "epoch": 0.1190955178240927, + "grad_norm": 149.20483115074111, + "learning_rate": 9.70299420476497e-07, + "logits/chosen": 0.6078857183456421, + "logits/rejected": 0.626708984375, + "logps/chosen": -398.92498779296875, + "logps/rejected": -385.1000061035156, + "loss": 0.6792, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07133789360523224, + "rewards/margins": 0.47996824979782104, + "rewards/rejected": -0.5513671636581421, + "step": 370 + }, + { + "epoch": 0.12231431560312224, + "grad_norm": 136.97956703291783, + "learning_rate": 9.694945267224726e-07, + "logits/chosen": 0.788281261920929, + "logits/rejected": 0.6793457269668579, + "logps/chosen": -425.2250061035156, + "logps/rejected": -383.6499938964844, + "loss": 0.5484, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.2722885012626648, + "rewards/margins": 0.737561047077179, + "rewards/rejected": -0.46464234590530396, + "step": 380 + }, + { + "epoch": 0.12553311338215176, + "grad_norm": 155.29057137252292, + "learning_rate": 9.686896329684481e-07, + "logits/chosen": 0.698229968547821, + "logits/rejected": 0.694323718547821, + "logps/chosen": -408.1499938964844, + "logps/rejected": -385.92498779296875, + "loss": 0.7058, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.07219085842370987, + "rewards/margins": 0.4127563536167145, + "rewards/rejected": -0.34100341796875, + "step": 390 + }, + { + "epoch": 0.1287519111611813, + "grad_norm": 99.43040834368473, + "learning_rate": 9.678847392144237e-07, + "logits/chosen": 0.7289794683456421, + "logits/rejected": 0.6753174066543579, + "logps/chosen": -411.73748779296875, + "logps/rejected": -372.8374938964844, + "loss": 0.533, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.15974120795726776, + "rewards/margins": 0.719818115234375, + "rewards/rejected": -0.5601135492324829, + "step": 400 + }, + { + "epoch": 0.13197070894021082, + "grad_norm": 99.23560369501722, + "learning_rate": 9.670798454603993e-07, + "logits/chosen": 0.7481323480606079, + "logits/rejected": 0.717272937297821, + "logps/chosen": -390.79998779296875, + "logps/rejected": -339.09375, + "loss": 0.5928, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02582397498190403, + "rewards/margins": 0.6076294183731079, + "rewards/rejected": -0.6336669921875, + "step": 410 + }, + { + "epoch": 0.13518950671924038, + "grad_norm": 109.65331666828253, + "learning_rate": 9.662749517063747e-07, + "logits/chosen": 0.616198718547821, + "logits/rejected": 0.65460205078125, + "logps/chosen": -437.75, + "logps/rejected": -380.45001220703125, + "loss": 0.6183, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.02402343787252903, + "rewards/margins": 0.608996570110321, + "rewards/rejected": -0.5844482183456421, + "step": 420 + }, + { + "epoch": 0.1384083044982699, + "grad_norm": 120.11337331781385, + "learning_rate": 9.654700579523503e-07, + "logits/chosen": 0.653155505657196, + "logits/rejected": 0.6378844976425171, + "logps/chosen": -393.82501220703125, + "logps/rejected": -375.0249938964844, + "loss": 0.6113, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.04633789137005806, + "rewards/margins": 0.6392791867256165, + "rewards/rejected": -0.593334972858429, + "step": 430 + }, + { + "epoch": 0.14162710227729944, + "grad_norm": 77.55917688483741, + "learning_rate": 9.646651641983257e-07, + "logits/chosen": 0.7222656011581421, + "logits/rejected": 0.6381591558456421, + "logps/chosen": -430.11248779296875, + "logps/rejected": -368.125, + "loss": 0.6043, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01818237267434597, + "rewards/margins": 0.7796630859375, + "rewards/rejected": -0.7612549066543579, + "step": 440 + }, + { + "epoch": 0.14484590005632897, + "grad_norm": 84.73371076442537, + "learning_rate": 9.638602704443013e-07, + "logits/chosen": 0.5665310025215149, + "logits/rejected": 0.5167907476425171, + "logps/chosen": -435.36248779296875, + "logps/rejected": -386.3999938964844, + "loss": 0.6083, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.21475830674171448, + "rewards/margins": 0.5988525152206421, + "rewards/rejected": -0.38393861055374146, + "step": 450 + }, + { + "epoch": 0.1480646978353585, + "grad_norm": 138.75299396308097, + "learning_rate": 9.630553766902769e-07, + "logits/chosen": 0.835644543170929, + "logits/rejected": 0.800915539264679, + "logps/chosen": -417.54998779296875, + "logps/rejected": -399.4375, + "loss": 0.7004, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.26080322265625, + "rewards/margins": 0.4821533262729645, + "rewards/rejected": -0.22187499701976776, + "step": 460 + }, + { + "epoch": 0.15128349561438803, + "grad_norm": 131.7744117145701, + "learning_rate": 9.622504829362522e-07, + "logits/chosen": 0.7927612066268921, + "logits/rejected": 0.754748523235321, + "logps/chosen": -376.20001220703125, + "logps/rejected": -352.82501220703125, + "loss": 0.6848, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.6136474609375, + "rewards/margins": 0.41588133573532104, + "rewards/rejected": 0.19776001572608948, + "step": 470 + }, + { + "epoch": 0.15450229339341756, + "grad_norm": 106.0572827876084, + "learning_rate": 9.61445589182228e-07, + "logits/chosen": 0.863494873046875, + "logits/rejected": 0.796539306640625, + "logps/chosen": -400.38751220703125, + "logps/rejected": -343.07501220703125, + "loss": 0.6139, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.9970703125, + "rewards/margins": 0.566027820110321, + "rewards/rejected": 0.43206787109375, + "step": 480 + }, + { + "epoch": 0.1577210911724471, + "grad_norm": 148.66814693973356, + "learning_rate": 9.606406954282034e-07, + "logits/chosen": 0.904327392578125, + "logits/rejected": 0.8372436761856079, + "logps/chosen": -426.125, + "logps/rejected": -367.3500061035156, + "loss": 0.5789, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.104223608970642, + "rewards/margins": 0.677478015422821, + "rewards/rejected": 0.42724609375, + "step": 490 + }, + { + "epoch": 0.16093988895147662, + "grad_norm": 145.20563563115905, + "learning_rate": 9.59835801674179e-07, + "logits/chosen": 0.7755676507949829, + "logits/rejected": 0.758496105670929, + "logps/chosen": -419.3999938964844, + "logps/rejected": -376.92498779296875, + "loss": 0.5216, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.8776489496231079, + "rewards/margins": 0.8276733160018921, + "rewards/rejected": 0.04996643215417862, + "step": 500 + }, + { + "epoch": 0.16415868673050615, + "grad_norm": 84.4418196244266, + "learning_rate": 9.590309079201546e-07, + "logits/chosen": 0.815673828125, + "logits/rejected": 0.8183959722518921, + "logps/chosen": -415.7250061035156, + "logps/rejected": -372.70001220703125, + "loss": 0.5671, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.7270538210868835, + "rewards/margins": 0.744799792766571, + "rewards/rejected": -0.01804199256002903, + "step": 510 + }, + { + "epoch": 0.16737748450953568, + "grad_norm": 122.13720929335848, + "learning_rate": 9.5822601416613e-07, + "logits/chosen": 0.7340453863143921, + "logits/rejected": 0.711901843547821, + "logps/chosen": -381.63751220703125, + "logps/rejected": -350.21875, + "loss": 0.6808, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.5008178949356079, + "rewards/margins": 0.46656495332717896, + "rewards/rejected": 0.03481445461511612, + "step": 520 + }, + { + "epoch": 0.17059628228856522, + "grad_norm": 105.12866911942882, + "learning_rate": 9.574211204121056e-07, + "logits/chosen": 0.885302722454071, + "logits/rejected": 0.846630871295929, + "logps/chosen": -407.9375, + "logps/rejected": -364.7124938964844, + "loss": 0.5994, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.683367908000946, + "rewards/margins": 0.5986572504043579, + "rewards/rejected": 0.08445434272289276, + "step": 530 + }, + { + "epoch": 0.17381508006759475, + "grad_norm": 98.51348402518948, + "learning_rate": 9.566162266580811e-07, + "logits/chosen": 0.766223132610321, + "logits/rejected": 0.875048816204071, + "logps/chosen": -391.67498779296875, + "logps/rejected": -353.7124938964844, + "loss": 0.5729, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.616162121295929, + "rewards/margins": 0.7314513921737671, + "rewards/rejected": -0.11544189602136612, + "step": 540 + }, + { + "epoch": 0.17703387784662428, + "grad_norm": 145.87327055351415, + "learning_rate": 9.558113329040565e-07, + "logits/chosen": 0.9171142578125, + "logits/rejected": 0.7576538324356079, + "logps/chosen": -458.1499938964844, + "logps/rejected": -376.7250061035156, + "loss": 0.563, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.75408935546875, + "rewards/margins": 0.8544555902481079, + "rewards/rejected": -0.09978637844324112, + "step": 550 + }, + { + "epoch": 0.1802526756256538, + "grad_norm": 120.78929825968345, + "learning_rate": 9.550064391500321e-07, + "logits/chosen": 0.9074966311454773, + "logits/rejected": 0.857312023639679, + "logps/chosen": -397.5375061035156, + "logps/rejected": -358.1499938964844, + "loss": 0.6382, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.620037853717804, + "rewards/margins": 0.62677001953125, + "rewards/rejected": -0.006970214657485485, + "step": 560 + }, + { + "epoch": 0.18347147340468334, + "grad_norm": 112.22803961854301, + "learning_rate": 9.542015453960077e-07, + "logits/chosen": 1.0128905773162842, + "logits/rejected": 0.9171142578125, + "logps/chosen": -423.8999938964844, + "logps/rejected": -353.67498779296875, + "loss": 0.5491, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0699462890625, + "rewards/margins": 0.983691394329071, + "rewards/rejected": 0.08544921875, + "step": 570 + }, + { + "epoch": 0.1866902711837129, + "grad_norm": 126.55298920088964, + "learning_rate": 9.533966516419833e-07, + "logits/chosen": 0.8557494878768921, + "logits/rejected": 0.8634277582168579, + "logps/chosen": -394.9375, + "logps/rejected": -379.5874938964844, + "loss": 0.6271, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.679058849811554, + "rewards/margins": 0.858386218547821, + "rewards/rejected": -0.17963866889476776, + "step": 580 + }, + { + "epoch": 0.18990906896274243, + "grad_norm": 154.08006088696342, + "learning_rate": 9.525917578879588e-07, + "logits/chosen": 0.819122314453125, + "logits/rejected": 0.82745361328125, + "logps/chosen": -393.6000061035156, + "logps/rejected": -342.20001220703125, + "loss": 0.5909, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.4309020936489105, + "rewards/margins": 0.74444580078125, + "rewards/rejected": -0.313955694437027, + "step": 590 + }, + { + "epoch": 0.19312786674177196, + "grad_norm": 100.99787124733443, + "learning_rate": 9.517868641339343e-07, + "logits/chosen": 0.8896484375, + "logits/rejected": 0.864501953125, + "logps/chosen": -361.8687438964844, + "logps/rejected": -343.57501220703125, + "loss": 0.6473, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.5120605230331421, + "rewards/margins": 0.5185546875, + "rewards/rejected": -0.006076049990952015, + "step": 600 + }, + { + "epoch": 0.1963466645208015, + "grad_norm": 122.9867704137838, + "learning_rate": 9.509819703799097e-07, + "logits/chosen": 0.77008056640625, + "logits/rejected": 0.80157470703125, + "logps/chosen": -440.5625, + "logps/rejected": -367.2250061035156, + "loss": 0.6164, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.560925304889679, + "rewards/margins": 0.6335083246231079, + "rewards/rejected": -0.07224731147289276, + "step": 610 + }, + { + "epoch": 0.19956546229983102, + "grad_norm": 94.01848871444173, + "learning_rate": 9.501770766258853e-07, + "logits/chosen": 0.832049548625946, + "logits/rejected": 0.745208740234375, + "logps/chosen": -403.04998779296875, + "logps/rejected": -365.0, + "loss": 0.6111, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.748077392578125, + "rewards/margins": 0.6544128656387329, + "rewards/rejected": 0.09367980808019638, + "step": 620 + }, + { + "epoch": 0.20278426007886055, + "grad_norm": 124.78115005828448, + "learning_rate": 9.493721828718608e-07, + "logits/chosen": 0.8262939453125, + "logits/rejected": 0.818219006061554, + "logps/chosen": -398.79998779296875, + "logps/rejected": -345.29998779296875, + "loss": 0.653, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.6160247921943665, + "rewards/margins": 0.497314453125, + "rewards/rejected": 0.1183319091796875, + "step": 630 + }, + { + "epoch": 0.20600305785789008, + "grad_norm": 96.37240337987899, + "learning_rate": 9.485672891178364e-07, + "logits/chosen": 0.8345702886581421, + "logits/rejected": 0.855419933795929, + "logps/chosen": -358.6499938964844, + "logps/rejected": -359.9750061035156, + "loss": 0.6463, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.6839538812637329, + "rewards/margins": 0.5669006109237671, + "rewards/rejected": 0.116668701171875, + "step": 640 + }, + { + "epoch": 0.2092218556369196, + "grad_norm": 98.44537517637919, + "learning_rate": 9.47762395363812e-07, + "logits/chosen": 0.836962878704071, + "logits/rejected": 0.783062756061554, + "logps/chosen": -383.6000061035156, + "logps/rejected": -361.29998779296875, + "loss": 0.6213, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7096801996231079, + "rewards/margins": 0.502575695514679, + "rewards/rejected": 0.20663146674633026, + "step": 650 + }, + { + "epoch": 0.21244065341594914, + "grad_norm": 114.98703004705199, + "learning_rate": 9.469575016097875e-07, + "logits/chosen": 0.8012329339981079, + "logits/rejected": 0.8797668218612671, + "logps/chosen": -368.0, + "logps/rejected": -359.07501220703125, + "loss": 0.6359, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.531848132610321, + "rewards/margins": 0.4864257872104645, + "rewards/rejected": 0.0457611083984375, + "step": 660 + }, + { + "epoch": 0.21565945119497867, + "grad_norm": 137.08761537413147, + "learning_rate": 9.46152607855763e-07, + "logits/chosen": 0.663769543170929, + "logits/rejected": 0.7083374261856079, + "logps/chosen": -422.9375, + "logps/rejected": -424.25, + "loss": 0.5552, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.284393310546875, + "rewards/margins": 0.793353259563446, + "rewards/rejected": -0.5090087652206421, + "step": 670 + }, + { + "epoch": 0.2188782489740082, + "grad_norm": 113.66250144632087, + "learning_rate": 9.453477141017386e-07, + "logits/chosen": 0.7714477777481079, + "logits/rejected": 0.708935558795929, + "logps/chosen": -433.625, + "logps/rejected": -371.79998779296875, + "loss": 0.5727, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6012817621231079, + "rewards/margins": 0.8084472417831421, + "rewards/rejected": -0.20717772841453552, + "step": 680 + }, + { + "epoch": 0.22209704675303774, + "grad_norm": 116.57182136863611, + "learning_rate": 9.44542820347714e-07, + "logits/chosen": 0.6649169921875, + "logits/rejected": 0.625561535358429, + "logps/chosen": -423.17498779296875, + "logps/rejected": -351.98748779296875, + "loss": 0.6299, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.534350574016571, + "rewards/margins": 0.6606200933456421, + "rewards/rejected": -0.12540283799171448, + "step": 690 + }, + { + "epoch": 0.22531584453206727, + "grad_norm": 129.01839983933817, + "learning_rate": 9.437379265936895e-07, + "logits/chosen": 0.807666003704071, + "logits/rejected": 0.7372375726699829, + "logps/chosen": -453.54998779296875, + "logps/rejected": -399.57501220703125, + "loss": 0.6069, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.462738037109375, + "rewards/margins": 0.8102782964706421, + "rewards/rejected": -0.3473571836948395, + "step": 700 + }, + { + "epoch": 0.2285346423110968, + "grad_norm": 116.82192010589746, + "learning_rate": 9.429330328396651e-07, + "logits/chosen": 0.779797375202179, + "logits/rejected": 0.83154296875, + "logps/chosen": -411.25, + "logps/rejected": -352.0249938964844, + "loss": 0.5553, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 1.0826904773712158, + "rewards/margins": 1.14794921875, + "rewards/rejected": -0.06561279296875, + "step": 710 + }, + { + "epoch": 0.23175344009012633, + "grad_norm": 156.36892296085014, + "learning_rate": 9.421281390856407e-07, + "logits/chosen": 0.960131824016571, + "logits/rejected": 0.839947521686554, + "logps/chosen": -406.82501220703125, + "logps/rejected": -369.375, + "loss": 0.5789, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 1.11767578125, + "rewards/margins": 0.890002429485321, + "rewards/rejected": 0.22728271782398224, + "step": 720 + }, + { + "epoch": 0.23497223786915586, + "grad_norm": 146.08822123653349, + "learning_rate": 9.413232453316162e-07, + "logits/chosen": 0.7794189453125, + "logits/rejected": 0.758349597454071, + "logps/chosen": -358.3999938964844, + "logps/rejected": -358.29998779296875, + "loss": 0.6694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5718139410018921, + "rewards/margins": 0.661389172077179, + "rewards/rejected": -0.08905029296875, + "step": 730 + }, + { + "epoch": 0.2381910356481854, + "grad_norm": 103.58240837359718, + "learning_rate": 9.405183515775918e-07, + "logits/chosen": 0.844042956829071, + "logits/rejected": 0.7357422113418579, + "logps/chosen": -436.5874938964844, + "logps/rejected": -395.6000061035156, + "loss": 0.6659, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5718994140625, + "rewards/margins": 0.6162109375, + "rewards/rejected": -0.04427490383386612, + "step": 740 + }, + { + "epoch": 0.24140983342721495, + "grad_norm": 94.55613023978667, + "learning_rate": 9.397134578235673e-07, + "logits/chosen": 0.8448241949081421, + "logits/rejected": 0.872088611125946, + "logps/chosen": -417.54998779296875, + "logps/rejected": -379.8999938964844, + "loss": 0.5354, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.7568908929824829, + "rewards/margins": 0.880419909954071, + "rewards/rejected": -0.12406615912914276, + "step": 750 + }, + { + "epoch": 0.24462863120624448, + "grad_norm": 108.10417504692975, + "learning_rate": 9.389085640695427e-07, + "logits/chosen": 0.910327136516571, + "logits/rejected": 0.918701171875, + "logps/chosen": -422.29998779296875, + "logps/rejected": -358.8500061035156, + "loss": 0.4736, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0547668933868408, + "rewards/margins": 1.156005859375, + "rewards/rejected": -0.10032959282398224, + "step": 760 + }, + { + "epoch": 0.247847428985274, + "grad_norm": 176.5664717001598, + "learning_rate": 9.381036703155183e-07, + "logits/chosen": 0.9638000726699829, + "logits/rejected": 0.989208996295929, + "logps/chosen": -392.8374938964844, + "logps/rejected": -329.4750061035156, + "loss": 0.5945, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.4412841796875, + "rewards/margins": 0.822460949420929, + "rewards/rejected": -0.38138121366500854, + "step": 770 + }, + { + "epoch": 0.2510662267643035, + "grad_norm": 101.0690370928322, + "learning_rate": 9.372987765614938e-07, + "logits/chosen": 0.7282348871231079, + "logits/rejected": 0.7823120355606079, + "logps/chosen": -435.4125061035156, + "logps/rejected": -406.0249938964844, + "loss": 0.5575, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19276733696460724, + "rewards/margins": 0.959851086139679, + "rewards/rejected": -0.7668212652206421, + "step": 780 + }, + { + "epoch": 0.25428502454333307, + "grad_norm": 114.58157401163727, + "learning_rate": 9.364938828074693e-07, + "logits/chosen": 0.7818084955215454, + "logits/rejected": 0.861083984375, + "logps/chosen": -419.57501220703125, + "logps/rejected": -377.95001220703125, + "loss": 0.5977, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.184173583984375, + "rewards/margins": 0.70361328125, + "rewards/rejected": -0.519726574420929, + "step": 790 + }, + { + "epoch": 0.2575038223223626, + "grad_norm": 125.32416816640854, + "learning_rate": 9.35688989053445e-07, + "logits/chosen": 0.7943481206893921, + "logits/rejected": 0.7965453863143921, + "logps/chosen": -398.25, + "logps/rejected": -375.82501220703125, + "loss": 0.6256, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.49952393770217896, + "rewards/margins": 0.7149718999862671, + "rewards/rejected": -0.21594543755054474, + "step": 800 + }, + { + "epoch": 0.26072262010139213, + "grad_norm": 153.33685610614802, + "learning_rate": 9.348840952994205e-07, + "logits/chosen": 0.8468017578125, + "logits/rejected": 0.972363293170929, + "logps/chosen": -374.7875061035156, + "logps/rejected": -356.5249938964844, + "loss": 0.6511, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6676269769668579, + "rewards/margins": 0.6561279296875, + "rewards/rejected": 0.01080932654440403, + "step": 810 + }, + { + "epoch": 0.26394141788042164, + "grad_norm": 75.82123206574856, + "learning_rate": 9.34079201545396e-07, + "logits/chosen": 1.0728027820587158, + "logits/rejected": 1.0114257335662842, + "logps/chosen": -366.25, + "logps/rejected": -315.5249938964844, + "loss": 0.5695, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.886029064655304, + "rewards/margins": 0.841967761516571, + "rewards/rejected": 0.0443267822265625, + "step": 820 + }, + { + "epoch": 0.2671602156594512, + "grad_norm": 135.9482162869519, + "learning_rate": 9.332743077913716e-07, + "logits/chosen": 1.0128173828125, + "logits/rejected": 0.907116711139679, + "logps/chosen": -390.54998779296875, + "logps/rejected": -366.92498779296875, + "loss": 0.5862, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.7749572992324829, + "rewards/margins": 0.7844604253768921, + "rewards/rejected": -0.010311889462172985, + "step": 830 + }, + { + "epoch": 0.27037901343848075, + "grad_norm": 137.82838189843346, + "learning_rate": 9.32469414037347e-07, + "logits/chosen": 0.9974609613418579, + "logits/rejected": 0.965039074420929, + "logps/chosen": -382.1625061035156, + "logps/rejected": -355.0375061035156, + "loss": 0.6108, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.624194324016571, + "rewards/margins": 0.700793445110321, + "rewards/rejected": -0.07603760063648224, + "step": 840 + }, + { + "epoch": 0.27359781121751026, + "grad_norm": 128.04493846743733, + "learning_rate": 9.316645202833225e-07, + "logits/chosen": 0.990478515625, + "logits/rejected": 0.951141357421875, + "logps/chosen": -438.63751220703125, + "logps/rejected": -363.29998779296875, + "loss": 0.628, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.815399169921875, + "rewards/margins": 0.5738586187362671, + "rewards/rejected": 0.2418212890625, + "step": 850 + }, + { + "epoch": 0.2768166089965398, + "grad_norm": 164.5639855193249, + "learning_rate": 9.308596265292981e-07, + "logits/chosen": 0.9894043207168579, + "logits/rejected": 0.8409408330917358, + "logps/chosen": -448.95001220703125, + "logps/rejected": -348.8374938964844, + "loss": 0.6132, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.9194396734237671, + "rewards/margins": 0.7328246831893921, + "rewards/rejected": 0.18695068359375, + "step": 860 + }, + { + "epoch": 0.2800354067755693, + "grad_norm": 152.45366304408336, + "learning_rate": 9.300547327752736e-07, + "logits/chosen": 1.02117919921875, + "logits/rejected": 0.991650402545929, + "logps/chosen": -372.625, + "logps/rejected": -380.32501220703125, + "loss": 0.6081, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.9017333984375, + "rewards/margins": 0.7234252691268921, + "rewards/rejected": 0.17891845107078552, + "step": 870 + }, + { + "epoch": 0.2832542045545989, + "grad_norm": 120.28934096371297, + "learning_rate": 9.292498390212492e-07, + "logits/chosen": 0.982226550579071, + "logits/rejected": 0.9811767339706421, + "logps/chosen": -337.3500061035156, + "logps/rejected": -326.125, + "loss": 0.528, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.49395751953125, + "rewards/margins": 0.8324950933456421, + "rewards/rejected": -0.3384338319301605, + "step": 880 + }, + { + "epoch": 0.2864730023336284, + "grad_norm": 165.9463967114343, + "learning_rate": 9.284449452672248e-07, + "logits/chosen": 0.9764159917831421, + "logits/rejected": 1.0019042491912842, + "logps/chosen": -381.7250061035156, + "logps/rejected": -339.45001220703125, + "loss": 0.618, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.639300525188446, + "rewards/margins": 0.681933581829071, + "rewards/rejected": -0.04223022609949112, + "step": 890 + }, + { + "epoch": 0.28969180011265794, + "grad_norm": 124.24031942644083, + "learning_rate": 9.276400515132003e-07, + "logits/chosen": 0.8619323968887329, + "logits/rejected": 0.88299560546875, + "logps/chosen": -364.38751220703125, + "logps/rejected": -386.625, + "loss": 0.6453, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.696856677532196, + "rewards/margins": 0.5087035894393921, + "rewards/rejected": 0.18862609565258026, + "step": 900 + }, + { + "epoch": 0.29291059789168744, + "grad_norm": 120.94307427180841, + "learning_rate": 9.268351577591757e-07, + "logits/chosen": 0.9283355474472046, + "logits/rejected": 0.871997058391571, + "logps/chosen": -446.70001220703125, + "logps/rejected": -387.625, + "loss": 0.7059, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.3326416015625, + "rewards/margins": 0.56170654296875, + "rewards/rejected": -0.2288818359375, + "step": 910 + }, + { + "epoch": 0.296129395670717, + "grad_norm": 92.61768155172672, + "learning_rate": 9.260302640051512e-07, + "logits/chosen": 0.798657238483429, + "logits/rejected": 0.8197265863418579, + "logps/chosen": -415.3500061035156, + "logps/rejected": -395.29998779296875, + "loss": 0.5666, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.12929686903953552, + "rewards/margins": 0.85443115234375, + "rewards/rejected": -0.7252715826034546, + "step": 920 + }, + { + "epoch": 0.2993481934497465, + "grad_norm": 166.857610446878, + "learning_rate": 9.252253702511268e-07, + "logits/chosen": 0.710357666015625, + "logits/rejected": 0.7950195074081421, + "logps/chosen": -384.42498779296875, + "logps/rejected": -377.3500061035156, + "loss": 0.5459, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.26148682832717896, + "rewards/margins": 0.965435802936554, + "rewards/rejected": -0.704541027545929, + "step": 930 + }, + { + "epoch": 0.30256699122877606, + "grad_norm": 115.94128926095229, + "learning_rate": 9.244204764971023e-07, + "logits/chosen": 0.7609618902206421, + "logits/rejected": 0.738171398639679, + "logps/chosen": -421.1499938964844, + "logps/rejected": -376.7124938964844, + "loss": 0.6023, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.07562866061925888, + "rewards/margins": 0.8101562261581421, + "rewards/rejected": -0.7355102300643921, + "step": 940 + }, + { + "epoch": 0.30578578900780556, + "grad_norm": 98.8086579693207, + "learning_rate": 9.236155827430778e-07, + "logits/chosen": 0.917102038860321, + "logits/rejected": 0.907958984375, + "logps/chosen": -370.375, + "logps/rejected": -334.2250061035156, + "loss": 0.599, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.41017454862594604, + "rewards/margins": 0.810986340045929, + "rewards/rejected": -0.4005676209926605, + "step": 950 + }, + { + "epoch": 0.3090045867868351, + "grad_norm": 131.85498305356992, + "learning_rate": 9.228106889890535e-07, + "logits/chosen": 0.8178955316543579, + "logits/rejected": 0.872265636920929, + "logps/chosen": -398.29998779296875, + "logps/rejected": -370.82501220703125, + "loss": 0.6138, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.662365734577179, + "rewards/margins": 0.8785156011581421, + "rewards/rejected": -0.21645507216453552, + "step": 960 + }, + { + "epoch": 0.3122233845658646, + "grad_norm": 145.01569136204378, + "learning_rate": 9.22005795235029e-07, + "logits/chosen": 0.9415527582168579, + "logits/rejected": 0.899249255657196, + "logps/chosen": -422.75, + "logps/rejected": -369.48748779296875, + "loss": 0.6018, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6969238519668579, + "rewards/margins": 0.7401123046875, + "rewards/rejected": -0.04192199558019638, + "step": 970 + }, + { + "epoch": 0.3154421823448942, + "grad_norm": 166.1042792414827, + "learning_rate": 9.212009014810044e-07, + "logits/chosen": 0.964306652545929, + "logits/rejected": 0.923718273639679, + "logps/chosen": -431.1875, + "logps/rejected": -371.25, + "loss": 0.6124, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5375610589981079, + "rewards/margins": 0.63946533203125, + "rewards/rejected": -0.10178832709789276, + "step": 980 + }, + { + "epoch": 0.3186609801239237, + "grad_norm": 129.50819143480481, + "learning_rate": 9.2039600772698e-07, + "logits/chosen": 0.9199676513671875, + "logits/rejected": 0.922656238079071, + "logps/chosen": -372.7749938964844, + "logps/rejected": -364.28125, + "loss": 0.6136, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8354705572128296, + "rewards/margins": 0.678088366985321, + "rewards/rejected": 0.1572265625, + "step": 990 + }, + { + "epoch": 0.32187977790295325, + "grad_norm": 126.38722088972222, + "learning_rate": 9.195911139729555e-07, + "logits/chosen": 0.957354724407196, + "logits/rejected": 0.951080322265625, + "logps/chosen": -414.6625061035156, + "logps/rejected": -366.04998779296875, + "loss": 0.6859, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 1.0168578624725342, + "rewards/margins": 0.46246337890625, + "rewards/rejected": 0.5536438226699829, + "step": 1000 + }, + { + "epoch": 0.3250985756819828, + "grad_norm": 107.53706034074659, + "learning_rate": 9.18786220218931e-07, + "logits/chosen": 1.224609375, + "logits/rejected": 1.1624023914337158, + "logps/chosen": -363.6000061035156, + "logps/rejected": -315.20001220703125, + "loss": 0.6257, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 1.332543969154358, + "rewards/margins": 0.6716064214706421, + "rewards/rejected": 0.6612793207168579, + "step": 1010 + }, + { + "epoch": 0.3283173734610123, + "grad_norm": 138.3302383549993, + "learning_rate": 9.179813264649066e-07, + "logits/chosen": 0.9932616949081421, + "logits/rejected": 1.0188782215118408, + "logps/chosen": -366.7250061035156, + "logps/rejected": -327.7749938964844, + "loss": 0.5265, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 1.23455810546875, + "rewards/margins": 0.875927746295929, + "rewards/rejected": 0.358468621969223, + "step": 1020 + }, + { + "epoch": 0.33153617124004187, + "grad_norm": 90.92135615400406, + "learning_rate": 9.171764327108821e-07, + "logits/chosen": 0.87164306640625, + "logits/rejected": 0.85986328125, + "logps/chosen": -424.67498779296875, + "logps/rejected": -356.8500061035156, + "loss": 0.6245, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7192138433456421, + "rewards/margins": 0.6904296875, + "rewards/rejected": 0.02871093712747097, + "step": 1030 + }, + { + "epoch": 0.33475496901907137, + "grad_norm": 191.72831763035254, + "learning_rate": 9.163715389568577e-07, + "logits/chosen": 0.8248291015625, + "logits/rejected": 0.8111816644668579, + "logps/chosen": -425.2250061035156, + "logps/rejected": -372.11248779296875, + "loss": 0.5641, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.764233410358429, + "rewards/margins": 0.820269763469696, + "rewards/rejected": -0.05629882961511612, + "step": 1040 + }, + { + "epoch": 0.3379737667981009, + "grad_norm": 84.38002510070466, + "learning_rate": 9.155666452028333e-07, + "logits/chosen": 0.8145996332168579, + "logits/rejected": 0.805407702922821, + "logps/chosen": -419.33123779296875, + "logps/rejected": -364.2250061035156, + "loss": 0.4965, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.843554675579071, + "rewards/margins": 1.022485375404358, + "rewards/rejected": -0.17857666313648224, + "step": 1050 + }, + { + "epoch": 0.34119256457713043, + "grad_norm": 158.42272916795147, + "learning_rate": 9.147617514488087e-07, + "logits/chosen": 0.7636474370956421, + "logits/rejected": 0.784313976764679, + "logps/chosen": -383.6875, + "logps/rejected": -373.95001220703125, + "loss": 0.6314, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.5824950933456421, + "rewards/margins": 0.670269787311554, + "rewards/rejected": -0.0880889892578125, + "step": 1060 + }, + { + "epoch": 0.34441136235616, + "grad_norm": 95.26961820207424, + "learning_rate": 9.139568576947842e-07, + "logits/chosen": 0.909130871295929, + "logits/rejected": 0.863476574420929, + "logps/chosen": -377.48748779296875, + "logps/rejected": -333.0625, + "loss": 0.5618, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.46949464082717896, + "rewards/margins": 0.9392639398574829, + "rewards/rejected": -0.469451904296875, + "step": 1070 + }, + { + "epoch": 0.3476301601351895, + "grad_norm": 170.2524431838552, + "learning_rate": 9.131519639407598e-07, + "logits/chosen": 0.8279144167900085, + "logits/rejected": 0.7854889035224915, + "logps/chosen": -420.17498779296875, + "logps/rejected": -386.09375, + "loss": 0.6191, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7132323980331421, + "rewards/margins": 0.8043457269668579, + "rewards/rejected": -0.09125366061925888, + "step": 1080 + }, + { + "epoch": 0.35084895791421905, + "grad_norm": 76.08911958489035, + "learning_rate": 9.123470701867353e-07, + "logits/chosen": 0.8701629638671875, + "logits/rejected": 0.831347644329071, + "logps/chosen": -408.04998779296875, + "logps/rejected": -351.4750061035156, + "loss": 0.57, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.7591552734375, + "rewards/margins": 0.9250243902206421, + "rewards/rejected": -0.16675719618797302, + "step": 1090 + }, + { + "epoch": 0.35406775569324855, + "grad_norm": 115.3308720535636, + "learning_rate": 9.115421764327108e-07, + "logits/chosen": 0.884350597858429, + "logits/rejected": 0.895434558391571, + "logps/chosen": -402.125, + "logps/rejected": -356.5, + "loss": 0.5844, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 1.012304663658142, + "rewards/margins": 1.0185120105743408, + "rewards/rejected": -0.00634765625, + "step": 1100 + }, + { + "epoch": 0.3572865534722781, + "grad_norm": 128.04519013240542, + "learning_rate": 9.107372826786864e-07, + "logits/chosen": 0.8774505853652954, + "logits/rejected": 0.857128918170929, + "logps/chosen": -410.95001220703125, + "logps/rejected": -397.79998779296875, + "loss": 0.6769, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4829955995082855, + "rewards/margins": 0.7186523675918579, + "rewards/rejected": -0.23542480170726776, + "step": 1110 + }, + { + "epoch": 0.3605053512513076, + "grad_norm": 82.17206153533643, + "learning_rate": 9.09932388924662e-07, + "logits/chosen": 0.690136730670929, + "logits/rejected": 0.658978283405304, + "logps/chosen": -396.10626220703125, + "logps/rejected": -383.48748779296875, + "loss": 0.5675, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5024963617324829, + "rewards/margins": 0.8003906011581421, + "rewards/rejected": -0.297393798828125, + "step": 1120 + }, + { + "epoch": 0.3637241490303372, + "grad_norm": 97.6117393870686, + "learning_rate": 9.091274951706374e-07, + "logits/chosen": 0.7114013433456421, + "logits/rejected": 0.7171783447265625, + "logps/chosen": -402.42498779296875, + "logps/rejected": -361.23748779296875, + "loss": 0.5941, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.5426880121231079, + "rewards/margins": 0.784594714641571, + "rewards/rejected": -0.24268798530101776, + "step": 1130 + }, + { + "epoch": 0.3669429468093667, + "grad_norm": 136.11788761734726, + "learning_rate": 9.08322601416613e-07, + "logits/chosen": 0.7302795648574829, + "logits/rejected": 0.732452392578125, + "logps/chosen": -418.8500061035156, + "logps/rejected": -370.5, + "loss": 0.4994, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.765856146812439, + "rewards/margins": 1.056848168373108, + "rewards/rejected": -0.29130858182907104, + "step": 1140 + }, + { + "epoch": 0.37016174458839624, + "grad_norm": 90.9482934670661, + "learning_rate": 9.075177076625885e-07, + "logits/chosen": 0.6654327511787415, + "logits/rejected": 0.771960437297821, + "logps/chosen": -347.1625061035156, + "logps/rejected": -355.17498779296875, + "loss": 0.7108, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5708892941474915, + "rewards/margins": 0.6116698980331421, + "rewards/rejected": -0.04078369215130806, + "step": 1150 + }, + { + "epoch": 0.3733805423674258, + "grad_norm": 113.76033751969729, + "learning_rate": 9.06712813908564e-07, + "logits/chosen": 0.8090575933456421, + "logits/rejected": 0.831225574016571, + "logps/chosen": -407.67498779296875, + "logps/rejected": -352.95001220703125, + "loss": 0.6051, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.5958007574081421, + "rewards/margins": 0.8100951910018921, + "rewards/rejected": -0.21381835639476776, + "step": 1160 + }, + { + "epoch": 0.3765993401464553, + "grad_norm": 151.44760376739922, + "learning_rate": 9.059079201545396e-07, + "logits/chosen": 0.7809814214706421, + "logits/rejected": 0.804553210735321, + "logps/chosen": -439.8500061035156, + "logps/rejected": -374.32501220703125, + "loss": 0.5847, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.530133068561554, + "rewards/margins": 0.9120239019393921, + "rewards/rejected": -0.3819824159145355, + "step": 1170 + }, + { + "epoch": 0.37981813792548486, + "grad_norm": 120.19177468893038, + "learning_rate": 9.051030264005151e-07, + "logits/chosen": 0.74072265625, + "logits/rejected": 0.793072521686554, + "logps/chosen": -369.82501220703125, + "logps/rejected": -342.5375061035156, + "loss": 0.4922, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8114975094795227, + "rewards/margins": 1.0849182605743408, + "rewards/rejected": -0.2739196717739105, + "step": 1180 + }, + { + "epoch": 0.38303693570451436, + "grad_norm": 65.84956237497434, + "learning_rate": 9.042981326464906e-07, + "logits/chosen": 0.7457336187362671, + "logits/rejected": 0.78619384765625, + "logps/chosen": -330.51251220703125, + "logps/rejected": -333.2875061035156, + "loss": 0.6143, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.762341320514679, + "rewards/margins": 0.7781738042831421, + "rewards/rejected": -0.01674346998333931, + "step": 1190 + }, + { + "epoch": 0.3862557334835439, + "grad_norm": 131.44501215656146, + "learning_rate": 9.034932388924663e-07, + "logits/chosen": 0.6068115234375, + "logits/rejected": 0.5734618902206421, + "logps/chosen": -424.0249938964844, + "logps/rejected": -391.1000061035156, + "loss": 0.696, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.5629379153251648, + "rewards/margins": 0.7165893316268921, + "rewards/rejected": -0.15408936142921448, + "step": 1200 + }, + { + "epoch": 0.3894745312625734, + "grad_norm": 120.31847674667104, + "learning_rate": 9.026883451384417e-07, + "logits/chosen": 0.695361316204071, + "logits/rejected": 0.627636730670929, + "logps/chosen": -415.75, + "logps/rejected": -367.3500061035156, + "loss": 0.6031, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.42229002714157104, + "rewards/margins": 0.9678100347518921, + "rewards/rejected": -0.5452636480331421, + "step": 1210 + }, + { + "epoch": 0.392693329041603, + "grad_norm": 168.54272713277982, + "learning_rate": 9.018834513844172e-07, + "logits/chosen": 0.6578918695449829, + "logits/rejected": 0.7466369867324829, + "logps/chosen": -389.5625, + "logps/rejected": -373.1499938964844, + "loss": 0.5669, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7447448968887329, + "rewards/margins": 0.8658447265625, + "rewards/rejected": -0.12087402492761612, + "step": 1220 + }, + { + "epoch": 0.3959121268206325, + "grad_norm": 196.45764205602313, + "learning_rate": 9.010785576303927e-07, + "logits/chosen": 0.724169909954071, + "logits/rejected": 0.758056640625, + "logps/chosen": -380.98748779296875, + "logps/rejected": -364.4375, + "loss": 0.6597, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.8060363531112671, + "rewards/margins": 0.6910613775253296, + "rewards/rejected": 0.11439819633960724, + "step": 1230 + }, + { + "epoch": 0.39913092459966204, + "grad_norm": 115.6411955612444, + "learning_rate": 9.002736638763683e-07, + "logits/chosen": 0.6134033203125, + "logits/rejected": 0.6197662353515625, + "logps/chosen": -355.5, + "logps/rejected": -341.3500061035156, + "loss": 0.5692, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.555615246295929, + "rewards/margins": 0.7964233160018921, + "rewards/rejected": -0.24102783203125, + "step": 1240 + }, + { + "epoch": 0.40234972237869154, + "grad_norm": 148.580277337901, + "learning_rate": 8.994687701223438e-07, + "logits/chosen": 0.5727905035018921, + "logits/rejected": 0.592578113079071, + "logps/chosen": -413.2250061035156, + "logps/rejected": -362.625, + "loss": 0.6509, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.219268798828125, + "rewards/margins": 0.7148193120956421, + "rewards/rejected": -0.4957031309604645, + "step": 1250 + }, + { + "epoch": 0.4055685201577211, + "grad_norm": 89.25772213881434, + "learning_rate": 8.986638763683193e-07, + "logits/chosen": 0.7170044183731079, + "logits/rejected": 0.7281128168106079, + "logps/chosen": -439.625, + "logps/rejected": -398.07501220703125, + "loss": 0.5274, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.977587878704071, + "rewards/margins": 0.9934936761856079, + "rewards/rejected": -0.01645507849752903, + "step": 1260 + }, + { + "epoch": 0.4087873179367506, + "grad_norm": 152.11932491494767, + "learning_rate": 8.978589826142948e-07, + "logits/chosen": 0.7604430913925171, + "logits/rejected": 0.7707794308662415, + "logps/chosen": -435.8374938964844, + "logps/rejected": -414.36248779296875, + "loss": 0.5976, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.869537353515625, + "rewards/margins": 0.8920043706893921, + "rewards/rejected": -0.0223388671875, + "step": 1270 + }, + { + "epoch": 0.41200611571578016, + "grad_norm": 100.28967995687248, + "learning_rate": 8.970540888602704e-07, + "logits/chosen": 0.6881469488143921, + "logits/rejected": 0.678027331829071, + "logps/chosen": -344.6499938964844, + "logps/rejected": -329.1000061035156, + "loss": 0.5377, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.8848846554756165, + "rewards/margins": 0.8819335699081421, + "rewards/rejected": 0.0033386230934411287, + "step": 1280 + }, + { + "epoch": 0.41522491349480967, + "grad_norm": 100.432537444906, + "learning_rate": 8.962491951062459e-07, + "logits/chosen": 0.706616222858429, + "logits/rejected": 0.7476440668106079, + "logps/chosen": -372.20001220703125, + "logps/rejected": -325.11248779296875, + "loss": 0.7144, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.8992675542831421, + "rewards/margins": 0.665270984172821, + "rewards/rejected": 0.23406982421875, + "step": 1290 + }, + { + "epoch": 0.4184437112738392, + "grad_norm": 122.21604689701157, + "learning_rate": 8.954443013522215e-07, + "logits/chosen": 0.848071277141571, + "logits/rejected": 0.8653365969657898, + "logps/chosen": -368.7749938964844, + "logps/rejected": -351.42498779296875, + "loss": 0.6352, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9377502202987671, + "rewards/margins": 0.7421020269393921, + "rewards/rejected": 0.19526366889476776, + "step": 1300 + }, + { + "epoch": 0.42166250905286873, + "grad_norm": 105.2527241613605, + "learning_rate": 8.94639407598197e-07, + "logits/chosen": 0.9032958745956421, + "logits/rejected": 0.896344006061554, + "logps/chosen": -399.6000061035156, + "logps/rejected": -375.625, + "loss": 0.5666, + "rewards/accuracies": 0.65625, + "rewards/chosen": 1.016119360923767, + "rewards/margins": 0.8512939214706421, + "rewards/rejected": 0.16531066596508026, + "step": 1310 + }, + { + "epoch": 0.4248813068318983, + "grad_norm": 144.27365670538185, + "learning_rate": 8.938345138441725e-07, + "logits/chosen": 0.7814544439315796, + "logits/rejected": 0.825390636920929, + "logps/chosen": -370.4750061035156, + "logps/rejected": -357.82501220703125, + "loss": 0.6555, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.832141101360321, + "rewards/margins": 0.57562255859375, + "rewards/rejected": 0.2562499940395355, + "step": 1320 + }, + { + "epoch": 0.42810010461092785, + "grad_norm": 105.08969302837095, + "learning_rate": 8.930296200901481e-07, + "logits/chosen": 0.77496337890625, + "logits/rejected": 0.7029052972793579, + "logps/chosen": -401.82501220703125, + "logps/rejected": -362.875, + "loss": 0.6142, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6996917724609375, + "rewards/margins": 0.781542956829071, + "rewards/rejected": -0.08185424655675888, + "step": 1330 + }, + { + "epoch": 0.43131890238995735, + "grad_norm": 128.0888735891468, + "learning_rate": 8.922247263361236e-07, + "logits/chosen": 0.762890636920929, + "logits/rejected": 0.6710205078125, + "logps/chosen": -440.2124938964844, + "logps/rejected": -392.04998779296875, + "loss": 0.5266, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.602795422077179, + "rewards/margins": 1.137597680091858, + "rewards/rejected": -0.534515380859375, + "step": 1340 + }, + { + "epoch": 0.4345377001689869, + "grad_norm": 134.40983834935247, + "learning_rate": 8.91419832582099e-07, + "logits/chosen": 0.665020763874054, + "logits/rejected": 0.5494323968887329, + "logps/chosen": -437.2749938964844, + "logps/rejected": -410.4750061035156, + "loss": 0.6497, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.6932128667831421, + "rewards/margins": 0.8495117425918579, + "rewards/rejected": -0.15596923232078552, + "step": 1350 + }, + { + "epoch": 0.4377564979480164, + "grad_norm": 118.6664442204952, + "learning_rate": 8.906149388280747e-07, + "logits/chosen": 0.7923583984375, + "logits/rejected": 0.7518066167831421, + "logps/chosen": -375.51251220703125, + "logps/rejected": -354.67498779296875, + "loss": 0.5922, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.776690661907196, + "rewards/margins": 0.8675903081893921, + "rewards/rejected": -0.0911865234375, + "step": 1360 + }, + { + "epoch": 0.44097529572704597, + "grad_norm": 88.9862898648953, + "learning_rate": 8.898100450740502e-07, + "logits/chosen": 0.719805896282196, + "logits/rejected": 0.5922607183456421, + "logps/chosen": -418.75, + "logps/rejected": -380.7749938964844, + "loss": 0.6298, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.6201232671737671, + "rewards/margins": 0.84161376953125, + "rewards/rejected": -0.22088012099266052, + "step": 1370 + }, + { + "epoch": 0.44419409350607547, + "grad_norm": 93.2541208336622, + "learning_rate": 8.890051513200257e-07, + "logits/chosen": 0.74285888671875, + "logits/rejected": 0.709027111530304, + "logps/chosen": -384.17498779296875, + "logps/rejected": -377.8500061035156, + "loss": 0.6035, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.7671310305595398, + "rewards/margins": 0.871325671672821, + "rewards/rejected": -0.10482177883386612, + "step": 1380 + }, + { + "epoch": 0.44741289128510503, + "grad_norm": 102.11526447982123, + "learning_rate": 8.882002575660013e-07, + "logits/chosen": 0.927258312702179, + "logits/rejected": 0.85107421875, + "logps/chosen": -410.95001220703125, + "logps/rejected": -354.3999938964844, + "loss": 0.5885, + "rewards/accuracies": 0.65625, + "rewards/chosen": 1.058325171470642, + "rewards/margins": 0.917407214641571, + "rewards/rejected": 0.14018554985523224, + "step": 1390 + }, + { + "epoch": 0.45063168906413453, + "grad_norm": 66.8366894519166, + "learning_rate": 8.873953638119768e-07, + "logits/chosen": 0.896618664264679, + "logits/rejected": 0.8875488042831421, + "logps/chosen": -376.9375, + "logps/rejected": -343.3374938964844, + "loss": 0.5348, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.1851043701171875, + "rewards/margins": 0.7641235589981079, + "rewards/rejected": 0.4222961366176605, + "step": 1400 + }, + { + "epoch": 0.4538504868431641, + "grad_norm": 90.61296739151915, + "learning_rate": 8.865904700579523e-07, + "logits/chosen": 0.9149414300918579, + "logits/rejected": 0.827136218547821, + "logps/chosen": -379.82501220703125, + "logps/rejected": -325.125, + "loss": 0.591, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 1.011376976966858, + "rewards/margins": 0.8319457769393921, + "rewards/rejected": 0.17889404296875, + "step": 1410 + }, + { + "epoch": 0.4570692846221936, + "grad_norm": 196.78558008935326, + "learning_rate": 8.857855763039278e-07, + "logits/chosen": 0.69482421875, + "logits/rejected": 0.713836669921875, + "logps/chosen": -382.32501220703125, + "logps/rejected": -350.5874938964844, + "loss": 0.6389, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.7297118902206421, + "rewards/margins": 0.713134765625, + "rewards/rejected": 0.01707153394818306, + "step": 1420 + }, + { + "epoch": 0.46028808240122315, + "grad_norm": 143.668703179847, + "learning_rate": 8.849806825499033e-07, + "logits/chosen": 0.6834716796875, + "logits/rejected": 0.773144543170929, + "logps/chosen": -316.36248779296875, + "logps/rejected": -330.2250061035156, + "loss": 0.5317, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6270507574081421, + "rewards/margins": 0.895800769329071, + "rewards/rejected": -0.26994627714157104, + "step": 1430 + }, + { + "epoch": 0.46350688018025266, + "grad_norm": 118.95252665554322, + "learning_rate": 8.841757887958789e-07, + "logits/chosen": 0.8169311285018921, + "logits/rejected": 0.8177124261856079, + "logps/chosen": -434.04998779296875, + "logps/rejected": -405.95001220703125, + "loss": 0.5168, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.669238269329071, + "rewards/margins": 0.9462035894393921, + "rewards/rejected": -0.2770752012729645, + "step": 1440 + }, + { + "epoch": 0.4667256779592822, + "grad_norm": 130.46406284062596, + "learning_rate": 8.833708950418545e-07, + "logits/chosen": 0.772290050983429, + "logits/rejected": 0.756518542766571, + "logps/chosen": -418.54998779296875, + "logps/rejected": -357.76251220703125, + "loss": 0.6018, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.875659167766571, + "rewards/margins": 0.8941650390625, + "rewards/rejected": -0.01926269568502903, + "step": 1450 + }, + { + "epoch": 0.4699444757383117, + "grad_norm": 80.23990117322506, + "learning_rate": 8.8256600128783e-07, + "logits/chosen": 0.7005370855331421, + "logits/rejected": 0.7146240472793579, + "logps/chosen": -445.54998779296875, + "logps/rejected": -394.8999938964844, + "loss": 0.5956, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.8713623285293579, + "rewards/margins": 0.9448486566543579, + "rewards/rejected": -0.07421569526195526, + "step": 1460 + }, + { + "epoch": 0.4731632735173413, + "grad_norm": 140.18916921057345, + "learning_rate": 8.817611075338055e-07, + "logits/chosen": 0.803881824016571, + "logits/rejected": 0.7230224609375, + "logps/chosen": -453.0249938964844, + "logps/rejected": -382.0249938964844, + "loss": 0.5921, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.731823742389679, + "rewards/margins": 0.940930187702179, + "rewards/rejected": -0.20916748046875, + "step": 1470 + }, + { + "epoch": 0.4763820712963708, + "grad_norm": 150.90532174491113, + "learning_rate": 8.809562137797811e-07, + "logits/chosen": 0.618115246295929, + "logits/rejected": 0.7041991949081421, + "logps/chosen": -408.1000061035156, + "logps/rejected": -388.8999938964844, + "loss": 0.6422, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.7273925542831421, + "rewards/margins": 0.758190929889679, + "rewards/rejected": -0.03084106370806694, + "step": 1480 + }, + { + "epoch": 0.47960086907540034, + "grad_norm": 110.84050141948377, + "learning_rate": 8.801513200257566e-07, + "logits/chosen": 0.760009765625, + "logits/rejected": 0.69610595703125, + "logps/chosen": -439.9750061035156, + "logps/rejected": -381.75, + "loss": 0.5558, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.6460205316543579, + "rewards/margins": 0.88037109375, + "rewards/rejected": -0.23570556938648224, + "step": 1490 + }, + { + "epoch": 0.4828196668544299, + "grad_norm": 86.63252004392773, + "learning_rate": 8.79346426271732e-07, + "logits/chosen": 0.627197265625, + "logits/rejected": 0.6641601324081421, + "logps/chosen": -393.45001220703125, + "logps/rejected": -377.70001220703125, + "loss": 0.5164, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.50653076171875, + "rewards/margins": 1.0450561046600342, + "rewards/rejected": -0.5389648675918579, + "step": 1500 + }, + { + "epoch": 0.4860384646334594, + "grad_norm": 134.43895354601642, + "learning_rate": 8.785415325177076e-07, + "logits/chosen": 0.565991222858429, + "logits/rejected": 0.509246826171875, + "logps/chosen": -409.5874938964844, + "logps/rejected": -351.82501220703125, + "loss": 0.6009, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.23100586235523224, + "rewards/margins": 0.6974121332168579, + "rewards/rejected": -0.4656127989292145, + "step": 1510 + }, + { + "epoch": 0.48925726241248896, + "grad_norm": 143.47506140856004, + "learning_rate": 8.777366387636832e-07, + "logits/chosen": 0.473226934671402, + "logits/rejected": 0.42888182401657104, + "logps/chosen": -444.75, + "logps/rejected": -377.61248779296875, + "loss": 0.5921, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.256256103515625, + "rewards/margins": 0.9751952886581421, + "rewards/rejected": -0.7195678949356079, + "step": 1520 + }, + { + "epoch": 0.49247606019151846, + "grad_norm": 116.3816203192845, + "learning_rate": 8.769317450096587e-07, + "logits/chosen": 0.7764892578125, + "logits/rejected": 0.790771484375, + "logps/chosen": -421.67498779296875, + "logps/rejected": -374.32501220703125, + "loss": 0.6026, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.4445556700229645, + "rewards/margins": 1.0240478515625, + "rewards/rejected": -0.5795654058456421, + "step": 1530 + }, + { + "epoch": 0.495694857970548, + "grad_norm": 104.04412437828896, + "learning_rate": 8.761268512556343e-07, + "logits/chosen": 0.8881469964981079, + "logits/rejected": 0.814868152141571, + "logps/chosen": -437.48748779296875, + "logps/rejected": -373.3999938964844, + "loss": 0.7044, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.812670886516571, + "rewards/margins": 0.818225085735321, + "rewards/rejected": -0.005847168155014515, + "step": 1540 + }, + { + "epoch": 0.4989136557495775, + "grad_norm": 120.77289324320174, + "learning_rate": 8.753219575016098e-07, + "logits/chosen": 0.9123290777206421, + "logits/rejected": 0.9419921636581421, + "logps/chosen": -382.29998779296875, + "logps/rejected": -346.57501220703125, + "loss": 0.6153, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.0294067859649658, + "rewards/margins": 0.862597644329071, + "rewards/rejected": 0.16740722954273224, + "step": 1550 + }, + { + "epoch": 0.502132453528607, + "grad_norm": 151.73119847699107, + "learning_rate": 8.745170637475853e-07, + "logits/chosen": 0.735705554485321, + "logits/rejected": 0.731555163860321, + "logps/chosen": -386.7124938964844, + "logps/rejected": -354.2875061035156, + "loss": 0.5871, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 1.0596923828125, + "rewards/margins": 0.9839843511581421, + "rewards/rejected": 0.07494506984949112, + "step": 1560 + }, + { + "epoch": 0.5053512513076366, + "grad_norm": 160.17116042102248, + "learning_rate": 8.737121699935607e-07, + "logits/chosen": 0.8709869384765625, + "logits/rejected": 0.900463879108429, + "logps/chosen": -397.3999938964844, + "logps/rejected": -355.53125, + "loss": 0.632, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.705578625202179, + "rewards/margins": 0.68756103515625, + "rewards/rejected": 0.01857910118997097, + "step": 1570 + }, + { + "epoch": 0.5085700490866661, + "grad_norm": 124.34101987575076, + "learning_rate": 8.729072762395363e-07, + "logits/chosen": 0.871508777141571, + "logits/rejected": 0.8348144292831421, + "logps/chosen": -421.8374938964844, + "logps/rejected": -351.6625061035156, + "loss": 0.4979, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 1.004980444908142, + "rewards/margins": 1.195593237876892, + "rewards/rejected": -0.19039306044578552, + "step": 1580 + }, + { + "epoch": 0.5117888468656957, + "grad_norm": 107.10040223420316, + "learning_rate": 8.721023824855118e-07, + "logits/chosen": 0.6812988519668579, + "logits/rejected": 0.6584106683731079, + "logps/chosen": -424.38751220703125, + "logps/rejected": -427.23748779296875, + "loss": 0.4458, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.781695544719696, + "rewards/margins": 1.299768090248108, + "rewards/rejected": -0.5179198980331421, + "step": 1590 + }, + { + "epoch": 0.5150076446447251, + "grad_norm": 150.0786746263981, + "learning_rate": 8.712974887314874e-07, + "logits/chosen": 0.6302245855331421, + "logits/rejected": 0.6790405511856079, + "logps/chosen": -356.42498779296875, + "logps/rejected": -359.48748779296875, + "loss": 0.6106, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4437927305698395, + "rewards/margins": 0.9345703125, + "rewards/rejected": -0.49114990234375, + "step": 1600 + }, + { + "epoch": 0.5182264424237547, + "grad_norm": 117.12966364280413, + "learning_rate": 8.70492594977463e-07, + "logits/chosen": 0.5621337890625, + "logits/rejected": 0.56689453125, + "logps/chosen": -417.70001220703125, + "logps/rejected": -406.61248779296875, + "loss": 0.6712, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.21570435166358948, + "rewards/margins": 0.886181652545929, + "rewards/rejected": -0.6698883175849915, + "step": 1610 + }, + { + "epoch": 0.5214452402027843, + "grad_norm": 125.29787575815318, + "learning_rate": 8.696877012234385e-07, + "logits/chosen": 0.7653793096542358, + "logits/rejected": 0.783801257610321, + "logps/chosen": -372.1000061035156, + "logps/rejected": -348.42498779296875, + "loss": 0.5749, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4317626953125, + "rewards/margins": 0.8670898675918579, + "rewards/rejected": -0.43513184785842896, + "step": 1620 + }, + { + "epoch": 0.5246640379818138, + "grad_norm": 70.97165192105194, + "learning_rate": 8.68882807469414e-07, + "logits/chosen": 0.833361804485321, + "logits/rejected": 0.780957043170929, + "logps/chosen": -380.4375, + "logps/rejected": -365.6000061035156, + "loss": 0.5989, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.586438000202179, + "rewards/margins": 0.814990222454071, + "rewards/rejected": -0.22798767685890198, + "step": 1630 + }, + { + "epoch": 0.5278828357608433, + "grad_norm": 130.83237472917816, + "learning_rate": 8.680779137153895e-07, + "logits/chosen": 0.744580090045929, + "logits/rejected": 0.703723132610321, + "logps/chosen": -436.20001220703125, + "logps/rejected": -421.29998779296875, + "loss": 0.6074, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.81988525390625, + "rewards/margins": 0.962353527545929, + "rewards/rejected": -0.14262695610523224, + "step": 1640 + }, + { + "epoch": 0.5311016335398728, + "grad_norm": 87.82166710436096, + "learning_rate": 8.67273019961365e-07, + "logits/chosen": 0.709136962890625, + "logits/rejected": 0.5900970697402954, + "logps/chosen": -405.9750061035156, + "logps/rejected": -369.1000061035156, + "loss": 0.5631, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.20867919921875, + "rewards/margins": 1.088830590248108, + "rewards/rejected": -0.879656970500946, + "step": 1650 + }, + { + "epoch": 0.5343204313189024, + "grad_norm": 127.90547265729465, + "learning_rate": 8.664681262073405e-07, + "logits/chosen": 0.7460571527481079, + "logits/rejected": 0.787768542766571, + "logps/chosen": -379.5, + "logps/rejected": -342.25, + "loss": 0.6818, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09000243991613388, + "rewards/margins": 0.68060302734375, + "rewards/rejected": -0.7704406976699829, + "step": 1660 + }, + { + "epoch": 0.537539229097932, + "grad_norm": 125.2981919344941, + "learning_rate": 8.656632324533161e-07, + "logits/chosen": 0.71905517578125, + "logits/rejected": 0.5973297357559204, + "logps/chosen": -416.45001220703125, + "logps/rejected": -355.70001220703125, + "loss": 0.5934, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01535644568502903, + "rewards/margins": 0.8395141363143921, + "rewards/rejected": -0.8243957757949829, + "step": 1670 + }, + { + "epoch": 0.5407580268769615, + "grad_norm": 143.6084153269009, + "learning_rate": 8.648583386992917e-07, + "logits/chosen": 0.734936535358429, + "logits/rejected": 0.6809844970703125, + "logps/chosen": -410.32501220703125, + "logps/rejected": -369.23748779296875, + "loss": 0.6539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07265014946460724, + "rewards/margins": 0.7343384027481079, + "rewards/rejected": -0.662609875202179, + "step": 1680 + }, + { + "epoch": 0.543976824655991, + "grad_norm": 121.50378977426598, + "learning_rate": 8.640534449452672e-07, + "logits/chosen": 0.886584460735321, + "logits/rejected": 0.8672119379043579, + "logps/chosen": -396.5249938964844, + "logps/rejected": -362.20001220703125, + "loss": 0.679, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4881530702114105, + "rewards/margins": 0.6416015625, + "rewards/rejected": -0.1527099609375, + "step": 1690 + }, + { + "epoch": 0.5471956224350205, + "grad_norm": 72.60780149951601, + "learning_rate": 8.632485511912428e-07, + "logits/chosen": 0.9634643793106079, + "logits/rejected": 0.910473644733429, + "logps/chosen": -408.42498779296875, + "logps/rejected": -366.33123779296875, + "loss": 0.5302, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.9645324945449829, + "rewards/margins": 1.031396508216858, + "rewards/rejected": -0.06760253757238388, + "step": 1700 + }, + { + "epoch": 0.5504144202140501, + "grad_norm": 111.68028396062127, + "learning_rate": 8.624436574372183e-07, + "logits/chosen": 0.8245300054550171, + "logits/rejected": 0.7607055902481079, + "logps/chosen": -375.6499938964844, + "logps/rejected": -336.57501220703125, + "loss": 0.6782, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.7129181027412415, + "rewards/margins": 0.611157238483429, + "rewards/rejected": 0.1020965576171875, + "step": 1710 + }, + { + "epoch": 0.5536332179930796, + "grad_norm": 130.84753254060564, + "learning_rate": 8.616387636831937e-07, + "logits/chosen": 0.862133800983429, + "logits/rejected": 0.797778308391571, + "logps/chosen": -382.6625061035156, + "logps/rejected": -356.5625, + "loss": 0.5687, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.3941894471645355, + "rewards/margins": 0.905017077922821, + "rewards/rejected": -0.51141357421875, + "step": 1720 + }, + { + "epoch": 0.5568520157721091, + "grad_norm": 102.2335666751953, + "learning_rate": 8.608338699291693e-07, + "logits/chosen": 0.8003784418106079, + "logits/rejected": 0.7295898199081421, + "logps/chosen": -379.2250061035156, + "logps/rejected": -367.625, + "loss": 0.6802, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.35570067167282104, + "rewards/margins": 0.791003406047821, + "rewards/rejected": -0.43532103300094604, + "step": 1730 + }, + { + "epoch": 0.5600708135511386, + "grad_norm": 135.9224090318731, + "learning_rate": 8.600289761751448e-07, + "logits/chosen": 0.7213379144668579, + "logits/rejected": 0.8390868902206421, + "logps/chosen": -390.67498779296875, + "logps/rejected": -355.5874938964844, + "loss": 0.6148, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.37620848417282104, + "rewards/margins": 0.959301769733429, + "rewards/rejected": -0.582775890827179, + "step": 1740 + }, + { + "epoch": 0.5632896113301682, + "grad_norm": 121.2618422027483, + "learning_rate": 8.592240824211203e-07, + "logits/chosen": 0.6652587652206421, + "logits/rejected": 0.62054443359375, + "logps/chosen": -399.67498779296875, + "logps/rejected": -366.0, + "loss": 0.5758, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.730364978313446, + "rewards/margins": 0.93804931640625, + "rewards/rejected": -0.20653076469898224, + "step": 1750 + }, + { + "epoch": 0.5665084091091978, + "grad_norm": 145.89091477768014, + "learning_rate": 8.58419188667096e-07, + "logits/chosen": 0.715618908405304, + "logits/rejected": 0.7235107421875, + "logps/chosen": -374.3500061035156, + "logps/rejected": -378.04998779296875, + "loss": 0.6549, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.501391589641571, + "rewards/margins": 0.6806885004043579, + "rewards/rejected": -0.179656982421875, + "step": 1760 + }, + { + "epoch": 0.5697272068882272, + "grad_norm": 132.45157262497162, + "learning_rate": 8.576142949130715e-07, + "logits/chosen": 0.7034912109375, + "logits/rejected": 0.6666595339775085, + "logps/chosen": -410.63751220703125, + "logps/rejected": -358.9375, + "loss": 0.7691, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.14030762016773224, + "rewards/margins": 0.5527282953262329, + "rewards/rejected": -0.4123901426792145, + "step": 1770 + }, + { + "epoch": 0.5729460046672568, + "grad_norm": 138.9684312327667, + "learning_rate": 8.56809401159047e-07, + "logits/chosen": 0.6848999261856079, + "logits/rejected": 0.64508056640625, + "logps/chosen": -404.125, + "logps/rejected": -395.29998779296875, + "loss": 0.6414, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.4085021913051605, + "rewards/margins": 0.8033202886581421, + "rewards/rejected": -0.395498663187027, + "step": 1780 + }, + { + "epoch": 0.5761648024462863, + "grad_norm": 110.5056387135016, + "learning_rate": 8.560045074050225e-07, + "logits/chosen": 0.766918957233429, + "logits/rejected": 0.809466540813446, + "logps/chosen": -430.25, + "logps/rejected": -354.2749938964844, + "loss": 0.539, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2895568907260895, + "rewards/margins": 0.93310546875, + "rewards/rejected": -0.64385986328125, + "step": 1790 + }, + { + "epoch": 0.5793836002253159, + "grad_norm": 129.73769953987048, + "learning_rate": 8.55199613650998e-07, + "logits/chosen": 0.7332763671875, + "logits/rejected": 0.7737671136856079, + "logps/chosen": -390.4375, + "logps/rejected": -351.2250061035156, + "loss": 0.565, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.4941650331020355, + "rewards/margins": 0.906628429889679, + "rewards/rejected": -0.4120239317417145, + "step": 1800 + }, + { + "epoch": 0.5826023980043453, + "grad_norm": 115.37600525140569, + "learning_rate": 8.543947198969735e-07, + "logits/chosen": 0.748613715171814, + "logits/rejected": 0.6977081298828125, + "logps/chosen": -437.2875061035156, + "logps/rejected": -355.36248779296875, + "loss": 0.6783, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.47144776582717896, + "rewards/margins": 0.8226562738418579, + "rewards/rejected": -0.35110169649124146, + "step": 1810 + }, + { + "epoch": 0.5858211957833749, + "grad_norm": 155.6124072215637, + "learning_rate": 8.535898261429491e-07, + "logits/chosen": 0.7040039300918579, + "logits/rejected": 0.7454589605331421, + "logps/chosen": -419.0375061035156, + "logps/rejected": -386.1499938964844, + "loss": 0.5766, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.714385986328125, + "rewards/margins": 0.99261474609375, + "rewards/rejected": -0.2774414122104645, + "step": 1820 + }, + { + "epoch": 0.5890399935624044, + "grad_norm": 121.92097042258222, + "learning_rate": 8.527849323889246e-07, + "logits/chosen": 0.7954467535018921, + "logits/rejected": 0.7120422124862671, + "logps/chosen": -409.6000061035156, + "logps/rejected": -375.70001220703125, + "loss": 0.62, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.5656493902206421, + "rewards/margins": 0.8524109125137329, + "rewards/rejected": -0.28562623262405396, + "step": 1830 + }, + { + "epoch": 0.592258791341434, + "grad_norm": 103.24339268911461, + "learning_rate": 8.519800386349002e-07, + "logits/chosen": 0.828540027141571, + "logits/rejected": 0.7642456293106079, + "logps/chosen": -439.0, + "logps/rejected": -399.875, + "loss": 0.701, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.523913562297821, + "rewards/margins": 0.7384399175643921, + "rewards/rejected": -0.21412353217601776, + "step": 1840 + }, + { + "epoch": 0.5954775891204636, + "grad_norm": 108.4009599783067, + "learning_rate": 8.511751448808758e-07, + "logits/chosen": 0.784069836139679, + "logits/rejected": 0.8148406744003296, + "logps/chosen": -368.26251220703125, + "logps/rejected": -369.25, + "loss": 0.5603, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.46116334199905396, + "rewards/margins": 0.8693603277206421, + "rewards/rejected": -0.40825194120407104, + "step": 1850 + }, + { + "epoch": 0.598696386899493, + "grad_norm": 117.95591271658638, + "learning_rate": 8.503702511268513e-07, + "logits/chosen": 0.578826904296875, + "logits/rejected": 0.598437488079071, + "logps/chosen": -391.5249938964844, + "logps/rejected": -364.25, + "loss": 0.5513, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.32760924100875854, + "rewards/margins": 1.10711669921875, + "rewards/rejected": -0.779217541217804, + "step": 1860 + }, + { + "epoch": 0.6019151846785226, + "grad_norm": 88.74957129686479, + "learning_rate": 8.495653573728267e-07, + "logits/chosen": 0.6682800054550171, + "logits/rejected": 0.5819336175918579, + "logps/chosen": -400.17498779296875, + "logps/rejected": -335.2250061035156, + "loss": 0.5826, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.24286499619483948, + "rewards/margins": 1.003076195716858, + "rewards/rejected": -0.7604125738143921, + "step": 1870 + }, + { + "epoch": 0.6051339824575521, + "grad_norm": 124.47890865640196, + "learning_rate": 8.487604636188022e-07, + "logits/chosen": 0.5824432373046875, + "logits/rejected": 0.5376129150390625, + "logps/chosen": -446.95001220703125, + "logps/rejected": -375.3374938964844, + "loss": 0.6671, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20930175483226776, + "rewards/margins": 0.7833007574081421, + "rewards/rejected": -0.5737396478652954, + "step": 1880 + }, + { + "epoch": 0.6083527802365817, + "grad_norm": 146.84178076960035, + "learning_rate": 8.479555698647778e-07, + "logits/chosen": 0.665332019329071, + "logits/rejected": 0.677075207233429, + "logps/chosen": -376.1875, + "logps/rejected": -353.6499938964844, + "loss": 0.5905, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4824462831020355, + "rewards/margins": 0.876934826374054, + "rewards/rejected": -0.39520263671875, + "step": 1890 + }, + { + "epoch": 0.6115715780156111, + "grad_norm": 54.51866820946569, + "learning_rate": 8.471506761107533e-07, + "logits/chosen": 0.774121105670929, + "logits/rejected": 0.766308605670929, + "logps/chosen": -416.2250061035156, + "logps/rejected": -382.5874938964844, + "loss": 0.6021, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.5503906011581421, + "rewards/margins": 0.9388427734375, + "rewards/rejected": -0.3884521424770355, + "step": 1900 + }, + { + "epoch": 0.6147903757946407, + "grad_norm": 128.8301007355504, + "learning_rate": 8.463457823567288e-07, + "logits/chosen": 0.604901134967804, + "logits/rejected": 0.6837097406387329, + "logps/chosen": -361.7250061035156, + "logps/rejected": -390.25, + "loss": 0.6895, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.580090343952179, + "rewards/margins": 0.6656249761581421, + "rewards/rejected": -0.08521728217601776, + "step": 1910 + }, + { + "epoch": 0.6180091735736702, + "grad_norm": 106.0837684527091, + "learning_rate": 8.455408886027045e-07, + "logits/chosen": 0.7379394769668579, + "logits/rejected": 0.5441039800643921, + "logps/chosen": -441.26251220703125, + "logps/rejected": -375.2250061035156, + "loss": 0.6674, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.5667480230331421, + "rewards/margins": 0.980395495891571, + "rewards/rejected": -0.41374510526657104, + "step": 1920 + }, + { + "epoch": 0.6212279713526998, + "grad_norm": 80.9103510866508, + "learning_rate": 8.4473599484868e-07, + "logits/chosen": 0.5796447992324829, + "logits/rejected": 0.5673156976699829, + "logps/chosen": -429.20001220703125, + "logps/rejected": -391.3125, + "loss": 0.6026, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3130508363246918, + "rewards/margins": 0.8823181390762329, + "rewards/rejected": -0.56915283203125, + "step": 1930 + }, + { + "epoch": 0.6244467691317293, + "grad_norm": 123.03195647704065, + "learning_rate": 8.439311010946554e-07, + "logits/chosen": 0.4520828127861023, + "logits/rejected": 0.4125503599643707, + "logps/chosen": -406.0, + "logps/rejected": -385.125, + "loss": 0.5829, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.027679443359375, + "rewards/margins": 1.0650146007537842, + "rewards/rejected": -1.036860704421997, + "step": 1940 + }, + { + "epoch": 0.6276655669107588, + "grad_norm": 145.83432689260442, + "learning_rate": 8.43126207340631e-07, + "logits/chosen": 0.5933898687362671, + "logits/rejected": 0.49702757596969604, + "logps/chosen": -454.79998779296875, + "logps/rejected": -372.9375, + "loss": 0.4794, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.32109373807907104, + "rewards/margins": 1.3983643054962158, + "rewards/rejected": -1.077539086341858, + "step": 1950 + }, + { + "epoch": 0.6308843646897884, + "grad_norm": 127.26459899477062, + "learning_rate": 8.423213135866065e-07, + "logits/chosen": 0.4945434629917145, + "logits/rejected": 0.500195324420929, + "logps/chosen": -350.86248779296875, + "logps/rejected": -343.6000061035156, + "loss": 0.5648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2679199278354645, + "rewards/margins": 0.9600830078125, + "rewards/rejected": -0.6918395757675171, + "step": 1960 + }, + { + "epoch": 0.6341031624688179, + "grad_norm": 109.64312607132199, + "learning_rate": 8.41516419832582e-07, + "logits/chosen": 0.606396496295929, + "logits/rejected": 0.552600085735321, + "logps/chosen": -448.79998779296875, + "logps/rejected": -414.2749938964844, + "loss": 0.5654, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5818389654159546, + "rewards/margins": 1.0782470703125, + "rewards/rejected": -0.4966796934604645, + "step": 1970 + }, + { + "epoch": 0.6373219602478474, + "grad_norm": 96.8795670450314, + "learning_rate": 8.407115260785576e-07, + "logits/chosen": 0.564135730266571, + "logits/rejected": 0.538177490234375, + "logps/chosen": -394.92498779296875, + "logps/rejected": -376.92498779296875, + "loss": 0.5573, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.474832147359848, + "rewards/margins": 0.9240967035293579, + "rewards/rejected": -0.448974609375, + "step": 1980 + }, + { + "epoch": 0.6405407580268769, + "grad_norm": 141.50123190438515, + "learning_rate": 8.399066323245331e-07, + "logits/chosen": 0.5654662847518921, + "logits/rejected": 0.575390636920929, + "logps/chosen": -407.25, + "logps/rejected": -353.8500061035156, + "loss": 0.6543, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.4442138671875, + "rewards/margins": 0.9124755859375, + "rewards/rejected": -0.46831053495407104, + "step": 1990 + }, + { + "epoch": 0.6437595558059065, + "grad_norm": 108.24444219962213, + "learning_rate": 8.391017385705087e-07, + "logits/chosen": 0.661572277545929, + "logits/rejected": 0.671826183795929, + "logps/chosen": -388.23748779296875, + "logps/rejected": -348.73126220703125, + "loss": 0.5782, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.6115478277206421, + "rewards/margins": 0.9420531988143921, + "rewards/rejected": -0.33001708984375, + "step": 2000 + }, + { + "epoch": 0.646978353584936, + "grad_norm": 139.76781309597646, + "learning_rate": 8.382968448164842e-07, + "logits/chosen": 0.7258850336074829, + "logits/rejected": 0.6925048828125, + "logps/chosen": -409.4750061035156, + "logps/rejected": -372.75, + "loss": 0.6747, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.767504870891571, + "rewards/margins": 0.708026111125946, + "rewards/rejected": 0.05900878831744194, + "step": 2010 + }, + { + "epoch": 0.6501971513639656, + "grad_norm": 118.07613411129167, + "learning_rate": 8.374919510624597e-07, + "logits/chosen": 0.7453979253768921, + "logits/rejected": 0.7837280035018921, + "logps/chosen": -389.4125061035156, + "logps/rejected": -365.8500061035156, + "loss": 0.6421, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5404052734375, + "rewards/margins": 0.9052978754043579, + "rewards/rejected": -0.36555176973342896, + "step": 2020 + }, + { + "epoch": 0.6534159491429951, + "grad_norm": 129.06304734267533, + "learning_rate": 8.366870573084352e-07, + "logits/chosen": 0.7041991949081421, + "logits/rejected": 0.7592560052871704, + "logps/chosen": -360.2749938964844, + "logps/rejected": -369.86248779296875, + "loss": 0.5216, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.30781251192092896, + "rewards/margins": 1.0836303234100342, + "rewards/rejected": -0.7762451171875, + "step": 2030 + }, + { + "epoch": 0.6566347469220246, + "grad_norm": 123.44556440768211, + "learning_rate": 8.358821635544108e-07, + "logits/chosen": 0.65478515625, + "logits/rejected": 0.6085449457168579, + "logps/chosen": -425.6499938964844, + "logps/rejected": -380.92498779296875, + "loss": 0.6007, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.4173431396484375, + "rewards/margins": 1.058349609375, + "rewards/rejected": -0.641064465045929, + "step": 2040 + }, + { + "epoch": 0.6598535447010542, + "grad_norm": 115.66975239748679, + "learning_rate": 8.350772698003863e-07, + "logits/chosen": 0.59222412109375, + "logits/rejected": 0.5804992914199829, + "logps/chosen": -368.92498779296875, + "logps/rejected": -325.63751220703125, + "loss": 0.7096, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.19013671576976776, + "rewards/margins": 0.590624988079071, + "rewards/rejected": -0.3997802734375, + "step": 2050 + }, + { + "epoch": 0.6630723424800837, + "grad_norm": 120.88639287995744, + "learning_rate": 8.342723760463618e-07, + "logits/chosen": 0.7411285638809204, + "logits/rejected": 0.794604480266571, + "logps/chosen": -370.98748779296875, + "logps/rejected": -355.57501220703125, + "loss": 0.6419, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.456878662109375, + "rewards/margins": 0.829211413860321, + "rewards/rejected": -0.371856689453125, + "step": 2060 + }, + { + "epoch": 0.6662911402591132, + "grad_norm": 98.39827045433174, + "learning_rate": 8.334674822923374e-07, + "logits/chosen": 0.694226086139679, + "logits/rejected": 0.6204894781112671, + "logps/chosen": -403.3125, + "logps/rejected": -350.95001220703125, + "loss": 0.5489, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.48541259765625, + "rewards/margins": 0.854632556438446, + "rewards/rejected": -0.3693908751010895, + "step": 2070 + }, + { + "epoch": 0.6695099380381427, + "grad_norm": 107.87893659304308, + "learning_rate": 8.32662588538313e-07, + "logits/chosen": 0.6167968511581421, + "logits/rejected": 0.5534301996231079, + "logps/chosen": -408.95001220703125, + "logps/rejected": -395.32501220703125, + "loss": 0.6168, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.535107433795929, + "rewards/margins": 0.983349621295929, + "rewards/rejected": -0.4478088319301605, + "step": 2080 + }, + { + "epoch": 0.6727287358171723, + "grad_norm": 182.62264045525205, + "learning_rate": 8.318576947842884e-07, + "logits/chosen": 0.7122436761856079, + "logits/rejected": 0.6259094476699829, + "logps/chosen": -435.48748779296875, + "logps/rejected": -381.70001220703125, + "loss": 0.6728, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.44140625, + "rewards/margins": 0.750537097454071, + "rewards/rejected": -0.3092285096645355, + "step": 2090 + }, + { + "epoch": 0.6759475335962019, + "grad_norm": 101.4854449112291, + "learning_rate": 8.31052801030264e-07, + "logits/chosen": 0.8445678949356079, + "logits/rejected": 0.7760986089706421, + "logps/chosen": -362.07501220703125, + "logps/rejected": -334.57501220703125, + "loss": 0.6203, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.47640228271484375, + "rewards/margins": 0.7912048101425171, + "rewards/rejected": -0.3154968321323395, + "step": 2100 + }, + { + "epoch": 0.6791663313752313, + "grad_norm": 97.54215728042183, + "learning_rate": 8.302479072762395e-07, + "logits/chosen": 0.6719558835029602, + "logits/rejected": 0.6254943609237671, + "logps/chosen": -434.57501220703125, + "logps/rejected": -386.0, + "loss": 0.6168, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4815307557582855, + "rewards/margins": 0.963787853717804, + "rewards/rejected": -0.48268431425094604, + "step": 2110 + }, + { + "epoch": 0.6823851291542609, + "grad_norm": 110.35789011219163, + "learning_rate": 8.29443013522215e-07, + "logits/chosen": 0.6293426752090454, + "logits/rejected": 0.609759509563446, + "logps/chosen": -401.1000061035156, + "logps/rejected": -370.79998779296875, + "loss": 0.5639, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.2870620787143707, + "rewards/margins": 1.098107933998108, + "rewards/rejected": -0.811572253704071, + "step": 2120 + }, + { + "epoch": 0.6856039269332904, + "grad_norm": 162.3629161541313, + "learning_rate": 8.286381197681906e-07, + "logits/chosen": 0.6685333251953125, + "logits/rejected": 0.587536633014679, + "logps/chosen": -404.8999938964844, + "logps/rejected": -360.70001220703125, + "loss": 0.5394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.27323609590530396, + "rewards/margins": 1.1164062023162842, + "rewards/rejected": -0.840893566608429, + "step": 2130 + }, + { + "epoch": 0.68882272471232, + "grad_norm": 130.8284965243104, + "learning_rate": 8.278332260141661e-07, + "logits/chosen": 0.42315673828125, + "logits/rejected": 0.3591461181640625, + "logps/chosen": -407.26251220703125, + "logps/rejected": -389.6000061035156, + "loss": 0.6019, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.174519345164299, + "rewards/margins": 0.889697253704071, + "rewards/rejected": -0.715344250202179, + "step": 2140 + }, + { + "epoch": 0.6920415224913494, + "grad_norm": 113.28782525108359, + "learning_rate": 8.270283322601416e-07, + "logits/chosen": 0.6356201171875, + "logits/rejected": 0.515185534954071, + "logps/chosen": -435.79998779296875, + "logps/rejected": -401.2749938964844, + "loss": 0.5985, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07464599609375, + "rewards/margins": 1.014196753501892, + "rewards/rejected": -0.9403076171875, + "step": 2150 + }, + { + "epoch": 0.695260320270379, + "grad_norm": 106.39390288784632, + "learning_rate": 8.262234385061172e-07, + "logits/chosen": 0.7736572027206421, + "logits/rejected": 0.6968658566474915, + "logps/chosen": -443.04998779296875, + "logps/rejected": -367.45001220703125, + "loss": 0.5643, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.297332763671875, + "rewards/margins": 1.076171875, + "rewards/rejected": -0.7778075933456421, + "step": 2160 + }, + { + "epoch": 0.6984791180494085, + "grad_norm": 167.53541838414873, + "learning_rate": 8.254185447520927e-07, + "logits/chosen": 0.62664794921875, + "logits/rejected": 0.57122802734375, + "logps/chosen": -429.57501220703125, + "logps/rejected": -368.6499938964844, + "loss": 0.6259, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.44538575410842896, + "rewards/margins": 0.777587890625, + "rewards/rejected": -0.33240967988967896, + "step": 2170 + }, + { + "epoch": 0.7016979158284381, + "grad_norm": 126.47672501732542, + "learning_rate": 8.246136509980682e-07, + "logits/chosen": 0.622955322265625, + "logits/rejected": 0.52435302734375, + "logps/chosen": -379.1499938964844, + "logps/rejected": -334.7749938964844, + "loss": 0.5401, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.4681030213832855, + "rewards/margins": 0.93731689453125, + "rewards/rejected": -0.4688262939453125, + "step": 2180 + }, + { + "epoch": 0.7049167136074677, + "grad_norm": 100.5886805273311, + "learning_rate": 8.238087572440437e-07, + "logits/chosen": 0.6304519772529602, + "logits/rejected": 0.43433839082717896, + "logps/chosen": -469.4750061035156, + "logps/rejected": -385.82501220703125, + "loss": 0.5615, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.4978576600551605, + "rewards/margins": 1.0602538585662842, + "rewards/rejected": -0.5624023675918579, + "step": 2190 + }, + { + "epoch": 0.7081355113864971, + "grad_norm": 88.11585488257043, + "learning_rate": 8.230038634900193e-07, + "logits/chosen": 0.7540038824081421, + "logits/rejected": 0.7354278564453125, + "logps/chosen": -464.4750061035156, + "logps/rejected": -386.1000061035156, + "loss": 0.5103, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.45979005098342896, + "rewards/margins": 1.090368628501892, + "rewards/rejected": -0.630963146686554, + "step": 2200 + }, + { + "epoch": 0.7113543091655267, + "grad_norm": 150.59937462180037, + "learning_rate": 8.221989697359948e-07, + "logits/chosen": 0.684985339641571, + "logits/rejected": 0.5287231206893921, + "logps/chosen": -459.54998779296875, + "logps/rejected": -361.57501220703125, + "loss": 0.5698, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5333709716796875, + "rewards/margins": 0.937121570110321, + "rewards/rejected": -0.40369874238967896, + "step": 2210 + }, + { + "epoch": 0.7145731069445562, + "grad_norm": 122.95140422218229, + "learning_rate": 8.213940759819703e-07, + "logits/chosen": 0.8478027582168579, + "logits/rejected": 0.823193371295929, + "logps/chosen": -382.3812561035156, + "logps/rejected": -334.29998779296875, + "loss": 0.6209, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.678070068359375, + "rewards/margins": 0.7892211675643921, + "rewards/rejected": -0.11069031059741974, + "step": 2220 + }, + { + "epoch": 0.7177919047235858, + "grad_norm": 118.41643678016068, + "learning_rate": 8.205891822279458e-07, + "logits/chosen": 0.728808581829071, + "logits/rejected": 0.665484607219696, + "logps/chosen": -442.29998779296875, + "logps/rejected": -375.625, + "loss": 0.5948, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8303588628768921, + "rewards/margins": 1.0003662109375, + "rewards/rejected": -0.16900023818016052, + "step": 2230 + }, + { + "epoch": 0.7210107025026152, + "grad_norm": 144.0626215988013, + "learning_rate": 8.197842884739214e-07, + "logits/chosen": 0.7560364007949829, + "logits/rejected": 0.626635730266571, + "logps/chosen": -403.875, + "logps/rejected": -347.1000061035156, + "loss": 0.6467, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.9818359613418579, + "rewards/margins": 0.9332641363143921, + "rewards/rejected": 0.048492431640625, + "step": 2240 + }, + { + "epoch": 0.7242295002816448, + "grad_norm": 115.57164466571824, + "learning_rate": 8.189793947198969e-07, + "logits/chosen": 0.7542022466659546, + "logits/rejected": 0.791552722454071, + "logps/chosen": -393.8999938964844, + "logps/rejected": -390.67498779296875, + "loss": 0.6374, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 1.1778686046600342, + "rewards/margins": 0.8595489263534546, + "rewards/rejected": 0.3185257017612457, + "step": 2250 + }, + { + "epoch": 0.7274482980606743, + "grad_norm": 170.98833041444232, + "learning_rate": 8.181745009658725e-07, + "logits/chosen": 0.8707031011581421, + "logits/rejected": 0.8541504144668579, + "logps/chosen": -380.95001220703125, + "logps/rejected": -349.25, + "loss": 0.6406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8625457882881165, + "rewards/margins": 0.82208251953125, + "rewards/rejected": 0.040313720703125, + "step": 2260 + }, + { + "epoch": 0.7306670958397039, + "grad_norm": 135.47584145357715, + "learning_rate": 8.17369607211848e-07, + "logits/chosen": 0.695819079875946, + "logits/rejected": 0.691271960735321, + "logps/chosen": -425.5375061035156, + "logps/rejected": -402.0249938964844, + "loss": 0.6408, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7590087652206421, + "rewards/margins": 0.744732677936554, + "rewards/rejected": 0.01366577111184597, + "step": 2270 + }, + { + "epoch": 0.7338858936187334, + "grad_norm": 85.20707579389351, + "learning_rate": 8.165647134578235e-07, + "logits/chosen": 0.7344635128974915, + "logits/rejected": 0.7755569219589233, + "logps/chosen": -410.125, + "logps/rejected": -393.9375, + "loss": 0.6372, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.8052123785018921, + "rewards/margins": 0.8388427495956421, + "rewards/rejected": -0.03415527194738388, + "step": 2280 + }, + { + "epoch": 0.7371046913977629, + "grad_norm": 113.15080714166211, + "learning_rate": 8.157598197037991e-07, + "logits/chosen": 0.8173583745956421, + "logits/rejected": 0.7352401614189148, + "logps/chosen": -391.92498779296875, + "logps/rejected": -352.13751220703125, + "loss": 0.5556, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.929699718952179, + "rewards/margins": 1.0279052257537842, + "rewards/rejected": -0.09769897162914276, + "step": 2290 + }, + { + "epoch": 0.7403234891767925, + "grad_norm": 98.51766943039499, + "learning_rate": 8.149549259497745e-07, + "logits/chosen": 0.8376709222793579, + "logits/rejected": 0.821533203125, + "logps/chosen": -401.51251220703125, + "logps/rejected": -367.5375061035156, + "loss": 0.546, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.768475353717804, + "rewards/margins": 1.0253417491912842, + "rewards/rejected": -0.257568359375, + "step": 2300 + }, + { + "epoch": 0.743542286955822, + "grad_norm": 90.65736208336023, + "learning_rate": 8.1415003219575e-07, + "logits/chosen": 0.8429321050643921, + "logits/rejected": 0.8155517578125, + "logps/chosen": -404.54998779296875, + "logps/rejected": -371.95001220703125, + "loss": 0.6812, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.36921387910842896, + "rewards/margins": 0.6606689691543579, + "rewards/rejected": -0.29119873046875, + "step": 2310 + }, + { + "epoch": 0.7467610847348516, + "grad_norm": 100.26417241593842, + "learning_rate": 8.133451384417257e-07, + "logits/chosen": 0.8605591058731079, + "logits/rejected": 0.866943359375, + "logps/chosen": -377.8999938964844, + "logps/rejected": -388.04998779296875, + "loss": 0.5957, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.517016589641571, + "rewards/margins": 1.0783202648162842, + "rewards/rejected": -0.56170654296875, + "step": 2320 + }, + { + "epoch": 0.749979882513881, + "grad_norm": 113.85767754005992, + "learning_rate": 8.125402446877012e-07, + "logits/chosen": 0.7427520751953125, + "logits/rejected": 0.792675793170929, + "logps/chosen": -365.42498779296875, + "logps/rejected": -387.45001220703125, + "loss": 0.6074, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.30488282442092896, + "rewards/margins": 0.9068603515625, + "rewards/rejected": -0.602703869342804, + "step": 2330 + }, + { + "epoch": 0.7531986802929106, + "grad_norm": 84.61998071106751, + "learning_rate": 8.117353509336767e-07, + "logits/chosen": 0.8431640863418579, + "logits/rejected": 0.8217529058456421, + "logps/chosen": -395.5249938964844, + "logps/rejected": -339.75, + "loss": 0.572, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.15087890625, + "rewards/margins": 0.9944626092910767, + "rewards/rejected": -0.8439575433731079, + "step": 2340 + }, + { + "epoch": 0.7564174780719402, + "grad_norm": 110.59911120449058, + "learning_rate": 8.109304571796523e-07, + "logits/chosen": 0.8056640625, + "logits/rejected": 0.8075927495956421, + "logps/chosen": -345.07501220703125, + "logps/rejected": -313.63751220703125, + "loss": 0.5, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4482421875, + "rewards/margins": 1.027734398841858, + "rewards/rejected": -0.579571545124054, + "step": 2350 + }, + { + "epoch": 0.7596362758509697, + "grad_norm": 150.7477024135639, + "learning_rate": 8.101255634256278e-07, + "logits/chosen": 0.799060046672821, + "logits/rejected": 0.7042602300643921, + "logps/chosen": -400.54998779296875, + "logps/rejected": -348.4125061035156, + "loss": 0.6239, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.4732421934604645, + "rewards/margins": 0.878460705280304, + "rewards/rejected": -0.40550535917282104, + "step": 2360 + }, + { + "epoch": 0.7628550736299992, + "grad_norm": 95.85631166142494, + "learning_rate": 8.093206696716033e-07, + "logits/chosen": 0.859545886516571, + "logits/rejected": 0.8189941644668579, + "logps/chosen": -398.1000061035156, + "logps/rejected": -340.07501220703125, + "loss": 0.5541, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.653247058391571, + "rewards/margins": 1.086999535560608, + "rewards/rejected": -0.4336303770542145, + "step": 2370 + }, + { + "epoch": 0.7660738714090287, + "grad_norm": 126.96509727894686, + "learning_rate": 8.085157759175788e-07, + "logits/chosen": 0.8785156011581421, + "logits/rejected": 0.9088379144668579, + "logps/chosen": -421.92498779296875, + "logps/rejected": -382.0375061035156, + "loss": 0.5951, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.987597644329071, + "rewards/margins": 1.043847680091858, + "rewards/rejected": -0.05565796047449112, + "step": 2380 + }, + { + "epoch": 0.7692926691880583, + "grad_norm": 122.85473801179835, + "learning_rate": 8.077108821635543e-07, + "logits/chosen": 0.838452160358429, + "logits/rejected": 0.738476574420929, + "logps/chosen": -415.79998779296875, + "logps/rejected": -350.125, + "loss": 0.6183, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8442962765693665, + "rewards/margins": 0.83050537109375, + "rewards/rejected": 0.01291503943502903, + "step": 2390 + }, + { + "epoch": 0.7725114669670878, + "grad_norm": 148.75696312544284, + "learning_rate": 8.069059884095299e-07, + "logits/chosen": 0.8839111328125, + "logits/rejected": 0.8441559076309204, + "logps/chosen": -393.875, + "logps/rejected": -372.70001220703125, + "loss": 0.4948, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.643280029296875, + "rewards/margins": 1.25323486328125, + "rewards/rejected": -0.6105712652206421, + "step": 2400 + }, + { + "epoch": 0.7757302647461173, + "grad_norm": 57.358024834424235, + "learning_rate": 8.061010946555055e-07, + "logits/chosen": 0.7000182867050171, + "logits/rejected": 0.6544189453125, + "logps/chosen": -406.70001220703125, + "logps/rejected": -376.88751220703125, + "loss": 0.5859, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3321899473667145, + "rewards/margins": 0.967120349407196, + "rewards/rejected": -0.635235607624054, + "step": 2410 + }, + { + "epoch": 0.7789490625251468, + "grad_norm": 157.0082464966, + "learning_rate": 8.05296200901481e-07, + "logits/chosen": 0.638256847858429, + "logits/rejected": 0.6401306390762329, + "logps/chosen": -362.07501220703125, + "logps/rejected": -342.1499938964844, + "loss": 0.627, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.24470214545726776, + "rewards/margins": 0.841278076171875, + "rewards/rejected": -0.595843493938446, + "step": 2420 + }, + { + "epoch": 0.7821678603041764, + "grad_norm": 100.9766318810361, + "learning_rate": 8.044913071474565e-07, + "logits/chosen": 0.602795422077179, + "logits/rejected": 0.7162231206893921, + "logps/chosen": -431.04998779296875, + "logps/rejected": -399.54998779296875, + "loss": 0.5842, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.37529295682907104, + "rewards/margins": 1.05279541015625, + "rewards/rejected": -0.6774841547012329, + "step": 2430 + }, + { + "epoch": 0.785386658083206, + "grad_norm": 122.38824085100337, + "learning_rate": 8.036864133934321e-07, + "logits/chosen": 0.786572277545929, + "logits/rejected": 0.754046618938446, + "logps/chosen": -410.20001220703125, + "logps/rejected": -389.2749938964844, + "loss": 0.5046, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.45271605253219604, + "rewards/margins": 1.1660888195037842, + "rewards/rejected": -0.7139892578125, + "step": 2440 + }, + { + "epoch": 0.7886054558622354, + "grad_norm": 125.05609145900158, + "learning_rate": 8.028815196394075e-07, + "logits/chosen": 0.5677734613418579, + "logits/rejected": 0.5177032351493835, + "logps/chosen": -378.67498779296875, + "logps/rejected": -335.5, + "loss": 0.5803, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18701782822608948, + "rewards/margins": 0.919909656047821, + "rewards/rejected": -0.7322174310684204, + "step": 2450 + }, + { + "epoch": 0.791824253641265, + "grad_norm": 97.49691634742487, + "learning_rate": 8.02076625885383e-07, + "logits/chosen": 0.5220855474472046, + "logits/rejected": 0.5008910894393921, + "logps/chosen": -412.07501220703125, + "logps/rejected": -352.45001220703125, + "loss": 0.5928, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.39851683378219604, + "rewards/margins": 0.9180358648300171, + "rewards/rejected": -0.520281970500946, + "step": 2460 + }, + { + "epoch": 0.7950430514202945, + "grad_norm": 116.0252594281096, + "learning_rate": 8.012717321313587e-07, + "logits/chosen": 0.764904797077179, + "logits/rejected": 0.69549560546875, + "logps/chosen": -372.51251220703125, + "logps/rejected": -357.11248779296875, + "loss": 0.6373, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.540209949016571, + "rewards/margins": 0.8125854730606079, + "rewards/rejected": -0.27232664823532104, + "step": 2470 + }, + { + "epoch": 0.7982618491993241, + "grad_norm": 120.29925344421709, + "learning_rate": 8.004668383773342e-07, + "logits/chosen": 0.60498046875, + "logits/rejected": 0.5638183355331421, + "logps/chosen": -405.42498779296875, + "logps/rejected": -375.70001220703125, + "loss": 0.6324, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.590747058391571, + "rewards/margins": 0.982495129108429, + "rewards/rejected": -0.39039307832717896, + "step": 2480 + }, + { + "epoch": 0.8014806469783536, + "grad_norm": 106.10467242228678, + "learning_rate": 7.996619446233097e-07, + "logits/chosen": 0.629150390625, + "logits/rejected": 0.649584949016571, + "logps/chosen": -387.4750061035156, + "logps/rejected": -343.92498779296875, + "loss": 0.5933, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.575244128704071, + "rewards/margins": 0.874194324016571, + "rewards/rejected": -0.29963380098342896, + "step": 2490 + }, + { + "epoch": 0.8046994447573831, + "grad_norm": 137.70540267886204, + "learning_rate": 7.988570508692853e-07, + "logits/chosen": 0.762921154499054, + "logits/rejected": 0.7430664300918579, + "logps/chosen": -375.95001220703125, + "logps/rejected": -367.11248779296875, + "loss": 0.6815, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.46174317598342896, + "rewards/margins": 0.802050769329071, + "rewards/rejected": -0.34083253145217896, + "step": 2500 + }, + { + "epoch": 0.8079182425364126, + "grad_norm": 154.92151126067884, + "learning_rate": 7.980521571152608e-07, + "logits/chosen": 0.705737292766571, + "logits/rejected": 0.7157958745956421, + "logps/chosen": -376.95001220703125, + "logps/rejected": -359.07501220703125, + "loss": 0.5852, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3886474668979645, + "rewards/margins": 1.0194091796875, + "rewards/rejected": -0.6309417486190796, + "step": 2510 + }, + { + "epoch": 0.8111370403154422, + "grad_norm": 140.53543291999313, + "learning_rate": 7.972472633612363e-07, + "logits/chosen": 0.7715209722518921, + "logits/rejected": 0.7328583002090454, + "logps/chosen": -440.95001220703125, + "logps/rejected": -382.1499938964844, + "loss": 0.5714, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.28432005643844604, + "rewards/margins": 0.987231433391571, + "rewards/rejected": -0.702862560749054, + "step": 2520 + }, + { + "epoch": 0.8143558380944718, + "grad_norm": 87.69328829605095, + "learning_rate": 7.964423696072117e-07, + "logits/chosen": 0.654620349407196, + "logits/rejected": 0.590899646282196, + "logps/chosen": -402.73748779296875, + "logps/rejected": -363.29998779296875, + "loss": 0.4791, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.410797119140625, + "rewards/margins": 1.0967285633087158, + "rewards/rejected": -0.68621826171875, + "step": 2530 + }, + { + "epoch": 0.8175746358735012, + "grad_norm": 142.06067236425147, + "learning_rate": 7.956374758531873e-07, + "logits/chosen": 0.676647961139679, + "logits/rejected": 0.577685534954071, + "logps/chosen": -385.6625061035156, + "logps/rejected": -354.70001220703125, + "loss": 0.5757, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4843200743198395, + "rewards/margins": 0.958203136920929, + "rewards/rejected": -0.47288817167282104, + "step": 2540 + }, + { + "epoch": 0.8207934336525308, + "grad_norm": 141.42795346780028, + "learning_rate": 7.948325820991629e-07, + "logits/chosen": 0.69647216796875, + "logits/rejected": 0.6935974359512329, + "logps/chosen": -425.11248779296875, + "logps/rejected": -386.79998779296875, + "loss": 0.5355, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.382781982421875, + "rewards/margins": 0.850903332233429, + "rewards/rejected": -0.46882933378219604, + "step": 2550 + }, + { + "epoch": 0.8240122314315603, + "grad_norm": 87.88819377767851, + "learning_rate": 7.940276883451384e-07, + "logits/chosen": 0.7207397222518921, + "logits/rejected": 0.698089599609375, + "logps/chosen": -426.9750061035156, + "logps/rejected": -394.9750061035156, + "loss": 0.5289, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.667614758014679, + "rewards/margins": 1.089257836341858, + "rewards/rejected": -0.4221435487270355, + "step": 2560 + }, + { + "epoch": 0.8272310292105899, + "grad_norm": 116.03305161225096, + "learning_rate": 7.93222794591114e-07, + "logits/chosen": 0.7443786859512329, + "logits/rejected": 0.6790771484375, + "logps/chosen": -400.1875, + "logps/rejected": -359.79998779296875, + "loss": 0.6098, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5395644903182983, + "rewards/margins": 0.869030773639679, + "rewards/rejected": -0.3291076719760895, + "step": 2570 + }, + { + "epoch": 0.8304498269896193, + "grad_norm": 117.99463222292366, + "learning_rate": 7.924179008370895e-07, + "logits/chosen": 0.868237316608429, + "logits/rejected": 0.765271008014679, + "logps/chosen": -404.8500061035156, + "logps/rejected": -358.92498779296875, + "loss": 0.6726, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.779553234577179, + "rewards/margins": 0.7366698980331421, + "rewards/rejected": 0.04359130933880806, + "step": 2580 + }, + { + "epoch": 0.8336686247686489, + "grad_norm": 137.4550805821355, + "learning_rate": 7.91613007083065e-07, + "logits/chosen": 0.7901245355606079, + "logits/rejected": 0.7495483160018921, + "logps/chosen": -407.21875, + "logps/rejected": -378.17498779296875, + "loss": 0.6141, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8029114007949829, + "rewards/margins": 0.794726550579071, + "rewards/rejected": 0.00826415978372097, + "step": 2590 + }, + { + "epoch": 0.8368874225476785, + "grad_norm": 114.76205425360803, + "learning_rate": 7.908081133290405e-07, + "logits/chosen": 0.845947265625, + "logits/rejected": 0.8078368902206421, + "logps/chosen": -374.88751220703125, + "logps/rejected": -361.32501220703125, + "loss": 0.5911, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7933059930801392, + "rewards/margins": 1.0328247547149658, + "rewards/rejected": -0.23976440727710724, + "step": 2600 + }, + { + "epoch": 0.840106220326708, + "grad_norm": 83.12300150325899, + "learning_rate": 7.90003219575016e-07, + "logits/chosen": 0.935961902141571, + "logits/rejected": 0.828173816204071, + "logps/chosen": -392.54998779296875, + "logps/rejected": -364.5249938964844, + "loss": 0.5408, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.9313415288925171, + "rewards/margins": 1.14141845703125, + "rewards/rejected": -0.21056517958641052, + "step": 2610 + }, + { + "epoch": 0.8433250181057375, + "grad_norm": 83.17168607263459, + "learning_rate": 7.891983258209915e-07, + "logits/chosen": 0.6249939203262329, + "logits/rejected": 0.607189953327179, + "logps/chosen": -411.11248779296875, + "logps/rejected": -375.6000061035156, + "loss": 0.6066, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.5674804449081421, + "rewards/margins": 0.8819214105606079, + "rewards/rejected": -0.3146118223667145, + "step": 2620 + }, + { + "epoch": 0.846543815884767, + "grad_norm": 105.5690124024398, + "learning_rate": 7.883934320669672e-07, + "logits/chosen": 0.6437225341796875, + "logits/rejected": 0.63037109375, + "logps/chosen": -396.8500061035156, + "logps/rejected": -376.63751220703125, + "loss": 0.6161, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.45338135957717896, + "rewards/margins": 0.9397827386856079, + "rewards/rejected": -0.4858642518520355, + "step": 2630 + }, + { + "epoch": 0.8497626136637966, + "grad_norm": 108.88330225179276, + "learning_rate": 7.875885383129427e-07, + "logits/chosen": 0.6846710443496704, + "logits/rejected": 0.6206725835800171, + "logps/chosen": -415.7250061035156, + "logps/rejected": -378.04998779296875, + "loss": 0.627, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.18450012803077698, + "rewards/margins": 0.913073718547821, + "rewards/rejected": -0.7281860113143921, + "step": 2640 + }, + { + "epoch": 0.8529814114428261, + "grad_norm": 116.75822970063682, + "learning_rate": 7.867836445589182e-07, + "logits/chosen": 0.623828113079071, + "logits/rejected": 0.6446288824081421, + "logps/chosen": -415.0, + "logps/rejected": -393.5, + "loss": 0.5075, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3712219297885895, + "rewards/margins": 1.107507348060608, + "rewards/rejected": -0.736499011516571, + "step": 2650 + }, + { + "epoch": 0.8562002092218557, + "grad_norm": 92.07500124375007, + "learning_rate": 7.859787508048938e-07, + "logits/chosen": 0.768170177936554, + "logits/rejected": 0.7117675542831421, + "logps/chosen": -360.4750061035156, + "logps/rejected": -326.92498779296875, + "loss": 0.5236, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.40510255098342896, + "rewards/margins": 1.0669372081756592, + "rewards/rejected": -0.661376953125, + "step": 2660 + }, + { + "epoch": 0.8594190070008851, + "grad_norm": 155.27541085926134, + "learning_rate": 7.851738570508692e-07, + "logits/chosen": 0.588940441608429, + "logits/rejected": 0.5390380620956421, + "logps/chosen": -431.23748779296875, + "logps/rejected": -384.23748779296875, + "loss": 0.6621, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.17006225883960724, + "rewards/margins": 0.947949230670929, + "rewards/rejected": -0.7779906988143921, + "step": 2670 + }, + { + "epoch": 0.8626378047799147, + "grad_norm": 100.19680079030283, + "learning_rate": 7.843689632968447e-07, + "logits/chosen": 0.5938965082168579, + "logits/rejected": 0.508227527141571, + "logps/chosen": -387.6625061035156, + "logps/rejected": -358.5, + "loss": 0.5511, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.21757812798023224, + "rewards/margins": 0.998095691204071, + "rewards/rejected": -0.780072033405304, + "step": 2680 + }, + { + "epoch": 0.8658566025589443, + "grad_norm": 175.6147644318954, + "learning_rate": 7.835640695428203e-07, + "logits/chosen": 0.6938751339912415, + "logits/rejected": 0.6889923214912415, + "logps/chosen": -390.2749938964844, + "logps/rejected": -362.2250061035156, + "loss": 0.5627, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.3887390196323395, + "rewards/margins": 0.981274425983429, + "rewards/rejected": -0.5928589105606079, + "step": 2690 + }, + { + "epoch": 0.8690754003379738, + "grad_norm": 129.6527918520003, + "learning_rate": 7.827591757887958e-07, + "logits/chosen": 0.6568359136581421, + "logits/rejected": 0.704577624797821, + "logps/chosen": -382.625, + "logps/rejected": -351.125, + "loss": 0.6383, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.39868468046188354, + "rewards/margins": 0.902099609375, + "rewards/rejected": -0.5039917230606079, + "step": 2700 + }, + { + "epoch": 0.8722941981170033, + "grad_norm": 99.84241876931934, + "learning_rate": 7.819542820347714e-07, + "logits/chosen": 0.766979992389679, + "logits/rejected": 0.6591247320175171, + "logps/chosen": -416.45001220703125, + "logps/rejected": -349.4624938964844, + "loss": 0.6343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6684020757675171, + "rewards/margins": 0.918566882610321, + "rewards/rejected": -0.25053101778030396, + "step": 2710 + }, + { + "epoch": 0.8755129958960328, + "grad_norm": 125.4509147850626, + "learning_rate": 7.81149388280747e-07, + "logits/chosen": 0.6149749755859375, + "logits/rejected": 0.653820812702179, + "logps/chosen": -353.375, + "logps/rejected": -354.0249938964844, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.29735106229782104, + "rewards/margins": 0.749682605266571, + "rewards/rejected": -0.4523681700229645, + "step": 2720 + }, + { + "epoch": 0.8787317936750624, + "grad_norm": 124.56833366289271, + "learning_rate": 7.803444945267225e-07, + "logits/chosen": 0.647564709186554, + "logits/rejected": 0.6595184206962585, + "logps/chosen": -385.63751220703125, + "logps/rejected": -362.7250061035156, + "loss": 0.5423, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.17413941025733948, + "rewards/margins": 1.0076904296875, + "rewards/rejected": -0.833691418170929, + "step": 2730 + }, + { + "epoch": 0.8819505914540919, + "grad_norm": 54.368298533744465, + "learning_rate": 7.79539600772698e-07, + "logits/chosen": 0.5551326870918274, + "logits/rejected": 0.588696300983429, + "logps/chosen": -410.2250061035156, + "logps/rejected": -373.5249938964844, + "loss": 0.503, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.16251221299171448, + "rewards/margins": 1.174353003501892, + "rewards/rejected": -1.01129150390625, + "step": 2740 + }, + { + "epoch": 0.8851693892331214, + "grad_norm": 104.28551295026791, + "learning_rate": 7.787347070186735e-07, + "logits/chosen": 0.621533215045929, + "logits/rejected": 0.6509948968887329, + "logps/chosen": -411.1625061035156, + "logps/rejected": -383.3999938964844, + "loss": 0.5109, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.219085693359375, + "rewards/margins": 1.3863646984100342, + "rewards/rejected": -1.167108178138733, + "step": 2750 + }, + { + "epoch": 0.8883881870121509, + "grad_norm": 118.65826032599489, + "learning_rate": 7.77929813264649e-07, + "logits/chosen": 0.63555908203125, + "logits/rejected": 0.589160144329071, + "logps/chosen": -394.17498779296875, + "logps/rejected": -408.04998779296875, + "loss": 0.6862, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13861083984375, + "rewards/margins": 0.7714599370956421, + "rewards/rejected": -0.6317535638809204, + "step": 2760 + }, + { + "epoch": 0.8916069847911805, + "grad_norm": 162.8744244597116, + "learning_rate": 7.771249195106245e-07, + "logits/chosen": 0.5758422613143921, + "logits/rejected": 0.6329711675643921, + "logps/chosen": -409.3500061035156, + "logps/rejected": -381.29998779296875, + "loss": 0.4875, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.20587158203125, + "rewards/margins": 1.1496124267578125, + "rewards/rejected": -0.943310558795929, + "step": 2770 + }, + { + "epoch": 0.8948257825702101, + "grad_norm": 189.30912339577455, + "learning_rate": 7.763200257566001e-07, + "logits/chosen": 0.6514831781387329, + "logits/rejected": 0.5666137933731079, + "logps/chosen": -437.875, + "logps/rejected": -389.75, + "loss": 0.6342, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2760376036167145, + "rewards/margins": 0.987060546875, + "rewards/rejected": -0.7111145257949829, + "step": 2780 + }, + { + "epoch": 0.8980445803492395, + "grad_norm": 141.39332280940636, + "learning_rate": 7.755151320025757e-07, + "logits/chosen": 0.868518054485321, + "logits/rejected": 0.747210681438446, + "logps/chosen": -402.7250061035156, + "logps/rejected": -346.3500061035156, + "loss": 0.6062, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3653198182582855, + "rewards/margins": 1.1757323741912842, + "rewards/rejected": -0.8109496831893921, + "step": 2790 + }, + { + "epoch": 0.9012633781282691, + "grad_norm": 131.03311804779472, + "learning_rate": 7.747102382485512e-07, + "logits/chosen": 0.8633056879043579, + "logits/rejected": 0.86627197265625, + "logps/chosen": -370.92498779296875, + "logps/rejected": -324.6000061035156, + "loss": 0.4996, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.669323742389679, + "rewards/margins": 1.1739501953125, + "rewards/rejected": -0.5026305913925171, + "step": 2800 + }, + { + "epoch": 0.9044821759072986, + "grad_norm": 97.35815192735019, + "learning_rate": 7.739053444945268e-07, + "logits/chosen": 0.78564453125, + "logits/rejected": 0.7244888544082642, + "logps/chosen": -440.5249938964844, + "logps/rejected": -383.7250061035156, + "loss": 0.6068, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.8139892816543579, + "rewards/margins": 1.2917969226837158, + "rewards/rejected": -0.47882080078125, + "step": 2810 + }, + { + "epoch": 0.9077009736863282, + "grad_norm": 119.54054076680652, + "learning_rate": 7.731004507405022e-07, + "logits/chosen": 0.788952648639679, + "logits/rejected": 0.738598644733429, + "logps/chosen": -417.17498779296875, + "logps/rejected": -403.32501220703125, + "loss": 0.6626, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.38636475801467896, + "rewards/margins": 0.989178478717804, + "rewards/rejected": -0.6030517816543579, + "step": 2820 + }, + { + "epoch": 0.9109197714653577, + "grad_norm": 100.5647011843041, + "learning_rate": 7.722955569864777e-07, + "logits/chosen": 0.6720215082168579, + "logits/rejected": 0.5923736691474915, + "logps/chosen": -349.1875, + "logps/rejected": -307.6000061035156, + "loss": 0.4977, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.56744384765625, + "rewards/margins": 1.2391846179962158, + "rewards/rejected": -0.671740710735321, + "step": 2830 + }, + { + "epoch": 0.9141385692443872, + "grad_norm": 138.77145205325968, + "learning_rate": 7.714906632324532e-07, + "logits/chosen": 0.573071300983429, + "logits/rejected": 0.5213562250137329, + "logps/chosen": -466.125, + "logps/rejected": -409.0249938964844, + "loss": 0.632, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.17985840141773224, + "rewards/margins": 1.0274658203125, + "rewards/rejected": -0.8471130132675171, + "step": 2840 + }, + { + "epoch": 0.9173573670234167, + "grad_norm": 123.85701614188429, + "learning_rate": 7.706857694784288e-07, + "logits/chosen": 0.577685534954071, + "logits/rejected": 0.5956786870956421, + "logps/chosen": -398.0375061035156, + "logps/rejected": -362.32501220703125, + "loss": 0.5515, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.2716308534145355, + "rewards/margins": 1.0491943359375, + "rewards/rejected": -0.7783142328262329, + "step": 2850 + }, + { + "epoch": 0.9205761648024463, + "grad_norm": 129.4079292247724, + "learning_rate": 7.698808757244043e-07, + "logits/chosen": 0.638232409954071, + "logits/rejected": 0.5386596918106079, + "logps/chosen": -396.45001220703125, + "logps/rejected": -346.1499938964844, + "loss": 0.5287, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.31873780488967896, + "rewards/margins": 1.002050757408142, + "rewards/rejected": -0.6827392578125, + "step": 2860 + }, + { + "epoch": 0.9237949625814759, + "grad_norm": 99.61426295592578, + "learning_rate": 7.690759819703799e-07, + "logits/chosen": 0.693310558795929, + "logits/rejected": 0.6779540777206421, + "logps/chosen": -415.375, + "logps/rejected": -380.625, + "loss": 0.5285, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6110442876815796, + "rewards/margins": 1.1763122081756592, + "rewards/rejected": -0.56494140625, + "step": 2870 + }, + { + "epoch": 0.9270137603605053, + "grad_norm": 165.41484450906222, + "learning_rate": 7.682710882163555e-07, + "logits/chosen": 0.701855480670929, + "logits/rejected": 0.6913818120956421, + "logps/chosen": -431.7749938964844, + "logps/rejected": -373.5, + "loss": 0.6182, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.637158215045929, + "rewards/margins": 0.968554675579071, + "rewards/rejected": -0.33180540800094604, + "step": 2880 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 159.09546636816913, + "learning_rate": 7.67466194462331e-07, + "logits/chosen": 0.692675769329071, + "logits/rejected": 0.7020263671875, + "logps/chosen": -387.125, + "logps/rejected": -388.82501220703125, + "loss": 0.7077, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.3422042727470398, + "rewards/margins": 0.740478515625, + "rewards/rejected": -0.397705078125, + "step": 2890 + }, + { + "epoch": 0.9334513559185644, + "grad_norm": 110.50236393025179, + "learning_rate": 7.666613007083064e-07, + "logits/chosen": 0.79736328125, + "logits/rejected": 0.742382824420929, + "logps/chosen": -420.7749938964844, + "logps/rejected": -382.67498779296875, + "loss": 0.5424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.50384521484375, + "rewards/margins": 1.1374390125274658, + "rewards/rejected": -0.6332947015762329, + "step": 2900 + }, + { + "epoch": 0.936670153697594, + "grad_norm": 140.4183011082284, + "learning_rate": 7.65856406954282e-07, + "logits/chosen": 0.74505615234375, + "logits/rejected": 0.716064453125, + "logps/chosen": -369.6000061035156, + "logps/rejected": -332.9750061035156, + "loss": 0.52, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5161193609237671, + "rewards/margins": 1.155542016029358, + "rewards/rejected": -0.639678955078125, + "step": 2910 + }, + { + "epoch": 0.9398889514766234, + "grad_norm": 167.65202133924117, + "learning_rate": 7.650515132002575e-07, + "logits/chosen": 0.763110339641571, + "logits/rejected": 0.714770495891571, + "logps/chosen": -383.6000061035156, + "logps/rejected": -362.2749938964844, + "loss": 0.6019, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.38534241914749146, + "rewards/margins": 0.937121570110321, + "rewards/rejected": -0.55120849609375, + "step": 2920 + }, + { + "epoch": 0.943107749255653, + "grad_norm": 114.10050648788618, + "learning_rate": 7.64246619446233e-07, + "logits/chosen": 0.7569824457168579, + "logits/rejected": 0.733654797077179, + "logps/chosen": -380.26251220703125, + "logps/rejected": -370.7749938964844, + "loss": 0.6743, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.28826904296875, + "rewards/margins": 0.8707031011581421, + "rewards/rejected": -0.582489013671875, + "step": 2930 + }, + { + "epoch": 0.9463265470346826, + "grad_norm": 100.10463782375413, + "learning_rate": 7.634417256922086e-07, + "logits/chosen": 0.95343017578125, + "logits/rejected": 0.8630005121231079, + "logps/chosen": -434.20001220703125, + "logps/rejected": -382.125, + "loss": 0.5998, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.596630871295929, + "rewards/margins": 0.953015148639679, + "rewards/rejected": -0.35606080293655396, + "step": 2940 + }, + { + "epoch": 0.9495453448137121, + "grad_norm": 138.38013673669857, + "learning_rate": 7.626368319381842e-07, + "logits/chosen": 0.929736316204071, + "logits/rejected": 0.919995129108429, + "logps/chosen": -378.42498779296875, + "logps/rejected": -395.7875061035156, + "loss": 0.6401, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.972582995891571, + "rewards/margins": 1.0489013195037842, + "rewards/rejected": -0.07711486518383026, + "step": 2950 + }, + { + "epoch": 0.9527641425927416, + "grad_norm": 99.74892133676668, + "learning_rate": 7.618319381841597e-07, + "logits/chosen": 0.803417980670929, + "logits/rejected": 0.864916980266571, + "logps/chosen": -398.7250061035156, + "logps/rejected": -349.875, + "loss": 0.5414, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.1163330078125, + "rewards/margins": 0.986279308795929, + "rewards/rejected": 0.13035888969898224, + "step": 2960 + }, + { + "epoch": 0.9559829403717711, + "grad_norm": 114.21634841638566, + "learning_rate": 7.610270444301352e-07, + "logits/chosen": 0.831768810749054, + "logits/rejected": 0.829052746295929, + "logps/chosen": -425.2250061035156, + "logps/rejected": -406.2749938964844, + "loss": 0.6043, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.913586437702179, + "rewards/margins": 0.871691882610321, + "rewards/rejected": 0.04148559644818306, + "step": 2970 + }, + { + "epoch": 0.9592017381508007, + "grad_norm": 91.37934434512265, + "learning_rate": 7.602221506761107e-07, + "logits/chosen": 0.8346923589706421, + "logits/rejected": 0.762371838092804, + "logps/chosen": -455.3999938964844, + "logps/rejected": -402.29998779296875, + "loss": 0.5922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.02862548828125, + "rewards/margins": 0.9399780035018921, + "rewards/rejected": 0.088287353515625, + "step": 2980 + }, + { + "epoch": 0.9624205359298302, + "grad_norm": 120.68708564964888, + "learning_rate": 7.594172569220862e-07, + "logits/chosen": 0.900341808795929, + "logits/rejected": 0.783905029296875, + "logps/chosen": -368.7250061035156, + "logps/rejected": -360.8999938964844, + "loss": 0.58, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.925262451171875, + "rewards/margins": 0.94049072265625, + "rewards/rejected": -0.01450195349752903, + "step": 2990 + }, + { + "epoch": 0.9656393337088598, + "grad_norm": 94.91348781201353, + "learning_rate": 7.586123631680618e-07, + "logits/chosen": 0.8617309331893921, + "logits/rejected": 0.8137878179550171, + "logps/chosen": -469.70001220703125, + "logps/rejected": -390.7250061035156, + "loss": 0.6396, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.967175304889679, + "rewards/margins": 1.0395996570587158, + "rewards/rejected": -0.07411499321460724, + "step": 3000 + }, + { + "epoch": 0.9688581314878892, + "grad_norm": 67.77708434943798, + "learning_rate": 7.578074694140373e-07, + "logits/chosen": 0.718432605266571, + "logits/rejected": 0.561572253704071, + "logps/chosen": -409.29998779296875, + "logps/rejected": -351.5249938964844, + "loss": 0.4773, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.9033569097518921, + "rewards/margins": 1.32513427734375, + "rewards/rejected": -0.42182618379592896, + "step": 3010 + }, + { + "epoch": 0.9720769292669188, + "grad_norm": 119.14480547760023, + "learning_rate": 7.570025756600128e-07, + "logits/chosen": 0.666918933391571, + "logits/rejected": 0.592669665813446, + "logps/chosen": -414.625, + "logps/rejected": -375.79998779296875, + "loss": 0.5983, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.40214842557907104, + "rewards/margins": 1.183007836341858, + "rewards/rejected": -0.7820190191268921, + "step": 3020 + }, + { + "epoch": 0.9752957270459484, + "grad_norm": 122.80019562989403, + "learning_rate": 7.561976819059885e-07, + "logits/chosen": 0.6526123285293579, + "logits/rejected": 0.47962647676467896, + "logps/chosen": -459.375, + "logps/rejected": -409.5, + "loss": 0.6493, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.15074463188648224, + "rewards/margins": 1.0448272228240967, + "rewards/rejected": -0.89373779296875, + "step": 3030 + }, + { + "epoch": 0.9785145248249779, + "grad_norm": 70.79728027641642, + "learning_rate": 7.55392788151964e-07, + "logits/chosen": 0.670764148235321, + "logits/rejected": 0.6051269769668579, + "logps/chosen": -348.04998779296875, + "logps/rejected": -331.38751220703125, + "loss": 0.6006, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.26850587129592896, + "rewards/margins": 0.917755126953125, + "rewards/rejected": -0.6485229730606079, + "step": 3040 + }, + { + "epoch": 0.9817333226040074, + "grad_norm": 83.80129263922844, + "learning_rate": 7.545878943979394e-07, + "logits/chosen": 0.6674133539199829, + "logits/rejected": 0.6324828863143921, + "logps/chosen": -391.1875, + "logps/rejected": -347.9750061035156, + "loss": 0.5723, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3601318299770355, + "rewards/margins": 1.021520972251892, + "rewards/rejected": -0.66119384765625, + "step": 3050 + }, + { + "epoch": 0.9849521203830369, + "grad_norm": 124.31923691359488, + "learning_rate": 7.53783000643915e-07, + "logits/chosen": 0.6371444463729858, + "logits/rejected": 0.6250854730606079, + "logps/chosen": -386.0, + "logps/rejected": -385.29998779296875, + "loss": 0.6225, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.33576661348342896, + "rewards/margins": 0.867236316204071, + "rewards/rejected": -0.5317016839981079, + "step": 3060 + }, + { + "epoch": 0.9881709181620665, + "grad_norm": 110.44076202278397, + "learning_rate": 7.529781068898905e-07, + "logits/chosen": 0.667736828327179, + "logits/rejected": 0.6272827386856079, + "logps/chosen": -397.1499938964844, + "logps/rejected": -375.6625061035156, + "loss": 0.6277, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.29168701171875, + "rewards/margins": 0.8900390863418579, + "rewards/rejected": -0.597644031047821, + "step": 3070 + }, + { + "epoch": 0.991389715941096, + "grad_norm": 103.92991045584165, + "learning_rate": 7.52173213135866e-07, + "logits/chosen": 0.7188385128974915, + "logits/rejected": 0.67706298828125, + "logps/chosen": -388.375, + "logps/rejected": -385.1499938964844, + "loss": 0.6018, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.27649229764938354, + "rewards/margins": 0.933056652545929, + "rewards/rejected": -0.656658947467804, + "step": 3080 + }, + { + "epoch": 0.9946085137201255, + "grad_norm": 131.1654795362156, + "learning_rate": 7.513683193818416e-07, + "logits/chosen": 0.7007385492324829, + "logits/rejected": 0.649017333984375, + "logps/chosen": -399.57501220703125, + "logps/rejected": -349.0, + "loss": 0.6371, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.15358886122703552, + "rewards/margins": 0.9484008550643921, + "rewards/rejected": -0.794604480266571, + "step": 3090 + }, + { + "epoch": 0.997827311499155, + "grad_norm": 128.84788460925583, + "learning_rate": 7.505634256278171e-07, + "logits/chosen": 0.8606201410293579, + "logits/rejected": 0.8363891839981079, + "logps/chosen": -408.57501220703125, + "logps/rejected": -376.29998779296875, + "loss": 0.5151, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.39683836698532104, + "rewards/margins": 1.0883057117462158, + "rewards/rejected": -0.6910369992256165, + "step": 3100 + }, + { + "epoch": 1.000965639333709, + "grad_norm": 15.129215792779695, + "learning_rate": 7.497585318737927e-07, + "logits/chosen": 0.6515800356864929, + "logits/rejected": 0.6195036768913269, + "logps/chosen": -431.20513916015625, + "logps/rejected": -394.025634765625, + "loss": 0.3958, + "rewards/accuracies": 0.8205128312110901, + "rewards/chosen": 0.9037710428237915, + "rewards/margins": 1.9726186990737915, + "rewards/rejected": -1.0682278871536255, + "step": 3110 + }, + { + "epoch": 1.0041844371127384, + "grad_norm": 19.34652470902673, + "learning_rate": 7.489536381197682e-07, + "logits/chosen": 0.644604504108429, + "logits/rejected": 0.6614013910293579, + "logps/chosen": -429.3500061035156, + "logps/rejected": -433.0625, + "loss": 0.0818, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.6704590320587158, + "rewards/margins": 4.304491996765137, + "rewards/rejected": -2.633007764816284, + "step": 3120 + }, + { + "epoch": 1.0074032348917679, + "grad_norm": 36.052070663694565, + "learning_rate": 7.481487443657437e-07, + "logits/chosen": 0.588366687297821, + "logits/rejected": 0.4492248594760895, + "logps/chosen": -444.7124938964844, + "logps/rejected": -445.57501220703125, + "loss": 0.0695, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.678259253501892, + "rewards/margins": 4.659960746765137, + "rewards/rejected": -2.982714891433716, + "step": 3130 + }, + { + "epoch": 1.0106220326707975, + "grad_norm": 15.036647494327315, + "learning_rate": 7.473438506117192e-07, + "logits/chosen": 0.4008239805698395, + "logits/rejected": 0.28107911348342896, + "logps/chosen": -439.25, + "logps/rejected": -403.4750061035156, + "loss": 0.069, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.430395483970642, + "rewards/margins": 4.466015815734863, + "rewards/rejected": -3.0365233421325684, + "step": 3140 + }, + { + "epoch": 1.013840830449827, + "grad_norm": 16.892157445272016, + "learning_rate": 7.465389568576947e-07, + "logits/chosen": 0.37513428926467896, + "logits/rejected": 0.3186798095703125, + "logps/chosen": -404.45001220703125, + "logps/rejected": -409.67498779296875, + "loss": 0.0686, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4076049327850342, + "rewards/margins": 4.510937690734863, + "rewards/rejected": -3.1004881858825684, + "step": 3150 + }, + { + "epoch": 1.0170596282288564, + "grad_norm": 30.812104314035597, + "learning_rate": 7.457340631036703e-07, + "logits/chosen": 0.2608398497104645, + "logits/rejected": 0.2007904052734375, + "logps/chosen": -430.3999938964844, + "logps/rejected": -412.1000061035156, + "loss": 0.0912, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.5258910655975342, + "rewards/margins": 4.616015434265137, + "rewards/rejected": -3.0882811546325684, + "step": 3160 + }, + { + "epoch": 1.020278426007886, + "grad_norm": 30.586862098683138, + "learning_rate": 7.449291693496458e-07, + "logits/chosen": 0.3541412353515625, + "logits/rejected": 0.37103271484375, + "logps/chosen": -399.6000061035156, + "logps/rejected": -425.54998779296875, + "loss": 0.0723, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.404199242591858, + "rewards/margins": 4.628125190734863, + "rewards/rejected": -3.222460985183716, + "step": 3170 + }, + { + "epoch": 1.0234972237869155, + "grad_norm": 18.025624015066967, + "learning_rate": 7.441242755956213e-07, + "logits/chosen": 0.3204894959926605, + "logits/rejected": 0.226165771484375, + "logps/chosen": -420.2749938964844, + "logps/rejected": -416.4750061035156, + "loss": 0.0917, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1534423828125, + "rewards/margins": 4.714062690734863, + "rewards/rejected": -3.5595703125, + "step": 3180 + }, + { + "epoch": 1.0267160215659452, + "grad_norm": 27.705997350429936, + "learning_rate": 7.43319381841597e-07, + "logits/chosen": 0.42500001192092896, + "logits/rejected": 0.335418701171875, + "logps/chosen": -410.01251220703125, + "logps/rejected": -413.04998779296875, + "loss": 0.0703, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3693358898162842, + "rewards/margins": 4.854101657867432, + "rewards/rejected": -3.486132860183716, + "step": 3190 + }, + { + "epoch": 1.0299348193449747, + "grad_norm": 12.967461832797234, + "learning_rate": 7.425144880875724e-07, + "logits/chosen": 0.24075011909008026, + "logits/rejected": 0.170928955078125, + "logps/chosen": -381.3500061035156, + "logps/rejected": -405.1499938964844, + "loss": 0.1067, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.702221691608429, + "rewards/margins": 4.726171970367432, + "rewards/rejected": -4.022851467132568, + "step": 3200 + }, + { + "epoch": 1.033153617124004, + "grad_norm": 32.64004038754801, + "learning_rate": 7.417095943335479e-07, + "logits/chosen": 0.19301147758960724, + "logits/rejected": 0.11619262397289276, + "logps/chosen": -389.6499938964844, + "logps/rejected": -384.2749938964844, + "loss": 0.0987, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.5364013910293579, + "rewards/margins": 4.309765815734863, + "rewards/rejected": -3.772656202316284, + "step": 3210 + }, + { + "epoch": 1.0363724149030338, + "grad_norm": 22.507065837818462, + "learning_rate": 7.409047005795235e-07, + "logits/chosen": 0.32136839628219604, + "logits/rejected": 0.12032928317785263, + "logps/chosen": -449.38751220703125, + "logps/rejected": -385.1499938964844, + "loss": 0.0879, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9686816930770874, + "rewards/margins": 4.920312404632568, + "rewards/rejected": -3.9488282203674316, + "step": 3220 + }, + { + "epoch": 1.0395912126820632, + "grad_norm": 12.793512306863187, + "learning_rate": 7.40099806825499e-07, + "logits/chosen": 0.25418663024902344, + "logits/rejected": 0.13450317084789276, + "logps/chosen": -455.86248779296875, + "logps/rejected": -422.95001220703125, + "loss": 0.0871, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6170898675918579, + "rewards/margins": 4.71875, + "rewards/rejected": -4.104101657867432, + "step": 3230 + }, + { + "epoch": 1.0428100104610927, + "grad_norm": 28.013407344668803, + "learning_rate": 7.392949130714745e-07, + "logits/chosen": 0.3136230409145355, + "logits/rejected": 0.13762816786766052, + "logps/chosen": -406.88751220703125, + "logps/rejected": -375.63751220703125, + "loss": 0.1152, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8240722417831421, + "rewards/margins": 4.228662014007568, + "rewards/rejected": -3.4034180641174316, + "step": 3240 + }, + { + "epoch": 1.0460288082401223, + "grad_norm": 32.788356525781744, + "learning_rate": 7.384900193174501e-07, + "logits/chosen": 0.3187408447265625, + "logits/rejected": 0.2902465760707855, + "logps/chosen": -343.95001220703125, + "logps/rejected": -351.92498779296875, + "loss": 0.109, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.32828062772750854, + "rewards/margins": 4.512890815734863, + "rewards/rejected": -4.182812690734863, + "step": 3250 + }, + { + "epoch": 1.0492476060191518, + "grad_norm": 22.157564348487803, + "learning_rate": 7.376851255634255e-07, + "logits/chosen": 0.23244018852710724, + "logits/rejected": 0.06783447414636612, + "logps/chosen": -429.20001220703125, + "logps/rejected": -421.25, + "loss": 0.0779, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.6290038824081421, + "rewards/margins": 5.414843559265137, + "rewards/rejected": -4.789843559265137, + "step": 3260 + }, + { + "epoch": 1.0524664037981815, + "grad_norm": 9.514629480614875, + "learning_rate": 7.368802318094011e-07, + "logits/chosen": 0.14520263671875, + "logits/rejected": -0.04245300218462944, + "logps/chosen": -415.0874938964844, + "logps/rejected": -408.9750061035156, + "loss": 0.0777, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.2595458924770355, + "rewards/margins": 5.3046875, + "rewards/rejected": -5.044531345367432, + "step": 3270 + }, + { + "epoch": 1.055685201577211, + "grad_norm": 18.136884941483864, + "learning_rate": 7.360753380553767e-07, + "logits/chosen": 0.17172852158546448, + "logits/rejected": 0.03908691555261612, + "logps/chosen": -409.125, + "logps/rejected": -396.1499938964844, + "loss": 0.0979, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.33293455839157104, + "rewards/margins": 5.064453125, + "rewards/rejected": -4.733788967132568, + "step": 3280 + }, + { + "epoch": 1.0589039993562404, + "grad_norm": 59.58935650733624, + "learning_rate": 7.352704443013522e-07, + "logits/chosen": 0.22153624892234802, + "logits/rejected": 0.10827331244945526, + "logps/chosen": -447.3500061035156, + "logps/rejected": -477.2250061035156, + "loss": 0.0754, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.595782458782196, + "rewards/margins": 5.038281440734863, + "rewards/rejected": -4.446484565734863, + "step": 3290 + }, + { + "epoch": 1.06212279713527, + "grad_norm": 75.95632384392519, + "learning_rate": 7.344655505473277e-07, + "logits/chosen": 0.13720397651195526, + "logits/rejected": 0.0019104003440588713, + "logps/chosen": -428.95001220703125, + "logps/rejected": -422.5, + "loss": 0.0708, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.757080078125, + "rewards/margins": 5.119140625, + "rewards/rejected": -4.361328125, + "step": 3300 + }, + { + "epoch": 1.0653415949142995, + "grad_norm": 25.788571207375586, + "learning_rate": 7.336606567933033e-07, + "logits/chosen": 0.19261474907398224, + "logits/rejected": 0.02850341796875, + "logps/chosen": -445.3500061035156, + "logps/rejected": -415.875, + "loss": 0.0756, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.9813781976699829, + "rewards/margins": 4.972754001617432, + "rewards/rejected": -3.993359327316284, + "step": 3310 + }, + { + "epoch": 1.0685603926933291, + "grad_norm": 47.39325604870592, + "learning_rate": 7.328557630392788e-07, + "logits/chosen": 0.15581664443016052, + "logits/rejected": 0.07740478217601776, + "logps/chosen": -388.7124938964844, + "logps/rejected": -364.92498779296875, + "loss": 0.1057, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.6201232671737671, + "rewards/margins": 4.557812690734863, + "rewards/rejected": -3.937304735183716, + "step": 3320 + }, + { + "epoch": 1.0717791904723586, + "grad_norm": 47.270845385306, + "learning_rate": 7.320508692852542e-07, + "logits/chosen": 0.38020628690719604, + "logits/rejected": 0.2412261962890625, + "logps/chosen": -437.125, + "logps/rejected": -397.6499938964844, + "loss": 0.0748, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.767102062702179, + "rewards/margins": 4.846484184265137, + "rewards/rejected": -4.08203125, + "step": 3330 + }, + { + "epoch": 1.074997988251388, + "grad_norm": 6.117861215024379, + "learning_rate": 7.312459755312298e-07, + "logits/chosen": 0.3385376036167145, + "logits/rejected": 0.272796630859375, + "logps/chosen": -394.63751220703125, + "logps/rejected": -408.75, + "loss": 0.0855, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5574096441268921, + "rewards/margins": 4.791796684265137, + "rewards/rejected": -4.236523628234863, + "step": 3340 + }, + { + "epoch": 1.0782167860304177, + "grad_norm": 39.000725994251745, + "learning_rate": 7.304410817772054e-07, + "logits/chosen": 0.22906494140625, + "logits/rejected": 0.122467041015625, + "logps/chosen": -446.125, + "logps/rejected": -415.4750061035156, + "loss": 0.0829, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.537066638469696, + "rewards/margins": 4.8515625, + "rewards/rejected": -4.311718940734863, + "step": 3350 + }, + { + "epoch": 1.0814355838094472, + "grad_norm": 33.41197378858653, + "learning_rate": 7.296361880231809e-07, + "logits/chosen": 0.13576050102710724, + "logits/rejected": -0.07074890285730362, + "logps/chosen": -380.5249938964844, + "logps/rejected": -380.8500061035156, + "loss": 0.1071, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5933166742324829, + "rewards/margins": 4.507226467132568, + "rewards/rejected": -3.9136719703674316, + "step": 3360 + }, + { + "epoch": 1.0846543815884766, + "grad_norm": 27.186858888393, + "learning_rate": 7.288312942691565e-07, + "logits/chosen": 0.24695129692554474, + "logits/rejected": 0.08087768405675888, + "logps/chosen": -380.48748779296875, + "logps/rejected": -387.7749938964844, + "loss": 0.0863, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.5919128656387329, + "rewards/margins": 4.710546970367432, + "rewards/rejected": -4.1201171875, + "step": 3370 + }, + { + "epoch": 1.0878731793675063, + "grad_norm": 28.313145917743295, + "learning_rate": 7.28026400515132e-07, + "logits/chosen": 0.32980042695999146, + "logits/rejected": 0.12893065810203552, + "logps/chosen": -421.1000061035156, + "logps/rejected": -388.17498779296875, + "loss": 0.1019, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.862408459186554, + "rewards/margins": 4.722265720367432, + "rewards/rejected": -3.860546827316284, + "step": 3380 + }, + { + "epoch": 1.0910919771465357, + "grad_norm": 20.191339866963887, + "learning_rate": 7.272215067611075e-07, + "logits/chosen": 0.40865784883499146, + "logits/rejected": 0.34281617403030396, + "logps/chosen": -436.3500061035156, + "logps/rejected": -446.7875061035156, + "loss": 0.0899, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.1233336925506592, + "rewards/margins": 4.6455078125, + "rewards/rejected": -3.5223631858825684, + "step": 3390 + }, + { + "epoch": 1.0943107749255654, + "grad_norm": 11.867983892774165, + "learning_rate": 7.264166130070831e-07, + "logits/chosen": 0.24184875190258026, + "logits/rejected": 0.20336608588695526, + "logps/chosen": -369.8500061035156, + "logps/rejected": -403.6000061035156, + "loss": 0.0764, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.84942626953125, + "rewards/margins": 4.562695503234863, + "rewards/rejected": -3.7132811546325684, + "step": 3400 + }, + { + "epoch": 1.0975295727045948, + "grad_norm": 32.94760497923458, + "learning_rate": 7.256117192530585e-07, + "logits/chosen": 0.0287628173828125, + "logits/rejected": -0.00750732421875, + "logps/chosen": -393.3374938964844, + "logps/rejected": -380.73748779296875, + "loss": 0.0845, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4390625059604645, + "rewards/margins": 4.877734184265137, + "rewards/rejected": -4.4375, + "step": 3410 + }, + { + "epoch": 1.1007483704836243, + "grad_norm": 22.715160674898957, + "learning_rate": 7.24806825499034e-07, + "logits/chosen": 0.20836181938648224, + "logits/rejected": 0.11987914890050888, + "logps/chosen": -426.57501220703125, + "logps/rejected": -421.375, + "loss": 0.0815, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.523730456829071, + "rewards/margins": 4.774218559265137, + "rewards/rejected": -4.2490234375, + "step": 3420 + }, + { + "epoch": 1.103967168262654, + "grad_norm": 30.184289510452444, + "learning_rate": 7.240019317450097e-07, + "logits/chosen": 0.1925918608903885, + "logits/rejected": 0.10406494140625, + "logps/chosen": -376.75, + "logps/rejected": -374.8500061035156, + "loss": 0.1125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.6316894292831421, + "rewards/margins": 4.391406059265137, + "rewards/rejected": -3.7621092796325684, + "step": 3430 + }, + { + "epoch": 1.1071859660416834, + "grad_norm": 33.67294857492197, + "learning_rate": 7.231970379909852e-07, + "logits/chosen": 0.1532547026872635, + "logits/rejected": 0.0582275390625, + "logps/chosen": -421.17498779296875, + "logps/rejected": -409.75, + "loss": 0.0733, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.2260010242462158, + "rewards/margins": 5.127539157867432, + "rewards/rejected": -3.8990235328674316, + "step": 3440 + }, + { + "epoch": 1.1104047638207128, + "grad_norm": 39.54918551363361, + "learning_rate": 7.223921442369607e-07, + "logits/chosen": 0.13020019233226776, + "logits/rejected": 0.02757721021771431, + "logps/chosen": -412.5, + "logps/rejected": -423.125, + "loss": 0.0726, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.001623511314392, + "rewards/margins": 4.964453220367432, + "rewards/rejected": -3.9632811546325684, + "step": 3450 + }, + { + "epoch": 1.1136235615997425, + "grad_norm": 41.75172441631893, + "learning_rate": 7.215872504829363e-07, + "logits/chosen": 0.13712158799171448, + "logits/rejected": 0.08489990234375, + "logps/chosen": -375.95001220703125, + "logps/rejected": -387.7749938964844, + "loss": 0.1076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.66326904296875, + "rewards/margins": 4.557812690734863, + "rewards/rejected": -3.892773389816284, + "step": 3460 + }, + { + "epoch": 1.116842359378772, + "grad_norm": 9.22192199919613, + "learning_rate": 7.207823567289118e-07, + "logits/chosen": 0.14085082709789276, + "logits/rejected": -0.05171508714556694, + "logps/chosen": -402.9750061035156, + "logps/rejected": -366.45001220703125, + "loss": 0.0902, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.6869293451309204, + "rewards/margins": 4.472851753234863, + "rewards/rejected": -3.7865233421325684, + "step": 3470 + }, + { + "epoch": 1.1200611571578016, + "grad_norm": 7.507118635056902, + "learning_rate": 7.199774629748872e-07, + "logits/chosen": 0.10014037787914276, + "logits/rejected": 0.0872955322265625, + "logps/chosen": -418.92498779296875, + "logps/rejected": -423.8999938964844, + "loss": 0.081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8292480707168579, + "rewards/margins": 5.073828220367432, + "rewards/rejected": -4.245312690734863, + "step": 3480 + }, + { + "epoch": 1.123279954936831, + "grad_norm": 24.98696311365265, + "learning_rate": 7.191725692208627e-07, + "logits/chosen": 0.01920166052877903, + "logits/rejected": -0.09750060737133026, + "logps/chosen": -412.1000061035156, + "logps/rejected": -438.8999938964844, + "loss": 0.089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.5771850347518921, + "rewards/margins": 5.034765720367432, + "rewards/rejected": -4.4560546875, + "step": 3490 + }, + { + "epoch": 1.1264987527158605, + "grad_norm": 17.06223968024251, + "learning_rate": 7.183676754668383e-07, + "logits/chosen": 0.03824462741613388, + "logits/rejected": -0.04968871921300888, + "logps/chosen": -397.1000061035156, + "logps/rejected": -405.0, + "loss": 0.1086, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.4838623106479645, + "rewards/margins": 4.944921970367432, + "rewards/rejected": -4.462500095367432, + "step": 3500 + }, + { + "epoch": 1.1297175504948902, + "grad_norm": 31.375544754578033, + "learning_rate": 7.175627817128139e-07, + "logits/chosen": 0.06693115085363388, + "logits/rejected": 0.02343139611184597, + "logps/chosen": -374.57501220703125, + "logps/rejected": -408.4750061035156, + "loss": 0.0912, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.594696044921875, + "rewards/margins": 4.76171875, + "rewards/rejected": -4.166796684265137, + "step": 3510 + }, + { + "epoch": 1.1329363482739196, + "grad_norm": 23.17208593823621, + "learning_rate": 7.167578879587894e-07, + "logits/chosen": 0.05390777438879013, + "logits/rejected": 0.00994873046875, + "logps/chosen": -408.375, + "logps/rejected": -445.125, + "loss": 0.0865, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.688995361328125, + "rewards/margins": 5.103125095367432, + "rewards/rejected": -4.414843559265137, + "step": 3520 + }, + { + "epoch": 1.1361551460529493, + "grad_norm": 16.91159841329465, + "learning_rate": 7.15952994204765e-07, + "logits/chosen": -0.05025634914636612, + "logits/rejected": -0.1230621337890625, + "logps/chosen": -383.70001220703125, + "logps/rejected": -369.375, + "loss": 0.0977, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.385964959859848, + "rewards/margins": 4.631249904632568, + "rewards/rejected": -4.244531154632568, + "step": 3530 + }, + { + "epoch": 1.1393739438319788, + "grad_norm": 59.130511026803575, + "learning_rate": 7.151481004507405e-07, + "logits/chosen": 0.0324859619140625, + "logits/rejected": -0.08774413913488388, + "logps/chosen": -424.5375061035156, + "logps/rejected": -420.20001220703125, + "loss": 0.1144, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.3189453184604645, + "rewards/margins": 5.056250095367432, + "rewards/rejected": -4.736328125, + "step": 3540 + }, + { + "epoch": 1.1425927416110082, + "grad_norm": 37.66574279739906, + "learning_rate": 7.14343206696716e-07, + "logits/chosen": 0.08145751804113388, + "logits/rejected": 0.00274658203125, + "logps/chosen": -393.92498779296875, + "logps/rejected": -378.29998779296875, + "loss": 0.0941, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.104400634765625, + "rewards/margins": 4.596875190734863, + "rewards/rejected": -3.4906249046325684, + "step": 3550 + }, + { + "epoch": 1.1458115393900379, + "grad_norm": 30.578302750272893, + "learning_rate": 7.135383129426915e-07, + "logits/chosen": 0.24112090468406677, + "logits/rejected": 0.18911132216453552, + "logps/chosen": -366.25, + "logps/rejected": -362.2250061035156, + "loss": 0.1327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8590332269668579, + "rewards/margins": 4.348828315734863, + "rewards/rejected": -3.48876953125, + "step": 3560 + }, + { + "epoch": 1.1490303371690673, + "grad_norm": 28.137017059580987, + "learning_rate": 7.12733419188667e-07, + "logits/chosen": 0.232666015625, + "logits/rejected": 0.09614868462085724, + "logps/chosen": -410.07501220703125, + "logps/rejected": -393.7250061035156, + "loss": 0.0698, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2458007335662842, + "rewards/margins": 4.863476753234863, + "rewards/rejected": -3.6201171875, + "step": 3570 + }, + { + "epoch": 1.152249134948097, + "grad_norm": 39.957564814637145, + "learning_rate": 7.119285254346425e-07, + "logits/chosen": 0.26519775390625, + "logits/rejected": 0.27386170625686646, + "logps/chosen": -414.04376220703125, + "logps/rejected": -415.82501220703125, + "loss": 0.0715, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 1.0481078624725342, + "rewards/margins": 4.735156059265137, + "rewards/rejected": -3.686328172683716, + "step": 3580 + }, + { + "epoch": 1.1554679327271264, + "grad_norm": 24.30055485496271, + "learning_rate": 7.111236316806182e-07, + "logits/chosen": 0.22472533583641052, + "logits/rejected": 0.15937499701976776, + "logps/chosen": -384.07501220703125, + "logps/rejected": -381.75, + "loss": 0.1028, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.36088865995407104, + "rewards/margins": 4.195703029632568, + "rewards/rejected": -3.836132764816284, + "step": 3590 + }, + { + "epoch": 1.158686730506156, + "grad_norm": 23.735912430254956, + "learning_rate": 7.103187379265937e-07, + "logits/chosen": 0.3221435546875, + "logits/rejected": 0.15773925185203552, + "logps/chosen": -423.91876220703125, + "logps/rejected": -388.67498779296875, + "loss": 0.1163, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.8077972531318665, + "rewards/margins": 4.878125190734863, + "rewards/rejected": -4.068359375, + "step": 3600 + }, + { + "epoch": 1.1619055282851856, + "grad_norm": 32.654065886303655, + "learning_rate": 7.095138441725692e-07, + "logits/chosen": 0.25059205293655396, + "logits/rejected": 0.16866150498390198, + "logps/chosen": -389.3500061035156, + "logps/rejected": -402.04998779296875, + "loss": 0.1234, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.7878051996231079, + "rewards/margins": 4.611523628234863, + "rewards/rejected": -3.822265625, + "step": 3610 + }, + { + "epoch": 1.165124326064215, + "grad_norm": 15.05593222034749, + "learning_rate": 7.087089504185448e-07, + "logits/chosen": 0.09567870944738388, + "logits/rejected": 0.005584716796875, + "logps/chosen": -386.45001220703125, + "logps/rejected": -426.6499938964844, + "loss": 0.1072, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.696514904499054, + "rewards/margins": 4.849999904632568, + "rewards/rejected": -4.156445503234863, + "step": 3620 + }, + { + "epoch": 1.1683431238432445, + "grad_norm": 24.414226279324126, + "learning_rate": 7.079040566645202e-07, + "logits/chosen": 0.047637939453125, + "logits/rejected": -0.0043853758834302425, + "logps/chosen": -392.54998779296875, + "logps/rejected": -402.4750061035156, + "loss": 0.1059, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.29499512910842896, + "rewards/margins": 4.810742378234863, + "rewards/rejected": -4.515234470367432, + "step": 3630 + }, + { + "epoch": 1.1715619216222741, + "grad_norm": 21.892397265809176, + "learning_rate": 7.070991629104957e-07, + "logits/chosen": 0.26855164766311646, + "logits/rejected": 0.1742706298828125, + "logps/chosen": -378.5249938964844, + "logps/rejected": -425.2250061035156, + "loss": 0.0707, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.842211902141571, + "rewards/margins": 5.1796875, + "rewards/rejected": -4.341015815734863, + "step": 3640 + }, + { + "epoch": 1.1747807194013036, + "grad_norm": 51.108977392538726, + "learning_rate": 7.062942691564713e-07, + "logits/chosen": 0.20552673935890198, + "logits/rejected": 0.20946654677391052, + "logps/chosen": -359.5625, + "logps/rejected": -383.2250061035156, + "loss": 0.118, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.714068591594696, + "rewards/margins": 4.435937404632568, + "rewards/rejected": -3.723437547683716, + "step": 3650 + }, + { + "epoch": 1.177999517180333, + "grad_norm": 34.1089624543139, + "learning_rate": 7.054893754024468e-07, + "logits/chosen": 0.08717040717601776, + "logits/rejected": 0.038646697998046875, + "logps/chosen": -377.82501220703125, + "logps/rejected": -363.25, + "loss": 0.1326, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.797595202922821, + "rewards/margins": 4.62109375, + "rewards/rejected": -3.8208985328674316, + "step": 3660 + }, + { + "epoch": 1.1812183149593627, + "grad_norm": 33.909379110044405, + "learning_rate": 7.046844816484224e-07, + "logits/chosen": 0.13117370009422302, + "logits/rejected": 0.01885681226849556, + "logps/chosen": -360.01251220703125, + "logps/rejected": -371.2124938964844, + "loss": 0.0822, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9515380859375, + "rewards/margins": 4.633984565734863, + "rewards/rejected": -3.681640625, + "step": 3670 + }, + { + "epoch": 1.1844371127383921, + "grad_norm": 20.434220336754045, + "learning_rate": 7.03879587894398e-07, + "logits/chosen": 0.19266967475414276, + "logits/rejected": 0.11001281440258026, + "logps/chosen": -353.57501220703125, + "logps/rejected": -364.04998779296875, + "loss": 0.0906, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.055419921875, + "rewards/margins": 4.634765625, + "rewards/rejected": -3.579296827316284, + "step": 3680 + }, + { + "epoch": 1.1876559105174218, + "grad_norm": 59.27339403658297, + "learning_rate": 7.030746941403735e-07, + "logits/chosen": 0.15883788466453552, + "logits/rejected": 0.12996825575828552, + "logps/chosen": -408.0249938964844, + "logps/rejected": -404.125, + "loss": 0.1076, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9482787847518921, + "rewards/margins": 4.684374809265137, + "rewards/rejected": -3.7388672828674316, + "step": 3690 + }, + { + "epoch": 1.1908747082964513, + "grad_norm": 28.84329533003731, + "learning_rate": 7.02269800386349e-07, + "logits/chosen": 0.0633544921875, + "logits/rejected": 0.08895263820886612, + "logps/chosen": -370.0874938964844, + "logps/rejected": -401.3500061035156, + "loss": 0.1329, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.728808581829071, + "rewards/margins": 4.5087890625, + "rewards/rejected": -3.782519578933716, + "step": 3700 + }, + { + "epoch": 1.1940935060754807, + "grad_norm": 17.687804262535945, + "learning_rate": 7.014649066323245e-07, + "logits/chosen": 0.147308349609375, + "logits/rejected": 0.022186279296875, + "logps/chosen": -402.5625, + "logps/rejected": -399.1875, + "loss": 0.1131, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 1.0063598155975342, + "rewards/margins": 5.027929782867432, + "rewards/rejected": -4.021093845367432, + "step": 3710 + }, + { + "epoch": 1.1973123038545104, + "grad_norm": 24.77861160053137, + "learning_rate": 7.006600128783e-07, + "logits/chosen": 0.11234893649816513, + "logits/rejected": -0.00665283203125, + "logps/chosen": -362.42498779296875, + "logps/rejected": -376.7749938964844, + "loss": 0.1123, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.64556884765625, + "rewards/margins": 4.446484565734863, + "rewards/rejected": -3.8017578125, + "step": 3720 + }, + { + "epoch": 1.2005311016335398, + "grad_norm": 30.853498754416336, + "learning_rate": 6.998551191242755e-07, + "logits/chosen": 0.17337647080421448, + "logits/rejected": 0.13140258193016052, + "logps/chosen": -406.57501220703125, + "logps/rejected": -429.4750061035156, + "loss": 0.0985, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.8588501214981079, + "rewards/margins": 4.903906345367432, + "rewards/rejected": -4.045702934265137, + "step": 3730 + }, + { + "epoch": 1.2037498994125695, + "grad_norm": 29.450979363911525, + "learning_rate": 6.990502253702511e-07, + "logits/chosen": 0.276580810546875, + "logits/rejected": 0.14074096083641052, + "logps/chosen": -424.3374938964844, + "logps/rejected": -438.82501220703125, + "loss": 0.0854, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.867431640625, + "rewards/margins": 5.028710842132568, + "rewards/rejected": -4.159081935882568, + "step": 3740 + }, + { + "epoch": 1.206968697191599, + "grad_norm": 44.18835303159895, + "learning_rate": 6.982453316162267e-07, + "logits/chosen": 0.17947998642921448, + "logits/rejected": 0.04878845065832138, + "logps/chosen": -389.3374938964844, + "logps/rejected": -405.5249938964844, + "loss": 0.1078, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.7165282964706421, + "rewards/margins": 4.893945217132568, + "rewards/rejected": -4.178124904632568, + "step": 3750 + }, + { + "epoch": 1.2101874949706284, + "grad_norm": 15.700983676275097, + "learning_rate": 6.974404378622022e-07, + "logits/chosen": 0.160186767578125, + "logits/rejected": 0.0439453125, + "logps/chosen": -401.36248779296875, + "logps/rejected": -398.5, + "loss": 0.0878, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.766497790813446, + "rewards/margins": 5.219922065734863, + "rewards/rejected": -4.451952934265137, + "step": 3760 + }, + { + "epoch": 1.213406292749658, + "grad_norm": 27.559085710667816, + "learning_rate": 6.966355441081778e-07, + "logits/chosen": 0.124786376953125, + "logits/rejected": 0.09946288913488388, + "logps/chosen": -386.54998779296875, + "logps/rejected": -394.5, + "loss": 0.0794, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.632678210735321, + "rewards/margins": 5.017578125, + "rewards/rejected": -4.38671875, + "step": 3770 + }, + { + "epoch": 1.2166250905286875, + "grad_norm": 32.57301934233551, + "learning_rate": 6.958306503541532e-07, + "logits/chosen": 0.31534117460250854, + "logits/rejected": 0.17431029677391052, + "logps/chosen": -399.32501220703125, + "logps/rejected": -383.57501220703125, + "loss": 0.1075, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7266510128974915, + "rewards/margins": 4.821093559265137, + "rewards/rejected": -4.091601371765137, + "step": 3780 + }, + { + "epoch": 1.2198438883077172, + "grad_norm": 30.78919936371543, + "learning_rate": 6.950257566001287e-07, + "logits/chosen": -0.09363403171300888, + "logits/rejected": -0.02550659142434597, + "logps/chosen": -371.9125061035156, + "logps/rejected": -407.36248779296875, + "loss": 0.0852, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.15339355170726776, + "rewards/margins": 5.029882907867432, + "rewards/rejected": -4.873437404632568, + "step": 3790 + }, + { + "epoch": 1.2230626860867466, + "grad_norm": 26.359994065133158, + "learning_rate": 6.942208628461042e-07, + "logits/chosen": -0.005999756045639515, + "logits/rejected": -0.05394287034869194, + "logps/chosen": -433.9750061035156, + "logps/rejected": -444.1000061035156, + "loss": 0.0664, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3089843690395355, + "rewards/margins": 5.440234184265137, + "rewards/rejected": -5.131249904632568, + "step": 3800 + }, + { + "epoch": 1.226281483865776, + "grad_norm": 16.303886839533924, + "learning_rate": 6.934159690920798e-07, + "logits/chosen": 0.09896240383386612, + "logits/rejected": -0.05314941331744194, + "logps/chosen": -453.67498779296875, + "logps/rejected": -438.42498779296875, + "loss": 0.0849, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.3594421446323395, + "rewards/margins": 5.233593940734863, + "rewards/rejected": -4.876953125, + "step": 3810 + }, + { + "epoch": 1.2295002816448057, + "grad_norm": 29.14705218330332, + "learning_rate": 6.926110753380553e-07, + "logits/chosen": 0.07645263522863388, + "logits/rejected": 0.00952758826315403, + "logps/chosen": -408.4750061035156, + "logps/rejected": -409.1625061035156, + "loss": 0.1018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.17090149223804474, + "rewards/margins": 4.955078125, + "rewards/rejected": -4.783593654632568, + "step": 3820 + }, + { + "epoch": 1.2327190794238352, + "grad_norm": 27.0476870369674, + "learning_rate": 6.918061815840309e-07, + "logits/chosen": -0.05864257737994194, + "logits/rejected": -0.16047362983226776, + "logps/chosen": -438.8999938964844, + "logps/rejected": -461.42498779296875, + "loss": 0.0692, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.16944579780101776, + "rewards/margins": 5.50390625, + "rewards/rejected": -5.673437595367432, + "step": 3830 + }, + { + "epoch": 1.2359378772028649, + "grad_norm": 8.745287050245846, + "learning_rate": 6.910012878300065e-07, + "logits/chosen": -0.20604553818702698, + "logits/rejected": -0.24652099609375, + "logps/chosen": -394.2250061035156, + "logps/rejected": -418.04998779296875, + "loss": 0.0721, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.22413940727710724, + "rewards/margins": 5.093359470367432, + "rewards/rejected": -5.319726467132568, + "step": 3840 + }, + { + "epoch": 1.2391566749818943, + "grad_norm": 16.44908500621224, + "learning_rate": 6.90196394075982e-07, + "logits/chosen": -0.12526854872703552, + "logits/rejected": -0.24386902153491974, + "logps/chosen": -414.54998779296875, + "logps/rejected": -430.32501220703125, + "loss": 0.0709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.17121581733226776, + "rewards/margins": 5.709374904632568, + "rewards/rejected": -5.538281440734863, + "step": 3850 + }, + { + "epoch": 1.2423754727609237, + "grad_norm": 30.427881255162124, + "learning_rate": 6.893915003219574e-07, + "logits/chosen": 0.01242675818502903, + "logits/rejected": -0.13436278700828552, + "logps/chosen": -385.26251220703125, + "logps/rejected": -389.3999938964844, + "loss": 0.0887, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.04581298679113388, + "rewards/margins": 5.246874809265137, + "rewards/rejected": -5.203515529632568, + "step": 3860 + }, + { + "epoch": 1.2455942705399534, + "grad_norm": 60.9479806439901, + "learning_rate": 6.88586606567933e-07, + "logits/chosen": 0.08955688774585724, + "logits/rejected": -0.05073242262005806, + "logps/chosen": -447.0, + "logps/rejected": -468.67498779296875, + "loss": 0.1334, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.19309692084789276, + "rewards/margins": 5.234765529632568, + "rewards/rejected": -5.426953315734863, + "step": 3870 + }, + { + "epoch": 1.2488130683189829, + "grad_norm": 31.587808401179426, + "learning_rate": 6.877817128139085e-07, + "logits/chosen": 0.02855224534869194, + "logits/rejected": -0.1221923828125, + "logps/chosen": -415.4375, + "logps/rejected": -396.17498779296875, + "loss": 0.1188, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3105102479457855, + "rewards/margins": 4.843163967132568, + "rewards/rejected": -4.534375190734863, + "step": 3880 + }, + { + "epoch": 1.2520318660980123, + "grad_norm": 45.11108294246491, + "learning_rate": 6.86976819059884e-07, + "logits/chosen": 0.0787506103515625, + "logits/rejected": 0.0018447876209393144, + "logps/chosen": -403.88751220703125, + "logps/rejected": -432.0249938964844, + "loss": 0.0788, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.96923828125, + "rewards/margins": 5.071875095367432, + "rewards/rejected": -4.104296684265137, + "step": 3890 + }, + { + "epoch": 1.255250663877042, + "grad_norm": 63.263168672058995, + "learning_rate": 6.861719253058596e-07, + "logits/chosen": 0.05616912990808487, + "logits/rejected": 0.03765869140625, + "logps/chosen": -415.8500061035156, + "logps/rejected": -429.125, + "loss": 0.0955, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0338256359100342, + "rewards/margins": 4.880663871765137, + "rewards/rejected": -3.841601610183716, + "step": 3900 + }, + { + "epoch": 1.2584694616560714, + "grad_norm": 15.837482296693448, + "learning_rate": 6.853670315518352e-07, + "logits/chosen": 0.05369872972369194, + "logits/rejected": 0.04509277269244194, + "logps/chosen": -394.5249938964844, + "logps/rejected": -396.25, + "loss": 0.0904, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.622692883014679, + "rewards/margins": 4.805078029632568, + "rewards/rejected": -4.177538871765137, + "step": 3910 + }, + { + "epoch": 1.2616882594351009, + "grad_norm": 31.667004766909784, + "learning_rate": 6.845621377978107e-07, + "logits/chosen": 0.034515380859375, + "logits/rejected": -0.025614166632294655, + "logps/chosen": -383.9125061035156, + "logps/rejected": -417.13751220703125, + "loss": 0.0988, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.47808837890625, + "rewards/margins": 5.042089939117432, + "rewards/rejected": -4.5654296875, + "step": 3920 + }, + { + "epoch": 1.2649070572141305, + "grad_norm": 59.15970937850976, + "learning_rate": 6.837572440437862e-07, + "logits/chosen": -0.1024322509765625, + "logits/rejected": -0.21962738037109375, + "logps/chosen": -397.15625, + "logps/rejected": -410.125, + "loss": 0.0706, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.05047607421875, + "rewards/margins": 5.265234470367432, + "rewards/rejected": -5.213281154632568, + "step": 3930 + }, + { + "epoch": 1.26812585499316, + "grad_norm": 55.81315170518486, + "learning_rate": 6.829523502897617e-07, + "logits/chosen": -0.15672607719898224, + "logits/rejected": -0.33135986328125, + "logps/chosen": -412.0, + "logps/rejected": -417.7749938964844, + "loss": 0.0749, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.23487548530101776, + "rewards/margins": 5.755468845367432, + "rewards/rejected": -5.521484375, + "step": 3940 + }, + { + "epoch": 1.2713446527721897, + "grad_norm": 18.078651602480292, + "learning_rate": 6.821474565357372e-07, + "logits/chosen": 0.015167236328125, + "logits/rejected": -0.11549071967601776, + "logps/chosen": -401.8999938964844, + "logps/rejected": -404.4750061035156, + "loss": 0.0843, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2832275331020355, + "rewards/margins": 4.946484565734863, + "rewards/rejected": -4.6640625, + "step": 3950 + }, + { + "epoch": 1.2745634505512191, + "grad_norm": 10.728334264849941, + "learning_rate": 6.813425627817128e-07, + "logits/chosen": 0.01788940466940403, + "logits/rejected": -0.11822891235351562, + "logps/chosen": -388.5, + "logps/rejected": -401.79998779296875, + "loss": 0.0652, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.42845457792282104, + "rewards/margins": 5.49609375, + "rewards/rejected": -5.0703125, + "step": 3960 + }, + { + "epoch": 1.2777822483302486, + "grad_norm": 38.93997501163191, + "learning_rate": 6.805376690276883e-07, + "logits/chosen": -0.10703124850988388, + "logits/rejected": -0.09844207763671875, + "logps/chosen": -380.32501220703125, + "logps/rejected": -454.29998779296875, + "loss": 0.0564, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06067504733800888, + "rewards/margins": 6.371484279632568, + "rewards/rejected": -6.3125, + "step": 3970 + }, + { + "epoch": 1.2810010461092782, + "grad_norm": 18.329806971804686, + "learning_rate": 6.797327752736638e-07, + "logits/chosen": -0.22577515244483948, + "logits/rejected": -0.2769317626953125, + "logps/chosen": -368.4750061035156, + "logps/rejected": -421.70001220703125, + "loss": 0.1099, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.3246780335903168, + "rewards/margins": 5.890625, + "rewards/rejected": -5.568749904632568, + "step": 3980 + }, + { + "epoch": 1.2842198438883077, + "grad_norm": 59.02216114452859, + "learning_rate": 6.789278815196395e-07, + "logits/chosen": 0.01652831956744194, + "logits/rejected": -0.10218505561351776, + "logps/chosen": -418.04998779296875, + "logps/rejected": -413.23748779296875, + "loss": 0.0837, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.0425567626953125, + "rewards/margins": 5.525781154632568, + "rewards/rejected": -5.566015720367432, + "step": 3990 + }, + { + "epoch": 1.2874386416673373, + "grad_norm": 48.027939716066555, + "learning_rate": 6.781229877656149e-07, + "logits/chosen": -0.04545898362994194, + "logits/rejected": -0.13904419541358948, + "logps/chosen": -393.17498779296875, + "logps/rejected": -422.51251220703125, + "loss": 0.0918, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.09403076022863388, + "rewards/margins": 5.298828125, + "rewards/rejected": -5.391211032867432, + "step": 4000 + }, + { + "epoch": 1.2906574394463668, + "grad_norm": 15.984713886485936, + "learning_rate": 6.773180940115904e-07, + "logits/chosen": 0.05025024339556694, + "logits/rejected": 0.0008117675897665322, + "logps/chosen": -433.29998779296875, + "logps/rejected": -467.625, + "loss": 0.0914, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.22646483778953552, + "rewards/margins": 5.805078029632568, + "rewards/rejected": -5.581250190734863, + "step": 4010 + }, + { + "epoch": 1.2938762372253962, + "grad_norm": 16.931160253633315, + "learning_rate": 6.76513200257566e-07, + "logits/chosen": 0.10808715969324112, + "logits/rejected": -0.0042633055709302425, + "logps/chosen": -437.5625, + "logps/rejected": -438.1499938964844, + "loss": 0.1214, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.39344483613967896, + "rewards/margins": 5.5, + "rewards/rejected": -5.105859279632568, + "step": 4020 + }, + { + "epoch": 1.297095035004426, + "grad_norm": 15.04258286486589, + "learning_rate": 6.757083065035415e-07, + "logits/chosen": 0.06659965217113495, + "logits/rejected": -0.02254028245806694, + "logps/chosen": -417.92498779296875, + "logps/rejected": -426.29998779296875, + "loss": 0.086, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.5378051996231079, + "rewards/margins": 5.652734279632568, + "rewards/rejected": -5.116015434265137, + "step": 4030 + }, + { + "epoch": 1.3003138327834554, + "grad_norm": 34.19354267736491, + "learning_rate": 6.74903412749517e-07, + "logits/chosen": 0.171966552734375, + "logits/rejected": 0.08466186374425888, + "logps/chosen": -434.20001220703125, + "logps/rejected": -407.92498779296875, + "loss": 0.0766, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.7345336675643921, + "rewards/margins": 5.134375095367432, + "rewards/rejected": -4.401562690734863, + "step": 4040 + }, + { + "epoch": 1.303532630562485, + "grad_norm": 28.36768033746519, + "learning_rate": 6.740985189954926e-07, + "logits/chosen": 0.09619140625, + "logits/rejected": -0.00867309607565403, + "logps/chosen": -368.5874938964844, + "logps/rejected": -356.625, + "loss": 0.107, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.15446777641773224, + "rewards/margins": 4.430468559265137, + "rewards/rejected": -4.586718559265137, + "step": 4050 + }, + { + "epoch": 1.3067514283415145, + "grad_norm": 24.738843858928092, + "learning_rate": 6.732936252414681e-07, + "logits/chosen": 0.164703369140625, + "logits/rejected": 0.05294189602136612, + "logps/chosen": -404.0, + "logps/rejected": -382.26251220703125, + "loss": 0.0858, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.07716064155101776, + "rewards/margins": 4.948437690734863, + "rewards/rejected": -4.870312690734863, + "step": 4060 + }, + { + "epoch": 1.309970226120544, + "grad_norm": 19.96969473139889, + "learning_rate": 6.724887314874436e-07, + "logits/chosen": 0.05486450344324112, + "logits/rejected": -0.09609679877758026, + "logps/chosen": -405.4375, + "logps/rejected": -382.1000061035156, + "loss": 0.0935, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02144775353372097, + "rewards/margins": 4.926465034484863, + "rewards/rejected": -4.908203125, + "step": 4070 + }, + { + "epoch": 1.3131890238995734, + "grad_norm": 40.495182856475736, + "learning_rate": 6.716838377334192e-07, + "logits/chosen": 0.140167236328125, + "logits/rejected": 0.011045074090361595, + "logps/chosen": -416.4750061035156, + "logps/rejected": -404.8999938964844, + "loss": 0.1176, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09373779594898224, + "rewards/margins": 5.014062404632568, + "rewards/rejected": -4.920117378234863, + "step": 4080 + }, + { + "epoch": 1.316407821678603, + "grad_norm": 46.04527077508931, + "learning_rate": 6.708789439793947e-07, + "logits/chosen": 0.15576782822608948, + "logits/rejected": -0.02394714392721653, + "logps/chosen": -379.4750061035156, + "logps/rejected": -397.57501220703125, + "loss": 0.0931, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19361571967601776, + "rewards/margins": 5.03515625, + "rewards/rejected": -5.232421875, + "step": 4090 + }, + { + "epoch": 1.3196266194576327, + "grad_norm": 9.236113968321147, + "learning_rate": 6.700740502253702e-07, + "logits/chosen": 0.011395263485610485, + "logits/rejected": -0.17100830376148224, + "logps/chosen": -404.0625, + "logps/rejected": -410.04998779296875, + "loss": 0.0591, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.104339599609375, + "rewards/margins": 5.285937309265137, + "rewards/rejected": -5.390234470367432, + "step": 4100 + }, + { + "epoch": 1.3228454172366622, + "grad_norm": 38.58462427309358, + "learning_rate": 6.692691564713457e-07, + "logits/chosen": 0.03755798190832138, + "logits/rejected": -0.11359252780675888, + "logps/chosen": -382.6000061035156, + "logps/rejected": -378.1499938964844, + "loss": 0.0954, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.37371826171875, + "rewards/margins": 4.78515625, + "rewards/rejected": -5.16015625, + "step": 4110 + }, + { + "epoch": 1.3260642150156916, + "grad_norm": 60.83065355080023, + "learning_rate": 6.684642627173213e-07, + "logits/chosen": 0.13038940727710724, + "logits/rejected": 0.02085113525390625, + "logps/chosen": -392.32501220703125, + "logps/rejected": -384.07501220703125, + "loss": 0.0919, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.30412596464157104, + "rewards/margins": 4.983984470367432, + "rewards/rejected": -5.288671970367432, + "step": 4120 + }, + { + "epoch": 1.329283012794721, + "grad_norm": 22.51761538245697, + "learning_rate": 6.676593689632968e-07, + "logits/chosen": -0.07775573432445526, + "logits/rejected": -0.09187011420726776, + "logps/chosen": -433.0249938964844, + "logps/rejected": -430.04998779296875, + "loss": 0.0993, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.3796020448207855, + "rewards/margins": 5.073437690734863, + "rewards/rejected": -4.696875095367432, + "step": 4130 + }, + { + "epoch": 1.3325018105737507, + "grad_norm": 25.125738591742063, + "learning_rate": 6.668544752092722e-07, + "logits/chosen": 0.10197754204273224, + "logits/rejected": -0.02750244177877903, + "logps/chosen": -415.79998779296875, + "logps/rejected": -399.70001220703125, + "loss": 0.0974, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.38640135526657104, + "rewards/margins": 5.0234375, + "rewards/rejected": -4.635937690734863, + "step": 4140 + }, + { + "epoch": 1.3357206083527802, + "grad_norm": 39.07530885966664, + "learning_rate": 6.660495814552479e-07, + "logits/chosen": 0.24863891303539276, + "logits/rejected": 0.19878235459327698, + "logps/chosen": -397.79998779296875, + "logps/rejected": -421.875, + "loss": 0.071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6745849847793579, + "rewards/margins": 5.031640529632568, + "rewards/rejected": -4.357812404632568, + "step": 4150 + }, + { + "epoch": 1.3389394061318098, + "grad_norm": 46.314234170436436, + "learning_rate": 6.652446877012234e-07, + "logits/chosen": -0.02519531175494194, + "logits/rejected": -0.07564391940832138, + "logps/chosen": -421.2124938964844, + "logps/rejected": -424.0625, + "loss": 0.0832, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.3768676817417145, + "rewards/margins": 5.114453315734863, + "rewards/rejected": -4.737500190734863, + "step": 4160 + }, + { + "epoch": 1.3421582039108393, + "grad_norm": 28.636530075521303, + "learning_rate": 6.644397939471989e-07, + "logits/chosen": 0.09256897121667862, + "logits/rejected": 0.04421386867761612, + "logps/chosen": -378.0249938964844, + "logps/rejected": -430.42498779296875, + "loss": 0.079, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2627197206020355, + "rewards/margins": 4.973437309265137, + "rewards/rejected": -4.708203315734863, + "step": 4170 + }, + { + "epoch": 1.3453770016898687, + "grad_norm": 19.44360580279575, + "learning_rate": 6.636349001931745e-07, + "logits/chosen": 0.18400421738624573, + "logits/rejected": 0.11618652194738388, + "logps/chosen": -439.4750061035156, + "logps/rejected": -461.7250061035156, + "loss": 0.0848, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.27631837129592896, + "rewards/margins": 5.175976753234863, + "rewards/rejected": -4.898828029632568, + "step": 4180 + }, + { + "epoch": 1.3485957994688984, + "grad_norm": 25.623699855600503, + "learning_rate": 6.6283000643915e-07, + "logits/chosen": 0.15620727837085724, + "logits/rejected": 0.06724853813648224, + "logps/chosen": -430.5, + "logps/rejected": -445.125, + "loss": 0.0913, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4233642518520355, + "rewards/margins": 5.060937404632568, + "rewards/rejected": -4.638671875, + "step": 4190 + }, + { + "epoch": 1.3518145972479279, + "grad_norm": 43.003789585113616, + "learning_rate": 6.620251126851255e-07, + "logits/chosen": -0.0026000975631177425, + "logits/rejected": -0.07552490383386612, + "logps/chosen": -403.70001220703125, + "logps/rejected": -403.0249938964844, + "loss": 0.0948, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.4774719178676605, + "rewards/margins": 4.988867282867432, + "rewards/rejected": -4.509765625, + "step": 4200 + }, + { + "epoch": 1.3550333950269575, + "grad_norm": 13.067743220146138, + "learning_rate": 6.612202189311011e-07, + "logits/chosen": 0.0232391357421875, + "logits/rejected": -0.01422729529440403, + "logps/chosen": -400.0249938964844, + "logps/rejected": -408.1875, + "loss": 0.0966, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.589263916015625, + "rewards/margins": 5.4169921875, + "rewards/rejected": -4.827343940734863, + "step": 4210 + }, + { + "epoch": 1.358252192805987, + "grad_norm": 36.30627889306842, + "learning_rate": 6.604153251770765e-07, + "logits/chosen": 0.14351806044578552, + "logits/rejected": -0.03668365627527237, + "logps/chosen": -388.7749938964844, + "logps/rejected": -400.7749938964844, + "loss": 0.0821, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5236175656318665, + "rewards/margins": 5.106249809265137, + "rewards/rejected": -4.583593845367432, + "step": 4220 + }, + { + "epoch": 1.3614709905850164, + "grad_norm": 28.57636649359089, + "learning_rate": 6.596104314230521e-07, + "logits/chosen": 0.19674071669578552, + "logits/rejected": 0.120941162109375, + "logps/chosen": -441.20001220703125, + "logps/rejected": -425.5, + "loss": 0.0834, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.5794311761856079, + "rewards/margins": 5.279687404632568, + "rewards/rejected": -4.700390815734863, + "step": 4230 + }, + { + "epoch": 1.364689788364046, + "grad_norm": 27.39527797875861, + "learning_rate": 6.588055376690277e-07, + "logits/chosen": 0.2148178070783615, + "logits/rejected": 0.18697205185890198, + "logps/chosen": -422.1000061035156, + "logps/rejected": -420.67498779296875, + "loss": 0.0862, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6568847894668579, + "rewards/margins": 4.928515434265137, + "rewards/rejected": -4.271288871765137, + "step": 4240 + }, + { + "epoch": 1.3679085861430755, + "grad_norm": 24.91947744002902, + "learning_rate": 6.580006439150032e-07, + "logits/chosen": 0.18180541694164276, + "logits/rejected": -0.03119964525103569, + "logps/chosen": -455.0249938964844, + "logps/rejected": -429.125, + "loss": 0.0783, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3790687620639801, + "rewards/margins": 5.104687690734863, + "rewards/rejected": -4.730664253234863, + "step": 4250 + }, + { + "epoch": 1.3711273839221052, + "grad_norm": 14.512662282824195, + "learning_rate": 6.571957501609787e-07, + "logits/chosen": -0.05780639499425888, + "logits/rejected": -0.01131591759622097, + "logps/chosen": -374.625, + "logps/rejected": -437.0249938964844, + "loss": 0.0956, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.09890136867761612, + "rewards/margins": 4.998827934265137, + "rewards/rejected": -5.098828315734863, + "step": 4260 + }, + { + "epoch": 1.3743461817011347, + "grad_norm": 27.72560602435211, + "learning_rate": 6.563908564069543e-07, + "logits/chosen": -0.09939880669116974, + "logits/rejected": -0.1706192046403885, + "logps/chosen": -448.5249938964844, + "logps/rejected": -428.7875061035156, + "loss": 0.0739, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.23575440049171448, + "rewards/margins": 5.660937309265137, + "rewards/rejected": -5.894921779632568, + "step": 4270 + }, + { + "epoch": 1.377564979480164, + "grad_norm": 24.20844215798876, + "learning_rate": 6.555859626529298e-07, + "logits/chosen": -0.02216796949505806, + "logits/rejected": -0.12272338569164276, + "logps/chosen": -398.95001220703125, + "logps/rejected": -433.2749938964844, + "loss": 0.0838, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.67364501953125, + "rewards/margins": 5.069921970367432, + "rewards/rejected": -5.741796970367432, + "step": 4280 + }, + { + "epoch": 1.3807837772591938, + "grad_norm": 20.584429792098263, + "learning_rate": 6.547810688989052e-07, + "logits/chosen": 0.09197387844324112, + "logits/rejected": -0.10872497409582138, + "logps/chosen": -449.8500061035156, + "logps/rejected": -405.04998779296875, + "loss": 0.1155, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.758837878704071, + "rewards/margins": 4.814453125, + "rewards/rejected": -5.571484565734863, + "step": 4290 + }, + { + "epoch": 1.3840025750382232, + "grad_norm": 34.632945296171656, + "learning_rate": 6.539761751448808e-07, + "logits/chosen": 0.053081512451171875, + "logits/rejected": 0.0009704589610919356, + "logps/chosen": -426.57501220703125, + "logps/rejected": -426.63751220703125, + "loss": 0.0653, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.17559814453125, + "rewards/margins": 5.255273342132568, + "rewards/rejected": -5.427734375, + "step": 4300 + }, + { + "epoch": 1.3872213728172529, + "grad_norm": 52.947519863728935, + "learning_rate": 6.531712813908564e-07, + "logits/chosen": 0.0054565430618822575, + "logits/rejected": -0.06818237155675888, + "logps/chosen": -410.54998779296875, + "logps/rejected": -421.5, + "loss": 0.1213, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.53424072265625, + "rewards/margins": 5.128125190734863, + "rewards/rejected": -5.662890434265137, + "step": 4310 + }, + { + "epoch": 1.3904401705962823, + "grad_norm": 56.25062354335611, + "learning_rate": 6.523663876368319e-07, + "logits/chosen": 0.13584594428539276, + "logits/rejected": -0.03050689771771431, + "logps/chosen": -413.17498779296875, + "logps/rejected": -367.5, + "loss": 0.0931, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.3894409239292145, + "rewards/margins": 4.574804782867432, + "rewards/rejected": -4.966796875, + "step": 4320 + }, + { + "epoch": 1.3936589683753118, + "grad_norm": 13.990329818129398, + "learning_rate": 6.515614938828075e-07, + "logits/chosen": 0.04324188083410263, + "logits/rejected": -0.09816894680261612, + "logps/chosen": -422.5, + "logps/rejected": -427.82501220703125, + "loss": 0.0578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05156249925494194, + "rewards/margins": 5.280468940734863, + "rewards/rejected": -5.330468654632568, + "step": 4330 + }, + { + "epoch": 1.3968777661543412, + "grad_norm": 13.524230926733628, + "learning_rate": 6.50756600128783e-07, + "logits/chosen": -0.04895935207605362, + "logits/rejected": -0.18629150092601776, + "logps/chosen": -386.42498779296875, + "logps/rejected": -392.82501220703125, + "loss": 0.0859, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.647509753704071, + "rewards/margins": 4.990429878234863, + "rewards/rejected": -5.638671875, + "step": 4340 + }, + { + "epoch": 1.400096563933371, + "grad_norm": 57.11571426548283, + "learning_rate": 6.499517063747585e-07, + "logits/chosen": 0.03632812574505806, + "logits/rejected": -0.14433899521827698, + "logps/chosen": -415.82501220703125, + "logps/rejected": -406.04998779296875, + "loss": 0.0877, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02032470703125, + "rewards/margins": 5.250586032867432, + "rewards/rejected": -5.231054782867432, + "step": 4350 + }, + { + "epoch": 1.4033153617124003, + "grad_norm": 18.494412000152465, + "learning_rate": 6.491468126207341e-07, + "logits/chosen": -0.02980346605181694, + "logits/rejected": -0.15522918105125427, + "logps/chosen": -443.59375, + "logps/rejected": -417.29998779296875, + "loss": 0.0799, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.41541749238967896, + "rewards/margins": 5.439062595367432, + "rewards/rejected": -5.021484375, + "step": 4360 + }, + { + "epoch": 1.40653415949143, + "grad_norm": 36.88570438855509, + "learning_rate": 6.483419188667095e-07, + "logits/chosen": 0.04229126125574112, + "logits/rejected": -0.12023620307445526, + "logps/chosen": -419.3125, + "logps/rejected": -389.8999938964844, + "loss": 0.0918, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.333038330078125, + "rewards/margins": 4.908984184265137, + "rewards/rejected": -4.576952934265137, + "step": 4370 + }, + { + "epoch": 1.4097529572704595, + "grad_norm": 55.75677356038742, + "learning_rate": 6.47537025112685e-07, + "logits/chosen": 0.10421142727136612, + "logits/rejected": -0.025726318359375, + "logps/chosen": -406.2749938964844, + "logps/rejected": -400.01251220703125, + "loss": 0.0974, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.590161144733429, + "rewards/margins": 4.866991996765137, + "rewards/rejected": -4.276562690734863, + "step": 4380 + }, + { + "epoch": 1.412971755049489, + "grad_norm": 26.54640518456936, + "learning_rate": 6.467321313586607e-07, + "logits/chosen": 0.20191040635108948, + "logits/rejected": 0.14274902641773224, + "logps/chosen": -436.875, + "logps/rejected": -425.125, + "loss": 0.0689, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.59210205078125, + "rewards/margins": 4.939453125, + "rewards/rejected": -4.345312595367432, + "step": 4390 + }, + { + "epoch": 1.4161905528285186, + "grad_norm": 28.868513945403414, + "learning_rate": 6.459272376046362e-07, + "logits/chosen": 0.02066650427877903, + "logits/rejected": -0.0045166015625, + "logps/chosen": -420.79998779296875, + "logps/rejected": -420.95001220703125, + "loss": 0.0789, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.12874755263328552, + "rewards/margins": 5.041211128234863, + "rewards/rejected": -4.914453029632568, + "step": 4400 + }, + { + "epoch": 1.419409350607548, + "grad_norm": 19.769104541703605, + "learning_rate": 6.451223438506117e-07, + "logits/chosen": 0.13425903022289276, + "logits/rejected": 0.01158218365162611, + "logps/chosen": -412.3999938964844, + "logps/rejected": -426.1499938964844, + "loss": 0.1674, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.23200683295726776, + "rewards/margins": 4.873046875, + "rewards/rejected": -5.106249809265137, + "step": 4410 + }, + { + "epoch": 1.4226281483865777, + "grad_norm": 42.357976281813144, + "learning_rate": 6.443174500965873e-07, + "logits/chosen": 0.0003540039178915322, + "logits/rejected": -0.08789672702550888, + "logps/chosen": -423.5, + "logps/rejected": -425.25, + "loss": 0.104, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2610107362270355, + "rewards/margins": 5.34375, + "rewards/rejected": -5.606249809265137, + "step": 4420 + }, + { + "epoch": 1.4258469461656071, + "grad_norm": 14.15626415922928, + "learning_rate": 6.435125563425628e-07, + "logits/chosen": 0.16103515028953552, + "logits/rejected": -0.09690704196691513, + "logps/chosen": -440.23748779296875, + "logps/rejected": -404.54998779296875, + "loss": 0.1084, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.5318237543106079, + "rewards/margins": 5.000781059265137, + "rewards/rejected": -5.530468940734863, + "step": 4430 + }, + { + "epoch": 1.4290657439446366, + "grad_norm": 42.53296778075217, + "learning_rate": 6.427076625885382e-07, + "logits/chosen": 0.05939941480755806, + "logits/rejected": -0.07339783012866974, + "logps/chosen": -405.86248779296875, + "logps/rejected": -402.7749938964844, + "loss": 0.0847, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.07672424614429474, + "rewards/margins": 4.982031345367432, + "rewards/rejected": -5.05859375, + "step": 4440 + }, + { + "epoch": 1.4322845417236663, + "grad_norm": 30.719670477620916, + "learning_rate": 6.419027688345137e-07, + "logits/chosen": 0.18912963569164276, + "logits/rejected": 0.10110779106616974, + "logps/chosen": -363.29998779296875, + "logps/rejected": -367.42498779296875, + "loss": 0.1014, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3601318299770355, + "rewards/margins": 4.757421970367432, + "rewards/rejected": -4.397265434265137, + "step": 4450 + }, + { + "epoch": 1.4355033395026957, + "grad_norm": 26.54274682873219, + "learning_rate": 6.410978750804893e-07, + "logits/chosen": 0.174754336476326, + "logits/rejected": 0.08510436862707138, + "logps/chosen": -358.2124938964844, + "logps/rejected": -386.7749938964844, + "loss": 0.119, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.02849121019244194, + "rewards/margins": 4.716406345367432, + "rewards/rejected": -4.744726657867432, + "step": 4460 + }, + { + "epoch": 1.4387221372817254, + "grad_norm": 20.343949364903903, + "learning_rate": 6.402929813264649e-07, + "logits/chosen": 0.16326598823070526, + "logits/rejected": 0.04320678859949112, + "logps/chosen": -433.6000061035156, + "logps/rejected": -422.625, + "loss": 0.0626, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.3660644590854645, + "rewards/margins": 5.239062309265137, + "rewards/rejected": -4.870507717132568, + "step": 4470 + }, + { + "epoch": 1.4419409350607548, + "grad_norm": 59.90020152583027, + "learning_rate": 6.394880875724404e-07, + "logits/chosen": 0.05208129808306694, + "logits/rejected": -0.08226928859949112, + "logps/chosen": -418.54998779296875, + "logps/rejected": -417.1000061035156, + "loss": 0.0974, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13521727919578552, + "rewards/margins": 5.392578125, + "rewards/rejected": -5.523046970367432, + "step": 4480 + }, + { + "epoch": 1.4451597328397843, + "grad_norm": 43.50753701426076, + "learning_rate": 6.38683193818416e-07, + "logits/chosen": -0.13032226264476776, + "logits/rejected": -0.08205566555261612, + "logps/chosen": -388.23748779296875, + "logps/rejected": -428.2250061035156, + "loss": 0.1392, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6077880859375, + "rewards/margins": 5.343163967132568, + "rewards/rejected": -5.951562404632568, + "step": 4490 + }, + { + "epoch": 1.448378530618814, + "grad_norm": 79.84374752750237, + "learning_rate": 6.378783000643915e-07, + "logits/chosen": -0.06506957858800888, + "logits/rejected": -0.1981964111328125, + "logps/chosen": -405.07501220703125, + "logps/rejected": -398.79998779296875, + "loss": 0.1086, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.2598510682582855, + "rewards/margins": 4.857421875, + "rewards/rejected": -5.11328125, + "step": 4500 + }, + { + "epoch": 1.4515973283978434, + "grad_norm": 62.70588489964126, + "learning_rate": 6.37073406310367e-07, + "logits/chosen": -0.05344238132238388, + "logits/rejected": -0.12480469048023224, + "logps/chosen": -414.6499938964844, + "logps/rejected": -427.17498779296875, + "loss": 0.1007, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.14470215141773224, + "rewards/margins": 4.997851371765137, + "rewards/rejected": -5.141797065734863, + "step": 4510 + }, + { + "epoch": 1.454816126176873, + "grad_norm": 88.7136435189087, + "learning_rate": 6.362685125563425e-07, + "logits/chosen": -0.02416229248046875, + "logits/rejected": -0.00592041015625, + "logps/chosen": -377.7124938964844, + "logps/rejected": -432.75, + "loss": 0.0988, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.26860350370407104, + "rewards/margins": 5.204687595367432, + "rewards/rejected": -4.935937404632568, + "step": 4520 + }, + { + "epoch": 1.4580349239559025, + "grad_norm": 34.15265795037574, + "learning_rate": 6.35463618802318e-07, + "logits/chosen": -0.06022949144244194, + "logits/rejected": -0.05738067626953125, + "logps/chosen": -328.8812561035156, + "logps/rejected": -382.5249938964844, + "loss": 0.0973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.03916626051068306, + "rewards/margins": 4.774121284484863, + "rewards/rejected": -4.732617378234863, + "step": 4530 + }, + { + "epoch": 1.461253721734932, + "grad_norm": 50.22514495451012, + "learning_rate": 6.346587250482935e-07, + "logits/chosen": -0.18529053032398224, + "logits/rejected": -0.24292907118797302, + "logps/chosen": -378.32501220703125, + "logps/rejected": -413.0, + "loss": 0.1546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08527831733226776, + "rewards/margins": 5.1787109375, + "rewards/rejected": -5.091796875, + "step": 4540 + }, + { + "epoch": 1.4644725195139614, + "grad_norm": 43.80314866032298, + "learning_rate": 6.338538312942692e-07, + "logits/chosen": -0.10334472358226776, + "logits/rejected": -0.27866822481155396, + "logps/chosen": -413.79998779296875, + "logps/rejected": -382.04998779296875, + "loss": 0.109, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.03872070461511612, + "rewards/margins": 4.879687309265137, + "rewards/rejected": -4.920702934265137, + "step": 4550 + }, + { + "epoch": 1.467691317292991, + "grad_norm": 4.3697728385803165, + "learning_rate": 6.330489375402447e-07, + "logits/chosen": -0.04896240308880806, + "logits/rejected": -0.12282714992761612, + "logps/chosen": -430.92498779296875, + "logps/rejected": -471.1875, + "loss": 0.0844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.28596192598342896, + "rewards/margins": 5.737890720367432, + "rewards/rejected": -5.449999809265137, + "step": 4560 + }, + { + "epoch": 1.4709101150720205, + "grad_norm": 71.57130727898026, + "learning_rate": 6.322440437862202e-07, + "logits/chosen": -0.122894287109375, + "logits/rejected": -0.14557495713233948, + "logps/chosen": -415.7124938964844, + "logps/rejected": -424.1499938964844, + "loss": 0.0991, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.14570312201976776, + "rewards/margins": 5.088671684265137, + "rewards/rejected": -4.944921970367432, + "step": 4570 + }, + { + "epoch": 1.4741289128510502, + "grad_norm": 22.320210024165075, + "learning_rate": 6.314391500321958e-07, + "logits/chosen": 0.12545165419578552, + "logits/rejected": -0.03869323804974556, + "logps/chosen": -441.63751220703125, + "logps/rejected": -417.95001220703125, + "loss": 0.0956, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.29962158203125, + "rewards/margins": 4.908203125, + "rewards/rejected": -4.611328125, + "step": 4580 + }, + { + "epoch": 1.4773477106300796, + "grad_norm": 55.01616520756505, + "learning_rate": 6.306342562781712e-07, + "logits/chosen": -0.14332275092601776, + "logits/rejected": -0.17826537787914276, + "logps/chosen": -346.95001220703125, + "logps/rejected": -392.2250061035156, + "loss": 0.1061, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.05250244215130806, + "rewards/margins": 5.065234184265137, + "rewards/rejected": -5.010546684265137, + "step": 4590 + }, + { + "epoch": 1.480566508409109, + "grad_norm": 38.3966691627447, + "learning_rate": 6.298293625241467e-07, + "logits/chosen": -0.12503357231616974, + "logits/rejected": -0.10434265434741974, + "logps/chosen": -356.82501220703125, + "logps/rejected": -391.57501220703125, + "loss": 0.0978, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.16569824516773224, + "rewards/margins": 4.919531345367432, + "rewards/rejected": -4.75390625, + "step": 4600 + }, + { + "epoch": 1.4837853061881388, + "grad_norm": 48.27185350442524, + "learning_rate": 6.290244687701223e-07, + "logits/chosen": 0.17261353135108948, + "logits/rejected": 0.05634307861328125, + "logps/chosen": -402.0874938964844, + "logps/rejected": -383.92498779296875, + "loss": 0.0867, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.325906366109848, + "rewards/margins": 4.825781345367432, + "rewards/rejected": -4.501562595367432, + "step": 4610 + }, + { + "epoch": 1.4870041039671682, + "grad_norm": 52.598427708933464, + "learning_rate": 6.282195750160978e-07, + "logits/chosen": 0.1331787109375, + "logits/rejected": 0.03883666917681694, + "logps/chosen": -409.0249938964844, + "logps/rejected": -411.75, + "loss": 0.0765, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.26872557401657104, + "rewards/margins": 5.098437309265137, + "rewards/rejected": -4.829687595367432, + "step": 4620 + }, + { + "epoch": 1.4902229017461979, + "grad_norm": 44.85375695959892, + "learning_rate": 6.274146812620734e-07, + "logits/chosen": 0.25860291719436646, + "logits/rejected": 0.20173950493335724, + "logps/chosen": -452.48748779296875, + "logps/rejected": -422.75, + "loss": 0.0762, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.740222156047821, + "rewards/margins": 5.181250095367432, + "rewards/rejected": -4.441015720367432, + "step": 4630 + }, + { + "epoch": 1.4934416995252273, + "grad_norm": 14.351534126338397, + "learning_rate": 6.26609787508049e-07, + "logits/chosen": 0.11026611179113388, + "logits/rejected": 0.01331176795065403, + "logps/chosen": -382.2749938964844, + "logps/rejected": -415.0, + "loss": 0.099, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.399392694234848, + "rewards/margins": 5.176171779632568, + "rewards/rejected": -4.776953220367432, + "step": 4640 + }, + { + "epoch": 1.4966604973042568, + "grad_norm": 35.243994004186526, + "learning_rate": 6.258048937540245e-07, + "logits/chosen": 0.10019531100988388, + "logits/rejected": -0.0668182373046875, + "logps/chosen": -448.0375061035156, + "logps/rejected": -426.01251220703125, + "loss": 0.0708, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.502825915813446, + "rewards/margins": 5.366015434265137, + "rewards/rejected": -4.865234375, + "step": 4650 + }, + { + "epoch": 1.4998792950832864, + "grad_norm": 184.77718559721862, + "learning_rate": 6.249999999999999e-07, + "logits/chosen": -0.0081787109375, + "logits/rejected": -0.11568603664636612, + "logps/chosen": -405.20001220703125, + "logps/rejected": -399.79998779296875, + "loss": 0.1442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.3648620545864105, + "rewards/margins": 5.092187404632568, + "rewards/rejected": -4.726953029632568, + "step": 4660 + }, + { + "epoch": 1.5030980928623159, + "grad_norm": 52.43390456500897, + "learning_rate": 6.241951062459755e-07, + "logits/chosen": -0.08719482272863388, + "logits/rejected": -0.18755492568016052, + "logps/chosen": -385.54998779296875, + "logps/rejected": -395.2250061035156, + "loss": 0.1636, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.2268631011247635, + "rewards/margins": 5.201367378234863, + "rewards/rejected": -4.970703125, + "step": 4670 + }, + { + "epoch": 1.5063168906413456, + "grad_norm": 15.808155641239377, + "learning_rate": 6.23390212491951e-07, + "logits/chosen": -0.05051269382238388, + "logits/rejected": -0.11607055366039276, + "logps/chosen": -401.73748779296875, + "logps/rejected": -419.3500061035156, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4193115234375, + "rewards/margins": 5.512499809265137, + "rewards/rejected": -5.092187404632568, + "step": 4680 + }, + { + "epoch": 1.509535688420375, + "grad_norm": 42.667086142582235, + "learning_rate": 6.225853187379265e-07, + "logits/chosen": 0.118743896484375, + "logits/rejected": -0.02200012281537056, + "logps/chosen": -414.54998779296875, + "logps/rejected": -388.32501220703125, + "loss": 0.0797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.30766600370407104, + "rewards/margins": 5.101953029632568, + "rewards/rejected": -4.791406154632568, + "step": 4690 + }, + { + "epoch": 1.5127544861994044, + "grad_norm": 14.237538792733986, + "learning_rate": 6.217804249839022e-07, + "logits/chosen": 0.13725586235523224, + "logits/rejected": -0.05618896335363388, + "logps/chosen": -432.3500061035156, + "logps/rejected": -417.9750061035156, + "loss": 0.0661, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.44176024198532104, + "rewards/margins": 5.118359565734863, + "rewards/rejected": -4.676367282867432, + "step": 4700 + }, + { + "epoch": 1.515973283978434, + "grad_norm": 41.0077844618049, + "learning_rate": 6.209755312298777e-07, + "logits/chosen": -0.02202758751809597, + "logits/rejected": -0.03616943210363388, + "logps/chosen": -377.3374938964844, + "logps/rejected": -378.25, + "loss": 0.1036, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.16951903700828552, + "rewards/margins": 4.417578220367432, + "rewards/rejected": -4.247265815734863, + "step": 4710 + }, + { + "epoch": 1.5191920817574636, + "grad_norm": 18.429347206024236, + "learning_rate": 6.201706374758532e-07, + "logits/chosen": 0.10724487155675888, + "logits/rejected": -0.03243713453412056, + "logps/chosen": -399.5625, + "logps/rejected": -406.6625061035156, + "loss": 0.0729, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.6122802495956421, + "rewards/margins": 5.296484470367432, + "rewards/rejected": -4.684960842132568, + "step": 4720 + }, + { + "epoch": 1.5224108795364932, + "grad_norm": 38.324914507026804, + "learning_rate": 6.193657437218288e-07, + "logits/chosen": -0.1450958251953125, + "logits/rejected": -0.17911987006664276, + "logps/chosen": -400.70001220703125, + "logps/rejected": -412.9750061035156, + "loss": 0.0739, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.38325196504592896, + "rewards/margins": 5.408789157867432, + "rewards/rejected": -5.025781154632568, + "step": 4730 + }, + { + "epoch": 1.5256296773155227, + "grad_norm": 9.810043485151866, + "learning_rate": 6.185608499678042e-07, + "logits/chosen": -0.007250976748764515, + "logits/rejected": -0.05468139797449112, + "logps/chosen": -428.6499938964844, + "logps/rejected": -465.625, + "loss": 0.0634, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3071655333042145, + "rewards/margins": 5.560156345367432, + "rewards/rejected": -5.251172065734863, + "step": 4740 + }, + { + "epoch": 1.5288484750945521, + "grad_norm": 43.294753486981215, + "learning_rate": 6.177559562137797e-07, + "logits/chosen": 0.07520751655101776, + "logits/rejected": -0.006398010067641735, + "logps/chosen": -423.20001220703125, + "logps/rejected": -429.8999938964844, + "loss": 0.0733, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05881347507238388, + "rewards/margins": 5.536328315734863, + "rewards/rejected": -5.478515625, + "step": 4750 + }, + { + "epoch": 1.5320672728735816, + "grad_norm": 21.726781714075013, + "learning_rate": 6.169510624597552e-07, + "logits/chosen": 0.08274231106042862, + "logits/rejected": 0.02755126915872097, + "logps/chosen": -434.9750061035156, + "logps/rejected": -481.95001220703125, + "loss": 0.0743, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.23409423232078552, + "rewards/margins": 5.607421875, + "rewards/rejected": -5.844922065734863, + "step": 4760 + }, + { + "epoch": 1.5352860706526112, + "grad_norm": 12.54450510154017, + "learning_rate": 6.161461687057308e-07, + "logits/chosen": 0.042351532727479935, + "logits/rejected": -0.09636230766773224, + "logps/chosen": -414.67498779296875, + "logps/rejected": -415.8500061035156, + "loss": 0.0763, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.09592285007238388, + "rewards/margins": 5.310742378234863, + "rewards/rejected": -5.404687404632568, + "step": 4770 + }, + { + "epoch": 1.538504868431641, + "grad_norm": 73.86048378896736, + "learning_rate": 6.153412749517064e-07, + "logits/chosen": -0.05509033054113388, + "logits/rejected": -0.1612548828125, + "logps/chosen": -408.54376220703125, + "logps/rejected": -385.45001220703125, + "loss": 0.0872, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.14168700575828552, + "rewards/margins": 5.122265815734863, + "rewards/rejected": -4.980078220367432, + "step": 4780 + }, + { + "epoch": 1.5417236662106704, + "grad_norm": 7.859601451933072, + "learning_rate": 6.145363811976819e-07, + "logits/chosen": -0.05615234375, + "logits/rejected": -0.16723021864891052, + "logps/chosen": -376.3999938964844, + "logps/rejected": -418.375, + "loss": 0.0952, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.17527465522289276, + "rewards/margins": 5.11328125, + "rewards/rejected": -5.292675971984863, + "step": 4790 + }, + { + "epoch": 1.5449424639896998, + "grad_norm": 35.41529288018435, + "learning_rate": 6.137314874436575e-07, + "logits/chosen": 0.22768554091453552, + "logits/rejected": 0.09839477390050888, + "logps/chosen": -413.375, + "logps/rejected": -416.2124938964844, + "loss": 0.1317, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.0401611328125, + "rewards/margins": 5.276562690734863, + "rewards/rejected": -5.3125, + "step": 4800 + }, + { + "epoch": 1.5481612617687293, + "grad_norm": 56.8753685198255, + "learning_rate": 6.129265936896329e-07, + "logits/chosen": -0.18248291313648224, + "logits/rejected": -0.24096068739891052, + "logps/chosen": -384.2250061035156, + "logps/rejected": -403.29998779296875, + "loss": 0.1107, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2791503965854645, + "rewards/margins": 5.181640625, + "rewards/rejected": -5.458203315734863, + "step": 4810 + }, + { + "epoch": 1.551380059547759, + "grad_norm": 14.06392604883331, + "learning_rate": 6.121216999356084e-07, + "logits/chosen": -0.07631683349609375, + "logits/rejected": -0.17962646484375, + "logps/chosen": -425.3125, + "logps/rejected": -421.2250061035156, + "loss": 0.1206, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.7420898675918579, + "rewards/margins": 5.632031440734863, + "rewards/rejected": -6.37109375, + "step": 4820 + }, + { + "epoch": 1.5545988573267886, + "grad_norm": 69.92058358729543, + "learning_rate": 6.11316806181584e-07, + "logits/chosen": -0.09947510063648224, + "logits/rejected": -0.20902100205421448, + "logps/chosen": -390.2250061035156, + "logps/rejected": -414.04998779296875, + "loss": 0.111, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.4158569276332855, + "rewards/margins": 5.423437595367432, + "rewards/rejected": -5.842187404632568, + "step": 4830 + }, + { + "epoch": 1.557817655105818, + "grad_norm": 24.218546847149756, + "learning_rate": 6.105119124275595e-07, + "logits/chosen": 0.005969238467514515, + "logits/rejected": 0.00971832312643528, + "logps/chosen": -462.5, + "logps/rejected": -481.6000061035156, + "loss": 0.0556, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.21365967392921448, + "rewards/margins": 6.123827934265137, + "rewards/rejected": -6.3359375, + "step": 4840 + }, + { + "epoch": 1.5610364528848475, + "grad_norm": 30.64009779265794, + "learning_rate": 6.09707018673535e-07, + "logits/chosen": -0.033843994140625, + "logits/rejected": -0.05351562425494194, + "logps/chosen": -372.2250061035156, + "logps/rejected": -397.5249938964844, + "loss": 0.0842, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.0662841796875, + "rewards/margins": 5.193359375, + "rewards/rejected": -5.258008003234863, + "step": 4850 + }, + { + "epoch": 1.564255250663877, + "grad_norm": 51.84683603703308, + "learning_rate": 6.089021249195107e-07, + "logits/chosen": 0.04569091647863388, + "logits/rejected": -0.06856689602136612, + "logps/chosen": -397.9750061035156, + "logps/rejected": -403.95001220703125, + "loss": 0.0892, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.43433839082717896, + "rewards/margins": 4.984765529632568, + "rewards/rejected": -5.416406154632568, + "step": 4860 + }, + { + "epoch": 1.5674740484429066, + "grad_norm": 8.269810923997715, + "learning_rate": 6.080972311654862e-07, + "logits/chosen": -0.015216064639389515, + "logits/rejected": -0.21654053032398224, + "logps/chosen": -430.125, + "logps/rejected": -402.1499938964844, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30695801973342896, + "rewards/margins": 5.457812309265137, + "rewards/rejected": -5.766015529632568, + "step": 4870 + }, + { + "epoch": 1.570692846221936, + "grad_norm": 34.389130246796135, + "learning_rate": 6.072923374114616e-07, + "logits/chosen": -0.29664307832717896, + "logits/rejected": -0.3540588319301605, + "logps/chosen": -410.375, + "logps/rejected": -419.4750061035156, + "loss": 0.1123, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.599963366985321, + "rewards/margins": 5.297265529632568, + "rewards/rejected": -5.891797065734863, + "step": 4880 + }, + { + "epoch": 1.5739116440009657, + "grad_norm": 85.19903848081404, + "learning_rate": 6.064874436574372e-07, + "logits/chosen": -0.1480758637189865, + "logits/rejected": -0.23078003525733948, + "logps/chosen": -392.25, + "logps/rejected": -426.7250061035156, + "loss": 0.1193, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.573254406452179, + "rewards/margins": 5.186718940734863, + "rewards/rejected": -5.762499809265137, + "step": 4890 + }, + { + "epoch": 1.5771304417799952, + "grad_norm": 47.18776487853506, + "learning_rate": 6.056825499034127e-07, + "logits/chosen": -0.02877197228372097, + "logits/rejected": -0.05253295972943306, + "logps/chosen": -401.92498779296875, + "logps/rejected": -458.13751220703125, + "loss": 0.1159, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.16820068657398224, + "rewards/margins": 5.2021484375, + "rewards/rejected": -5.373827934265137, + "step": 4900 + }, + { + "epoch": 1.5803492395590246, + "grad_norm": 36.57027607622411, + "learning_rate": 6.048776561493882e-07, + "logits/chosen": -0.02325134351849556, + "logits/rejected": -0.05443115159869194, + "logps/chosen": -359.6499938964844, + "logps/rejected": -380.92498779296875, + "loss": 0.0989, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.16103515028953552, + "rewards/margins": 5.282031059265137, + "rewards/rejected": -5.125390529632568, + "step": 4910 + }, + { + "epoch": 1.5835680373380543, + "grad_norm": 55.94001657595937, + "learning_rate": 6.040727623953638e-07, + "logits/chosen": 0.07227172702550888, + "logits/rejected": -0.13942870497703552, + "logps/chosen": -401.45001220703125, + "logps/rejected": -400.45001220703125, + "loss": 0.0736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.3301452696323395, + "rewards/margins": 4.98828125, + "rewards/rejected": -4.660546779632568, + "step": 4920 + }, + { + "epoch": 1.5867868351170837, + "grad_norm": 64.8019152070003, + "learning_rate": 6.032678686413393e-07, + "logits/chosen": 0.05472411960363388, + "logits/rejected": -0.048248291015625, + "logps/chosen": -398.36248779296875, + "logps/rejected": -382.36248779296875, + "loss": 0.0994, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.578277587890625, + "rewards/margins": 4.9619140625, + "rewards/rejected": -4.3857421875, + "step": 4930 + }, + { + "epoch": 1.5900056328961134, + "grad_norm": 17.36076818487377, + "learning_rate": 6.024629748873149e-07, + "logits/chosen": 0.05193481594324112, + "logits/rejected": -0.07624969631433487, + "logps/chosen": -369.57501220703125, + "logps/rejected": -380.5249938964844, + "loss": 0.0887, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.166748046875, + "rewards/margins": 5.166015625, + "rewards/rejected": -5.004687309265137, + "step": 4940 + }, + { + "epoch": 1.5932244306751429, + "grad_norm": 91.90193482762164, + "learning_rate": 6.016580811332905e-07, + "logits/chosen": -0.11846618354320526, + "logits/rejected": -0.1707107573747635, + "logps/chosen": -344.3125, + "logps/rejected": -375.1499938964844, + "loss": 0.1097, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.02230224572122097, + "rewards/margins": 4.669140815734863, + "rewards/rejected": -4.644921779632568, + "step": 4950 + }, + { + "epoch": 1.5964432284541723, + "grad_norm": 47.34686516768612, + "learning_rate": 6.008531873792659e-07, + "logits/chosen": 0.09849853813648224, + "logits/rejected": 0.02328796312212944, + "logps/chosen": -422.6499938964844, + "logps/rejected": -423.8374938964844, + "loss": 0.0973, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.01503906212747097, + "rewards/margins": 5.090624809265137, + "rewards/rejected": -5.108202934265137, + "step": 4960 + }, + { + "epoch": 1.5996620262332018, + "grad_norm": 13.218140436658523, + "learning_rate": 6.000482936252414e-07, + "logits/chosen": 0.19055786728858948, + "logits/rejected": 0.03245239332318306, + "logps/chosen": -389.7749938964844, + "logps/rejected": -378.0249938964844, + "loss": 0.1192, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.819775402545929, + "rewards/margins": 5.070605278015137, + "rewards/rejected": -4.249609470367432, + "step": 4970 + }, + { + "epoch": 1.6028808240122314, + "grad_norm": 37.922178772761484, + "learning_rate": 5.99243399871217e-07, + "logits/chosen": 0.10764770209789276, + "logits/rejected": 0.02503051795065403, + "logps/chosen": -426.9750061035156, + "logps/rejected": -448.45001220703125, + "loss": 0.0884, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.38554078340530396, + "rewards/margins": 5.776953220367432, + "rewards/rejected": -5.397070407867432, + "step": 4980 + }, + { + "epoch": 1.606099621791261, + "grad_norm": 19.289826977746493, + "learning_rate": 5.984385061171925e-07, + "logits/chosen": 0.13674315810203552, + "logits/rejected": -0.01171264611184597, + "logps/chosen": -398.4750061035156, + "logps/rejected": -415.4375, + "loss": 0.0806, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.39018553495407104, + "rewards/margins": 5.153906345367432, + "rewards/rejected": -4.764062404632568, + "step": 4990 + }, + { + "epoch": 1.6093184195702905, + "grad_norm": 21.54451627158186, + "learning_rate": 5.97633612363168e-07, + "logits/chosen": 0.056243896484375, + "logits/rejected": -0.10329589992761612, + "logps/chosen": -426.98748779296875, + "logps/rejected": -422.6499938964844, + "loss": 0.0898, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4134765565395355, + "rewards/margins": 5.3916015625, + "rewards/rejected": -4.977246284484863, + "step": 5000 + }, + { + "epoch": 1.61253721734932, + "grad_norm": 13.88258847397583, + "learning_rate": 5.968287186091436e-07, + "logits/chosen": 0.03161315992474556, + "logits/rejected": -0.12070770561695099, + "logps/chosen": -413.67498779296875, + "logps/rejected": -429.82501220703125, + "loss": 0.0631, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.2560668885707855, + "rewards/margins": 5.227734565734863, + "rewards/rejected": -5.485937595367432, + "step": 5010 + }, + { + "epoch": 1.6157560151283494, + "grad_norm": 28.019955799633816, + "learning_rate": 5.960238248551192e-07, + "logits/chosen": 0.05401306226849556, + "logits/rejected": -0.08273772895336151, + "logps/chosen": -457.9750061035156, + "logps/rejected": -446.0625, + "loss": 0.0924, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.22094115614891052, + "rewards/margins": 5.399804592132568, + "rewards/rejected": -5.180761814117432, + "step": 5020 + }, + { + "epoch": 1.618974812907379, + "grad_norm": 53.91718361111996, + "learning_rate": 5.952189311010946e-07, + "logits/chosen": 0.14776611328125, + "logits/rejected": -0.14444580674171448, + "logps/chosen": -449.45001220703125, + "logps/rejected": -410.5, + "loss": 0.082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.14873656630516052, + "rewards/margins": 5.235156059265137, + "rewards/rejected": -5.089062690734863, + "step": 5030 + }, + { + "epoch": 1.6221936106864088, + "grad_norm": 63.80784189329625, + "learning_rate": 5.944140373470702e-07, + "logits/chosen": 0.13528136909008026, + "logits/rejected": 0.10750122368335724, + "logps/chosen": -366.0874938964844, + "logps/rejected": -391.8500061035156, + "loss": 0.1891, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": 0.23789063096046448, + "rewards/margins": 4.5751953125, + "rewards/rejected": -4.338476657867432, + "step": 5040 + }, + { + "epoch": 1.6254124084654382, + "grad_norm": 28.81754729279938, + "learning_rate": 5.936091435930457e-07, + "logits/chosen": 0.2219085693359375, + "logits/rejected": 0.06989135593175888, + "logps/chosen": -405.38751220703125, + "logps/rejected": -380.9750061035156, + "loss": 0.0976, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.939465343952179, + "rewards/margins": 4.745703220367432, + "rewards/rejected": -3.8046875, + "step": 5050 + }, + { + "epoch": 1.6286312062444677, + "grad_norm": 74.8827657504156, + "learning_rate": 5.928042498390212e-07, + "logits/chosen": 0.16120299696922302, + "logits/rejected": 0.03300781175494194, + "logps/chosen": -413.3999938964844, + "logps/rejected": -426.2250061035156, + "loss": 0.111, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.131750464439392, + "rewards/margins": 4.901562690734863, + "rewards/rejected": -3.7669920921325684, + "step": 5060 + }, + { + "epoch": 1.6318500040234971, + "grad_norm": 32.48728936846885, + "learning_rate": 5.919993560849967e-07, + "logits/chosen": 0.07919921725988388, + "logits/rejected": 0.08055724948644638, + "logps/chosen": -393.3999938964844, + "logps/rejected": -413.9125061035156, + "loss": 0.0673, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.52215576171875, + "rewards/margins": 4.870312690734863, + "rewards/rejected": -4.350390434265137, + "step": 5070 + }, + { + "epoch": 1.6350688018025268, + "grad_norm": 29.291147901829923, + "learning_rate": 5.911944623309723e-07, + "logits/chosen": -0.001953125, + "logits/rejected": -0.03779907152056694, + "logps/chosen": -378.5249938964844, + "logps/rejected": -384.57501220703125, + "loss": 0.0765, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.29779052734375, + "rewards/margins": 4.876562595367432, + "rewards/rejected": -4.576757907867432, + "step": 5080 + }, + { + "epoch": 1.6382875995815565, + "grad_norm": 34.408939994839805, + "learning_rate": 5.903895685769478e-07, + "logits/chosen": -0.03223877027630806, + "logits/rejected": -0.160552978515625, + "logps/chosen": -405.2749938964844, + "logps/rejected": -410.2749938964844, + "loss": 0.1167, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.02466430701315403, + "rewards/margins": 5.077734470367432, + "rewards/rejected": -5.103906154632568, + "step": 5090 + }, + { + "epoch": 1.641506397360586, + "grad_norm": 74.14767101805762, + "learning_rate": 5.895846748229233e-07, + "logits/chosen": -0.01896972581744194, + "logits/rejected": -0.16153565049171448, + "logps/chosen": -424.23748779296875, + "logps/rejected": -428.75, + "loss": 0.1083, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06845702975988388, + "rewards/margins": 5.24072265625, + "rewards/rejected": -5.308203220367432, + "step": 5100 + }, + { + "epoch": 1.6447251951396153, + "grad_norm": 15.317335842193545, + "learning_rate": 5.887797810688989e-07, + "logits/chosen": -0.21103516221046448, + "logits/rejected": -0.26556396484375, + "logps/chosen": -430.375, + "logps/rejected": -441.9750061035156, + "loss": 0.0767, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.27903443574905396, + "rewards/margins": 5.610156059265137, + "rewards/rejected": -5.890625, + "step": 5110 + }, + { + "epoch": 1.6479439929186448, + "grad_norm": 38.95128241185428, + "learning_rate": 5.879748873148744e-07, + "logits/chosen": -0.09046630561351776, + "logits/rejected": -0.15576171875, + "logps/chosen": -435.2749938964844, + "logps/rejected": -444.7250061035156, + "loss": 0.0988, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.07082519680261612, + "rewards/margins": 5.3076171875, + "rewards/rejected": -5.376953125, + "step": 5120 + }, + { + "epoch": 1.6511627906976745, + "grad_norm": 9.119135370688992, + "learning_rate": 5.871699935608499e-07, + "logits/chosen": -0.06135864183306694, + "logits/rejected": -0.17351074516773224, + "logps/chosen": -359.54998779296875, + "logps/rejected": -384.625, + "loss": 0.0838, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.2835449278354645, + "rewards/margins": 5.092382907867432, + "rewards/rejected": -5.379492282867432, + "step": 5130 + }, + { + "epoch": 1.654381588476704, + "grad_norm": 10.027518053023979, + "learning_rate": 5.863650998068255e-07, + "logits/chosen": 0.06485595554113388, + "logits/rejected": -0.09185180813074112, + "logps/chosen": -385.17498779296875, + "logps/rejected": -367.54998779296875, + "loss": 0.14, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08388672024011612, + "rewards/margins": 4.881249904632568, + "rewards/rejected": -4.963281154632568, + "step": 5140 + }, + { + "epoch": 1.6576003862557336, + "grad_norm": 57.76042243162388, + "learning_rate": 5.85560206052801e-07, + "logits/chosen": -0.02582397498190403, + "logits/rejected": -0.08989258110523224, + "logps/chosen": -414.07501220703125, + "logps/rejected": -441.375, + "loss": 0.0852, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.10450439155101776, + "rewards/margins": 5.267187595367432, + "rewards/rejected": -5.372265815734863, + "step": 5150 + }, + { + "epoch": 1.660819184034763, + "grad_norm": 37.24766096434797, + "learning_rate": 5.847553122987765e-07, + "logits/chosen": -0.09901733696460724, + "logits/rejected": -0.2057037353515625, + "logps/chosen": -433.7250061035156, + "logps/rejected": -460.0, + "loss": 0.0664, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.05523681640625, + "rewards/margins": 5.773828029632568, + "rewards/rejected": -5.827734470367432, + "step": 5160 + }, + { + "epoch": 1.6640379818137925, + "grad_norm": 23.88625407899546, + "learning_rate": 5.839504185447521e-07, + "logits/chosen": 0.12502136826515198, + "logits/rejected": -0.05021972581744194, + "logps/chosen": -408.75, + "logps/rejected": -382.17498779296875, + "loss": 0.0913, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.016274262219667435, + "rewards/margins": 5.018164157867432, + "rewards/rejected": -4.999609470367432, + "step": 5170 + }, + { + "epoch": 1.667256779592822, + "grad_norm": 29.703397257968735, + "learning_rate": 5.831455247907276e-07, + "logits/chosen": -0.013378906063735485, + "logits/rejected": -0.13432617485523224, + "logps/chosen": -415.11248779296875, + "logps/rejected": -416.1499938964844, + "loss": 0.0881, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1396484375, + "rewards/margins": 5.148633003234863, + "rewards/rejected": -5.287304878234863, + "step": 5180 + }, + { + "epoch": 1.6704755773718516, + "grad_norm": 54.56183346371394, + "learning_rate": 5.823406310367031e-07, + "logits/chosen": 0.05142822116613388, + "logits/rejected": -0.08287353813648224, + "logps/chosen": -431.54998779296875, + "logps/rejected": -437.20001220703125, + "loss": 0.0752, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.09383545070886612, + "rewards/margins": 5.37890625, + "rewards/rejected": -5.475390434265137, + "step": 5190 + }, + { + "epoch": 1.6736943751508813, + "grad_norm": 11.615297966032365, + "learning_rate": 5.815357372826787e-07, + "logits/chosen": -0.0016845703357830644, + "logits/rejected": -0.02978515625, + "logps/chosen": -416.54998779296875, + "logps/rejected": -454.8999938964844, + "loss": 0.0562, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.212646484375, + "rewards/margins": 5.382421970367432, + "rewards/rejected": -5.167187690734863, + "step": 5200 + }, + { + "epoch": 1.6769131729299107, + "grad_norm": 53.27849184990999, + "learning_rate": 5.807308435286542e-07, + "logits/chosen": 0.0215301513671875, + "logits/rejected": -0.02957305870950222, + "logps/chosen": -384.2250061035156, + "logps/rejected": -409.45001220703125, + "loss": 0.1013, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.12636718153953552, + "rewards/margins": 5.021093845367432, + "rewards/rejected": -4.897265434265137, + "step": 5210 + }, + { + "epoch": 1.6801319707089402, + "grad_norm": 28.162194518672113, + "learning_rate": 5.799259497746297e-07, + "logits/chosen": 0.03743324428796768, + "logits/rejected": -0.20595093071460724, + "logps/chosen": -429.9750061035156, + "logps/rejected": -397.8999938964844, + "loss": 0.0632, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.20622558891773224, + "rewards/margins": 5.214453220367432, + "rewards/rejected": -5.422656059265137, + "step": 5220 + }, + { + "epoch": 1.6833507684879696, + "grad_norm": 56.47503880191933, + "learning_rate": 5.791210560206053e-07, + "logits/chosen": -0.06904296576976776, + "logits/rejected": -0.14968261122703552, + "logps/chosen": -397.5, + "logps/rejected": -417.125, + "loss": 0.093, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.16915282607078552, + "rewards/margins": 5.411328315734863, + "rewards/rejected": -5.581250190734863, + "step": 5230 + }, + { + "epoch": 1.6865695662669993, + "grad_norm": 11.21426443550689, + "learning_rate": 5.783161622665808e-07, + "logits/chosen": 0.07001342624425888, + "logits/rejected": -0.16109009087085724, + "logps/chosen": -422.42498779296875, + "logps/rejected": -403.7250061035156, + "loss": 0.078, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.3660827577114105, + "rewards/margins": 5.62109375, + "rewards/rejected": -5.986328125, + "step": 5240 + }, + { + "epoch": 1.689788364046029, + "grad_norm": 35.914289531940945, + "learning_rate": 5.775112685125562e-07, + "logits/chosen": -0.07921753078699112, + "logits/rejected": -0.08941650390625, + "logps/chosen": -385.8187561035156, + "logps/rejected": -393.20001220703125, + "loss": 0.1283, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.54150390625, + "rewards/margins": 4.792578220367432, + "rewards/rejected": -5.33203125, + "step": 5250 + }, + { + "epoch": 1.6930071618250584, + "grad_norm": 34.25286904538571, + "learning_rate": 5.767063747585319e-07, + "logits/chosen": 0.08542480319738388, + "logits/rejected": -0.03588562086224556, + "logps/chosen": -455.25, + "logps/rejected": -460.92498779296875, + "loss": 0.0628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.12021484225988388, + "rewards/margins": 6.033593654632568, + "rewards/rejected": -5.911328315734863, + "step": 5260 + }, + { + "epoch": 1.6962259596040878, + "grad_norm": 34.75112665969798, + "learning_rate": 5.759014810045074e-07, + "logits/chosen": -0.06679077446460724, + "logits/rejected": -0.1524505615234375, + "logps/chosen": -401.0, + "logps/rejected": -413.29998779296875, + "loss": 0.1122, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.4183105528354645, + "rewards/margins": 5.403515815734863, + "rewards/rejected": -5.821093559265137, + "step": 5270 + }, + { + "epoch": 1.6994447573831173, + "grad_norm": 20.307856436237355, + "learning_rate": 5.750965872504829e-07, + "logits/chosen": 0.23883056640625, + "logits/rejected": 0.16657105088233948, + "logps/chosen": -413.3500061035156, + "logps/rejected": -413.7250061035156, + "loss": 0.0799, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.161865234375, + "rewards/margins": 5.234375, + "rewards/rejected": -5.074999809265137, + "step": 5280 + }, + { + "epoch": 1.702663555162147, + "grad_norm": 48.77221195880645, + "learning_rate": 5.742916934964585e-07, + "logits/chosen": 0.282156378030777, + "logits/rejected": 0.15446510910987854, + "logps/chosen": -403.86248779296875, + "logps/rejected": -400.82501220703125, + "loss": 0.0952, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.23720702528953552, + "rewards/margins": 5.20703125, + "rewards/rejected": -4.971875190734863, + "step": 5290 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 23.219377004318186, + "learning_rate": 5.73486799742434e-07, + "logits/chosen": -0.02510986290872097, + "logits/rejected": -0.05061645433306694, + "logps/chosen": -378.125, + "logps/rejected": -398.26251220703125, + "loss": 0.0911, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.2743896543979645, + "rewards/margins": 4.848828315734863, + "rewards/rejected": -5.123827934265137, + "step": 5300 + }, + { + "epoch": 1.709101150720206, + "grad_norm": 54.273466239975534, + "learning_rate": 5.726819059884095e-07, + "logits/chosen": 0.0029724121559411287, + "logits/rejected": -0.12654419243335724, + "logps/chosen": -397.875, + "logps/rejected": -373.07501220703125, + "loss": 0.1076, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.13458251953125, + "rewards/margins": 5.096484184265137, + "rewards/rejected": -5.228906154632568, + "step": 5310 + }, + { + "epoch": 1.7123199484992355, + "grad_norm": 32.48864327580565, + "learning_rate": 5.71877012234385e-07, + "logits/chosen": 0.12347106635570526, + "logits/rejected": -0.03631286695599556, + "logps/chosen": -405.5, + "logps/rejected": -413.04998779296875, + "loss": 0.0963, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.34315794706344604, + "rewards/margins": 5.3125, + "rewards/rejected": -5.655859470367432, + "step": 5320 + }, + { + "epoch": 1.715538746278265, + "grad_norm": 33.70307551183803, + "learning_rate": 5.710721184803605e-07, + "logits/chosen": 0.02800598181784153, + "logits/rejected": -0.02991943433880806, + "logps/chosen": -422.17498779296875, + "logps/rejected": -421.6499938964844, + "loss": 0.0947, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19608154892921448, + "rewards/margins": 5.195703029632568, + "rewards/rejected": -5.391406059265137, + "step": 5330 + }, + { + "epoch": 1.7187575440572946, + "grad_norm": 51.2152949648931, + "learning_rate": 5.702672247263361e-07, + "logits/chosen": 0.01667480543255806, + "logits/rejected": -0.11459960788488388, + "logps/chosen": -436.9375, + "logps/rejected": -463.0375061035156, + "loss": 0.1073, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.15732422471046448, + "rewards/margins": 5.320361137390137, + "rewards/rejected": -5.475781440734863, + "step": 5340 + }, + { + "epoch": 1.721976341836324, + "grad_norm": 25.171877082882357, + "learning_rate": 5.694623309723117e-07, + "logits/chosen": 0.05442504957318306, + "logits/rejected": -0.14942017197608948, + "logps/chosen": -428.125, + "logps/rejected": -434.92498779296875, + "loss": 0.1049, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.22333984076976776, + "rewards/margins": 5.390625, + "rewards/rejected": -5.614453315734863, + "step": 5350 + }, + { + "epoch": 1.7251951396153538, + "grad_norm": 28.243512681026143, + "learning_rate": 5.686574372182872e-07, + "logits/chosen": -0.05059509351849556, + "logits/rejected": -0.15891113877296448, + "logps/chosen": -407.07501220703125, + "logps/rejected": -402.3500061035156, + "loss": 0.1119, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5645385980606079, + "rewards/margins": 5.107421875, + "rewards/rejected": -5.673437595367432, + "step": 5360 + }, + { + "epoch": 1.7284139373943832, + "grad_norm": 30.193702232169453, + "learning_rate": 5.678525434642627e-07, + "logits/chosen": -0.21523436903953552, + "logits/rejected": -0.17093506455421448, + "logps/chosen": -409.57501220703125, + "logps/rejected": -442.375, + "loss": 0.1133, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.279296875, + "rewards/margins": 5.2578125, + "rewards/rejected": -6.538281440734863, + "step": 5370 + }, + { + "epoch": 1.7316327351734127, + "grad_norm": 21.754222293576507, + "learning_rate": 5.670476497102382e-07, + "logits/chosen": -0.09686279296875, + "logits/rejected": -0.24172362685203552, + "logps/chosen": -420.29998779296875, + "logps/rejected": -403.0249938964844, + "loss": 0.1173, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.543896496295929, + "rewards/margins": 5.177148342132568, + "rewards/rejected": -5.72265625, + "step": 5380 + }, + { + "epoch": 1.734851532952442, + "grad_norm": 28.263455328672478, + "learning_rate": 5.662427559562138e-07, + "logits/chosen": -0.03082275390625, + "logits/rejected": -0.11839599907398224, + "logps/chosen": -387.1499938964844, + "logps/rejected": -437.8500061035156, + "loss": 0.0633, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.24897460639476776, + "rewards/margins": 5.221875190734863, + "rewards/rejected": -5.467187404632568, + "step": 5390 + }, + { + "epoch": 1.7380703307314718, + "grad_norm": 36.31276728200759, + "learning_rate": 5.654378622021892e-07, + "logits/chosen": 0.227813720703125, + "logits/rejected": 0.09493408352136612, + "logps/chosen": -436.25, + "logps/rejected": -405.4750061035156, + "loss": 0.1027, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.6228393316268921, + "rewards/margins": 4.763671875, + "rewards/rejected": -5.385937690734863, + "step": 5400 + }, + { + "epoch": 1.7412891285105014, + "grad_norm": 38.30078430195414, + "learning_rate": 5.646329684481647e-07, + "logits/chosen": -0.11586914211511612, + "logits/rejected": -0.11640014499425888, + "logps/chosen": -453.04998779296875, + "logps/rejected": -450.625, + "loss": 0.0775, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.309539794921875, + "rewards/margins": 5.762890815734863, + "rewards/rejected": -6.070703029632568, + "step": 5410 + }, + { + "epoch": 1.7445079262895309, + "grad_norm": 67.30996014382626, + "learning_rate": 5.638280746941404e-07, + "logits/chosen": -0.06264648586511612, + "logits/rejected": -0.19999389350414276, + "logps/chosen": -406.54998779296875, + "logps/rejected": -432.3500061035156, + "loss": 0.0804, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.4731079041957855, + "rewards/margins": 5.482421875, + "rewards/rejected": -5.954297065734863, + "step": 5420 + }, + { + "epoch": 1.7477267240685603, + "grad_norm": 19.686779149421522, + "learning_rate": 5.630231809401159e-07, + "logits/chosen": 0.03400878980755806, + "logits/rejected": -0.11299743503332138, + "logps/chosen": -424.9125061035156, + "logps/rejected": -443.3500061035156, + "loss": 0.0914, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.11964111030101776, + "rewards/margins": 5.209374904632568, + "rewards/rejected": -5.330078125, + "step": 5430 + }, + { + "epoch": 1.7509455218475898, + "grad_norm": 16.439323075346653, + "learning_rate": 5.622182871860914e-07, + "logits/chosen": -0.0742034912109375, + "logits/rejected": -0.12915953993797302, + "logps/chosen": -392.42498779296875, + "logps/rejected": -431.125, + "loss": 0.0919, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.3968444764614105, + "rewards/margins": 5.107812404632568, + "rewards/rejected": -5.508203029632568, + "step": 5440 + }, + { + "epoch": 1.7541643196266195, + "grad_norm": 37.89222234533041, + "learning_rate": 5.61413393432067e-07, + "logits/chosen": -0.00217437744140625, + "logits/rejected": -0.15357360243797302, + "logps/chosen": -404.54998779296875, + "logps/rejected": -385.3999938964844, + "loss": 0.0863, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.10839233547449112, + "rewards/margins": 4.886328220367432, + "rewards/rejected": -4.99609375, + "step": 5450 + }, + { + "epoch": 1.7573831174056491, + "grad_norm": 15.408853250833733, + "learning_rate": 5.606084996780425e-07, + "logits/chosen": 0.03195800632238388, + "logits/rejected": 0.01547851599752903, + "logps/chosen": -400.0, + "logps/rejected": -414.73748779296875, + "loss": 0.1325, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.09441528469324112, + "rewards/margins": 4.919043064117432, + "rewards/rejected": -5.010937690734863, + "step": 5460 + }, + { + "epoch": 1.7606019151846786, + "grad_norm": 13.377957343453204, + "learning_rate": 5.598036059240179e-07, + "logits/chosen": 0.13852843642234802, + "logits/rejected": -0.02703857421875, + "logps/chosen": -429.32501220703125, + "logps/rejected": -408.375, + "loss": 0.0823, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.11385498195886612, + "rewards/margins": 4.93359375, + "rewards/rejected": -4.818359375, + "step": 5470 + }, + { + "epoch": 1.763820712963708, + "grad_norm": 10.715572663544982, + "learning_rate": 5.589987121699935e-07, + "logits/chosen": -0.03743133693933487, + "logits/rejected": -0.2098388671875, + "logps/chosen": -408.0874938964844, + "logps/rejected": -416.625, + "loss": 0.108, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2562927305698395, + "rewards/margins": 5.341992378234863, + "rewards/rejected": -5.084374904632568, + "step": 5480 + }, + { + "epoch": 1.7670395107427375, + "grad_norm": 76.75092866864732, + "learning_rate": 5.58193818415969e-07, + "logits/chosen": 0.05698242038488388, + "logits/rejected": -0.06632690131664276, + "logps/chosen": -388.375, + "logps/rejected": -391.54998779296875, + "loss": 0.0847, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.12616272270679474, + "rewards/margins": 5.244921684265137, + "rewards/rejected": -5.11328125, + "step": 5490 + }, + { + "epoch": 1.7702583085217671, + "grad_norm": 51.46229245347473, + "learning_rate": 5.573889246619446e-07, + "logits/chosen": 0.01715698279440403, + "logits/rejected": -0.1409912109375, + "logps/chosen": -388.0249938964844, + "logps/rejected": -385.32501220703125, + "loss": 0.0711, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.27717286348342896, + "rewards/margins": 4.859765529632568, + "rewards/rejected": -5.139062404632568, + "step": 5500 + }, + { + "epoch": 1.7734771063007968, + "grad_norm": 57.8944939835135, + "learning_rate": 5.565840309079202e-07, + "logits/chosen": -0.012316894717514515, + "logits/rejected": -0.22545012831687927, + "logps/chosen": -472.79998779296875, + "logps/rejected": -468.9750061035156, + "loss": 0.0698, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.15323486924171448, + "rewards/margins": 5.940625190734863, + "rewards/rejected": -6.092577934265137, + "step": 5510 + }, + { + "epoch": 1.7766959040798262, + "grad_norm": 69.68452258752046, + "learning_rate": 5.557791371538957e-07, + "logits/chosen": -0.17203369736671448, + "logits/rejected": -0.24419251084327698, + "logps/chosen": -449.92498779296875, + "logps/rejected": -461.04998779296875, + "loss": 0.0729, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.734478771686554, + "rewards/margins": 5.7431640625, + "rewards/rejected": -6.480078220367432, + "step": 5520 + }, + { + "epoch": 1.7799147018588557, + "grad_norm": 22.781046080573898, + "learning_rate": 5.549742433998712e-07, + "logits/chosen": 0.0010559081565588713, + "logits/rejected": 0.006085204891860485, + "logps/chosen": -386.42498779296875, + "logps/rejected": -431.8999938964844, + "loss": 0.0626, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.23251953721046448, + "rewards/margins": 5.787890434265137, + "rewards/rejected": -6.022656440734863, + "step": 5530 + }, + { + "epoch": 1.7831334996378851, + "grad_norm": 33.6482819417497, + "learning_rate": 5.541693496458468e-07, + "logits/chosen": -0.09620971977710724, + "logits/rejected": -0.08370361477136612, + "logps/chosen": -350.54998779296875, + "logps/rejected": -413.20001220703125, + "loss": 0.0928, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.14736327528953552, + "rewards/margins": 5.485937595367432, + "rewards/rejected": -5.632421970367432, + "step": 5540 + }, + { + "epoch": 1.7863522974169148, + "grad_norm": 24.492126869197786, + "learning_rate": 5.533644558918222e-07, + "logits/chosen": 0.01373291015625, + "logits/rejected": -0.07628784328699112, + "logps/chosen": -396.57501220703125, + "logps/rejected": -428.7250061035156, + "loss": 0.09, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.05596923828125, + "rewards/margins": 5.3828125, + "rewards/rejected": -5.432714939117432, + "step": 5550 + }, + { + "epoch": 1.7895710951959443, + "grad_norm": 30.128350890139128, + "learning_rate": 5.525595621377977e-07, + "logits/chosen": -0.029984284192323685, + "logits/rejected": -0.07957153022289276, + "logps/chosen": -383.1000061035156, + "logps/rejected": -397.82501220703125, + "loss": 0.081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.1512451171875, + "rewards/margins": 5.18359375, + "rewards/rejected": -5.037890434265137, + "step": 5560 + }, + { + "epoch": 1.792789892974974, + "grad_norm": 29.174230233516685, + "learning_rate": 5.517546683837733e-07, + "logits/chosen": -0.061737060546875, + "logits/rejected": -0.12739257514476776, + "logps/chosen": -451.54998779296875, + "logps/rejected": -453.70001220703125, + "loss": 0.0682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.0381927490234375, + "rewards/margins": 5.263281345367432, + "rewards/rejected": -5.228125095367432, + "step": 5570 + }, + { + "epoch": 1.7960086907540034, + "grad_norm": 10.677484584564025, + "learning_rate": 5.509497746297489e-07, + "logits/chosen": -0.02377929724752903, + "logits/rejected": -0.01401672326028347, + "logps/chosen": -423.8999938964844, + "logps/rejected": -449.3999938964844, + "loss": 0.0616, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.02583007887005806, + "rewards/margins": 5.329687595367432, + "rewards/rejected": -5.354296684265137, + "step": 5580 + }, + { + "epoch": 1.7992274885330328, + "grad_norm": 51.0032718258054, + "learning_rate": 5.501448808757244e-07, + "logits/chosen": 0.0076538086868822575, + "logits/rejected": -0.03267822414636612, + "logps/chosen": -417.82501220703125, + "logps/rejected": -410.3500061035156, + "loss": 0.0925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.12806396186351776, + "rewards/margins": 5.360937595367432, + "rewards/rejected": -5.232421875, + "step": 5590 + }, + { + "epoch": 1.8024462863120625, + "grad_norm": 56.7006456233599, + "learning_rate": 5.493399871217e-07, + "logits/chosen": 0.03216552734375, + "logits/rejected": -0.1558837890625, + "logps/chosen": -422.79998779296875, + "logps/rejected": -433.92498779296875, + "loss": 0.0917, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.08355712890625, + "rewards/margins": 5.501953125, + "rewards/rejected": -5.4189453125, + "step": 5600 + }, + { + "epoch": 1.805665084091092, + "grad_norm": 31.93536900367813, + "learning_rate": 5.485350933676755e-07, + "logits/chosen": 0.09628906100988388, + "logits/rejected": -0.01728515699505806, + "logps/chosen": -465.1000061035156, + "logps/rejected": -463.79998779296875, + "loss": 0.1243, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.13656005263328552, + "rewards/margins": 5.513281345367432, + "rewards/rejected": -5.378515720367432, + "step": 5610 + }, + { + "epoch": 1.8088838818701216, + "grad_norm": 23.435236311226774, + "learning_rate": 5.477301996136509e-07, + "logits/chosen": -0.12521973252296448, + "logits/rejected": -0.24482421576976776, + "logps/chosen": -428.07501220703125, + "logps/rejected": -421.5249938964844, + "loss": 0.0693, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06112975999712944, + "rewards/margins": 5.358984470367432, + "rewards/rejected": -5.294140815734863, + "step": 5620 + }, + { + "epoch": 1.812102679649151, + "grad_norm": 13.405548087655784, + "learning_rate": 5.469253058596265e-07, + "logits/chosen": -0.07551880180835724, + "logits/rejected": -0.18749085068702698, + "logps/chosen": -392.07501220703125, + "logps/rejected": -398.625, + "loss": 0.062, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.01263427734375, + "rewards/margins": 5.313672065734863, + "rewards/rejected": -5.324609279632568, + "step": 5630 + }, + { + "epoch": 1.8153214774281805, + "grad_norm": 15.993321291009698, + "learning_rate": 5.46120412105602e-07, + "logits/chosen": -0.13175658881664276, + "logits/rejected": -0.3073974549770355, + "logps/chosen": -408.73748779296875, + "logps/rejected": -411.73748779296875, + "loss": 0.0942, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3757568299770355, + "rewards/margins": 5.083788871765137, + "rewards/rejected": -5.459374904632568, + "step": 5640 + }, + { + "epoch": 1.81854027520721, + "grad_norm": 17.15782477151709, + "learning_rate": 5.453155183515775e-07, + "logits/chosen": -0.07475890964269638, + "logits/rejected": -0.23723144829273224, + "logps/chosen": -368.95001220703125, + "logps/rejected": -385.0, + "loss": 0.0974, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4431396424770355, + "rewards/margins": 5.078906059265137, + "rewards/rejected": -5.521093845367432, + "step": 5650 + }, + { + "epoch": 1.8217590729862396, + "grad_norm": 31.617639066480677, + "learning_rate": 5.445106245975532e-07, + "logits/chosen": 0.04910736158490181, + "logits/rejected": -0.1310272216796875, + "logps/chosen": -453.4624938964844, + "logps/rejected": -430.0249938964844, + "loss": 0.0696, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.2867797911167145, + "rewards/margins": 5.708203315734863, + "rewards/rejected": -5.419921875, + "step": 5660 + }, + { + "epoch": 1.8249778707652693, + "grad_norm": 27.99031108223575, + "learning_rate": 5.437057308435287e-07, + "logits/chosen": -0.07198486477136612, + "logits/rejected": -0.14740410447120667, + "logps/chosen": -409.9125061035156, + "logps/rejected": -430.67498779296875, + "loss": 0.1046, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19514159858226776, + "rewards/margins": 5.350781440734863, + "rewards/rejected": -5.548437595367432, + "step": 5670 + }, + { + "epoch": 1.8281966685442987, + "grad_norm": 51.20695553803731, + "learning_rate": 5.429008370895042e-07, + "logits/chosen": -0.11983032524585724, + "logits/rejected": -0.17756958305835724, + "logps/chosen": -420.75, + "logps/rejected": -445.5, + "loss": 0.1408, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.46014404296875, + "rewards/margins": 5.318749904632568, + "rewards/rejected": -5.778515815734863, + "step": 5680 + }, + { + "epoch": 1.8314154663233282, + "grad_norm": 27.025406143720822, + "learning_rate": 5.420959433354798e-07, + "logits/chosen": 0.012115478515625, + "logits/rejected": -0.06123657152056694, + "logps/chosen": -414.3500061035156, + "logps/rejected": -442.6499938964844, + "loss": 0.1174, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.114013671875, + "rewards/margins": 5.2783203125, + "rewards/rejected": -5.164453029632568, + "step": 5690 + }, + { + "epoch": 1.8346342641023576, + "grad_norm": 15.69196075810465, + "learning_rate": 5.412910495814552e-07, + "logits/chosen": -0.0036865235306322575, + "logits/rejected": -0.01983947679400444, + "logps/chosen": -399.5249938964844, + "logps/rejected": -447.875, + "loss": 0.1064, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.10947265475988388, + "rewards/margins": 5.482031345367432, + "rewards/rejected": -5.596484184265137, + "step": 5700 + }, + { + "epoch": 1.8378530618813873, + "grad_norm": 16.139217775282113, + "learning_rate": 5.404861558274307e-07, + "logits/chosen": -0.09920654445886612, + "logits/rejected": -0.11429748684167862, + "logps/chosen": -381.9750061035156, + "logps/rejected": -397.4750061035156, + "loss": 0.081, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.32891845703125, + "rewards/margins": 5.451562404632568, + "rewards/rejected": -5.125586032867432, + "step": 5710 + }, + { + "epoch": 1.841071859660417, + "grad_norm": 37.72967989541999, + "learning_rate": 5.396812620734062e-07, + "logits/chosen": -0.14149780571460724, + "logits/rejected": -0.23679199814796448, + "logps/chosen": -395.875, + "logps/rejected": -438.8500061035156, + "loss": 0.0943, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.430288702249527, + "rewards/margins": 5.139843940734863, + "rewards/rejected": -5.56640625, + "step": 5720 + }, + { + "epoch": 1.8442906574394464, + "grad_norm": 39.827200004727295, + "learning_rate": 5.388763683193818e-07, + "logits/chosen": 0.02748413011431694, + "logits/rejected": -0.16055908799171448, + "logps/chosen": -440.625, + "logps/rejected": -432.92498779296875, + "loss": 0.0671, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.37553709745407104, + "rewards/margins": 5.367578029632568, + "rewards/rejected": -4.993750095367432, + "step": 5730 + }, + { + "epoch": 1.8475094552184759, + "grad_norm": 31.236244352149118, + "learning_rate": 5.380714745653574e-07, + "logits/chosen": -0.04915771633386612, + "logits/rejected": -0.06885375827550888, + "logps/chosen": -375.7124938964844, + "logps/rejected": -395.42498779296875, + "loss": 0.1333, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.4226928651332855, + "rewards/margins": 4.459374904632568, + "rewards/rejected": -4.880859375, + "step": 5740 + }, + { + "epoch": 1.8507282529975053, + "grad_norm": 14.072218814341934, + "learning_rate": 5.372665808113329e-07, + "logits/chosen": -0.11764831840991974, + "logits/rejected": -0.26634520292282104, + "logps/chosen": -380.79998779296875, + "logps/rejected": -425.26251220703125, + "loss": 0.1189, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.13212890923023224, + "rewards/margins": 5.4296875, + "rewards/rejected": -5.559765815734863, + "step": 5750 + }, + { + "epoch": 1.853947050776535, + "grad_norm": 75.73465714826303, + "learning_rate": 5.364616870573085e-07, + "logits/chosen": 0.07594604790210724, + "logits/rejected": -0.15406647324562073, + "logps/chosen": -402.42498779296875, + "logps/rejected": -371.07501220703125, + "loss": 0.1196, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.26666259765625, + "rewards/margins": 4.916015625, + "rewards/rejected": -5.1806640625, + "step": 5760 + }, + { + "epoch": 1.8571658485555647, + "grad_norm": 29.7096364564263, + "learning_rate": 5.356567933032839e-07, + "logits/chosen": 0.06697387993335724, + "logits/rejected": -0.0765380859375, + "logps/chosen": -384.75, + "logps/rejected": -379.2749938964844, + "loss": 0.1114, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.08334960788488388, + "rewards/margins": 5.232812404632568, + "rewards/rejected": -5.318359375, + "step": 5770 + }, + { + "epoch": 1.860384646334594, + "grad_norm": 11.80706343139211, + "learning_rate": 5.348518995492594e-07, + "logits/chosen": 0.01434326171875, + "logits/rejected": -0.06100768968462944, + "logps/chosen": -353.38751220703125, + "logps/rejected": -384.7749938964844, + "loss": 0.1037, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.11032714694738388, + "rewards/margins": 4.827734470367432, + "rewards/rejected": -4.717577934265137, + "step": 5780 + }, + { + "epoch": 1.8636034441136236, + "grad_norm": 20.953258925216318, + "learning_rate": 5.34047005795235e-07, + "logits/chosen": -0.06215209886431694, + "logits/rejected": -0.15737304091453552, + "logps/chosen": -485.29998779296875, + "logps/rejected": -456.17498779296875, + "loss": 0.0693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.03281860426068306, + "rewards/margins": 5.401562690734863, + "rewards/rejected": -5.3671875, + "step": 5790 + }, + { + "epoch": 1.866822241892653, + "grad_norm": 106.46399482537045, + "learning_rate": 5.332421120412105e-07, + "logits/chosen": 0.01157226599752903, + "logits/rejected": -0.1777603179216385, + "logps/chosen": -407.17498779296875, + "logps/rejected": -426.5249938964844, + "loss": 0.118, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.4495849609375, + "rewards/margins": 5.596093654632568, + "rewards/rejected": -6.039843559265137, + "step": 5800 + }, + { + "epoch": 1.8700410396716827, + "grad_norm": 74.49175591176547, + "learning_rate": 5.32437218287186e-07, + "logits/chosen": -0.10709228366613388, + "logits/rejected": -0.24411621689796448, + "logps/chosen": -396.4750061035156, + "logps/rejected": -408.7875061035156, + "loss": 0.0878, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.44763487577438354, + "rewards/margins": 5.201171875, + "rewards/rejected": -5.650000095367432, + "step": 5810 + }, + { + "epoch": 1.8732598374507121, + "grad_norm": 18.1895231465656, + "learning_rate": 5.316323245331617e-07, + "logits/chosen": 0.026006316766142845, + "logits/rejected": -0.16553039848804474, + "logps/chosen": -438.98748779296875, + "logps/rejected": -404.6499938964844, + "loss": 0.086, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.22098389267921448, + "rewards/margins": 5.841406345367432, + "rewards/rejected": -5.617968559265137, + "step": 5820 + }, + { + "epoch": 1.8764786352297418, + "grad_norm": 18.51686829273399, + "learning_rate": 5.308274307791372e-07, + "logits/chosen": 0.02306518517434597, + "logits/rejected": -0.09453125298023224, + "logps/chosen": -388.92498779296875, + "logps/rejected": -390.3500061035156, + "loss": 0.0802, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.20462647080421448, + "rewards/margins": 5.262499809265137, + "rewards/rejected": -5.466406345367432, + "step": 5830 + }, + { + "epoch": 1.8796974330087712, + "grad_norm": 38.09250656371761, + "learning_rate": 5.300225370251126e-07, + "logits/chosen": -0.0587158203125, + "logits/rejected": -0.12989501655101776, + "logps/chosen": -404.6499938964844, + "logps/rejected": -415.7250061035156, + "loss": 0.0915, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4361938536167145, + "rewards/margins": 5.351953029632568, + "rewards/rejected": -5.789843559265137, + "step": 5840 + }, + { + "epoch": 1.8829162307878007, + "grad_norm": 38.790321758474626, + "learning_rate": 5.292176432710882e-07, + "logits/chosen": -0.0757904052734375, + "logits/rejected": -0.15140381455421448, + "logps/chosen": -416.07501220703125, + "logps/rejected": -425.54998779296875, + "loss": 0.0718, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.39442139863967896, + "rewards/margins": 5.2294921875, + "rewards/rejected": -5.625390529632568, + "step": 5850 + }, + { + "epoch": 1.8861350285668301, + "grad_norm": 69.63446579746764, + "learning_rate": 5.284127495170637e-07, + "logits/chosen": -0.256866455078125, + "logits/rejected": -0.42626649141311646, + "logps/chosen": -395.875, + "logps/rejected": -398.2749938964844, + "loss": 0.1084, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5063232183456421, + "rewards/margins": 5.186327934265137, + "rewards/rejected": -5.69140625, + "step": 5860 + }, + { + "epoch": 1.8893538263458598, + "grad_norm": 63.321449520542956, + "learning_rate": 5.276078557630392e-07, + "logits/chosen": 0.05665893480181694, + "logits/rejected": -0.06230468675494194, + "logps/chosen": -378.1499938964844, + "logps/rejected": -414.79998779296875, + "loss": 0.0872, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.765698254108429, + "rewards/margins": 5.325390815734863, + "rewards/rejected": -6.087890625, + "step": 5870 + }, + { + "epoch": 1.8925726241248895, + "grad_norm": 34.022384452409526, + "learning_rate": 5.268029620090148e-07, + "logits/chosen": -0.05726318433880806, + "logits/rejected": -0.19705811142921448, + "logps/chosen": -410.32501220703125, + "logps/rejected": -386.17498779296875, + "loss": 0.0731, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.07871093600988388, + "rewards/margins": 5.07421875, + "rewards/rejected": -5.151953220367432, + "step": 5880 + }, + { + "epoch": 1.895791421903919, + "grad_norm": 37.42547057407804, + "learning_rate": 5.259980682549903e-07, + "logits/chosen": -0.07297363132238388, + "logits/rejected": -0.11457519233226776, + "logps/chosen": -406.375, + "logps/rejected": -421.79998779296875, + "loss": 0.0975, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.05499267578125, + "rewards/margins": 5.183984279632568, + "rewards/rejected": -5.235937595367432, + "step": 5890 + }, + { + "epoch": 1.8990102196829484, + "grad_norm": 23.852091121005458, + "learning_rate": 5.251931745009659e-07, + "logits/chosen": -0.06105651706457138, + "logits/rejected": -0.10124512016773224, + "logps/chosen": -411.70001220703125, + "logps/rejected": -436.125, + "loss": 0.0916, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24478760361671448, + "rewards/margins": 5.387499809265137, + "rewards/rejected": -5.633593559265137, + "step": 5900 + }, + { + "epoch": 1.9022290174619778, + "grad_norm": 12.336807667454446, + "learning_rate": 5.243882807469415e-07, + "logits/chosen": -0.16686400771141052, + "logits/rejected": -0.19860534369945526, + "logps/chosen": -354.5249938964844, + "logps/rejected": -371.1499938964844, + "loss": 0.1052, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.700488269329071, + "rewards/margins": 5.0322265625, + "rewards/rejected": -5.729296684265137, + "step": 5910 + }, + { + "epoch": 1.9054478152410075, + "grad_norm": 36.529422111669426, + "learning_rate": 5.235833869929169e-07, + "logits/chosen": -0.07281494140625, + "logits/rejected": -0.1433265656232834, + "logps/chosen": -400.8500061035156, + "logps/rejected": -410.8999938964844, + "loss": 0.0957, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.26002198457717896, + "rewards/margins": 5.455859184265137, + "rewards/rejected": -5.713671684265137, + "step": 5920 + }, + { + "epoch": 1.9086666130200372, + "grad_norm": 33.67091934111222, + "learning_rate": 5.227784932388924e-07, + "logits/chosen": -0.18136902153491974, + "logits/rejected": -0.28101807832717896, + "logps/chosen": -378.25, + "logps/rejected": -436.8999938964844, + "loss": 0.1019, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.29193115234375, + "rewards/margins": 5.6455078125, + "rewards/rejected": -5.935937404632568, + "step": 5930 + }, + { + "epoch": 1.9118854107990666, + "grad_norm": 18.90712105703796, + "learning_rate": 5.21973599484868e-07, + "logits/chosen": -0.09476013481616974, + "logits/rejected": -0.11300048977136612, + "logps/chosen": -372.2749938964844, + "logps/rejected": -391.1499938964844, + "loss": 0.0967, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.2460174560546875, + "rewards/margins": 5.203906059265137, + "rewards/rejected": -5.446484565734863, + "step": 5940 + }, + { + "epoch": 1.915104208578096, + "grad_norm": 47.62229987407999, + "learning_rate": 5.211687057308435e-07, + "logits/chosen": -0.13174133002758026, + "logits/rejected": -0.188018798828125, + "logps/chosen": -418.2875061035156, + "logps/rejected": -405.2749938964844, + "loss": 0.1301, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.03687744215130806, + "rewards/margins": 5.544921875, + "rewards/rejected": -5.507421970367432, + "step": 5950 + }, + { + "epoch": 1.9183230063571255, + "grad_norm": 30.36782025317925, + "learning_rate": 5.20363811976819e-07, + "logits/chosen": 0.008181762881577015, + "logits/rejected": -0.0306396484375, + "logps/chosen": -444.9375, + "logps/rejected": -474.13751220703125, + "loss": 0.0893, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.05295410007238388, + "rewards/margins": 5.920702934265137, + "rewards/rejected": -5.972851753234863, + "step": 5960 + }, + { + "epoch": 1.9215418041361552, + "grad_norm": 9.091398492455006, + "learning_rate": 5.195589182227946e-07, + "logits/chosen": -0.15596923232078552, + "logits/rejected": -0.188568115234375, + "logps/chosen": -417.04998779296875, + "logps/rejected": -442.5249938964844, + "loss": 0.0894, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.2807861268520355, + "rewards/margins": 5.7900390625, + "rewards/rejected": -5.51171875, + "step": 5970 + }, + { + "epoch": 1.9247606019151848, + "grad_norm": 41.54188762948916, + "learning_rate": 5.187540244687702e-07, + "logits/chosen": -0.0860137939453125, + "logits/rejected": -0.21099853515625, + "logps/chosen": -421.5, + "logps/rejected": -396.17498779296875, + "loss": 0.1007, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.0055908202193677425, + "rewards/margins": 5.253515720367432, + "rewards/rejected": -5.260156154632568, + "step": 5980 + }, + { + "epoch": 1.9279793996942143, + "grad_norm": 38.47249101399203, + "learning_rate": 5.179491307147456e-07, + "logits/chosen": -0.2519180178642273, + "logits/rejected": -0.28176575899124146, + "logps/chosen": -408.95001220703125, + "logps/rejected": -437.51251220703125, + "loss": 0.0773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.02382202073931694, + "rewards/margins": 5.330273628234863, + "rewards/rejected": -5.305468559265137, + "step": 5990 + }, + { + "epoch": 1.9311981974732437, + "grad_norm": 64.22269389370365, + "learning_rate": 5.171442369607212e-07, + "logits/chosen": -0.1780853271484375, + "logits/rejected": -0.21173401176929474, + "logps/chosen": -405.70001220703125, + "logps/rejected": -412.2749938964844, + "loss": 0.0791, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.23454590141773224, + "rewards/margins": 5.4375, + "rewards/rejected": -5.20703125, + "step": 6000 + }, + { + "epoch": 1.9344169952522732, + "grad_norm": 43.241020906901, + "learning_rate": 5.163393432066967e-07, + "logits/chosen": -0.18626098334789276, + "logits/rejected": -0.3854126036167145, + "logps/chosen": -423.6499938964844, + "logps/rejected": -430.7749938964844, + "loss": 0.0791, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.09475097805261612, + "rewards/margins": 5.420312404632568, + "rewards/rejected": -5.330468654632568, + "step": 6010 + }, + { + "epoch": 1.9376357930313028, + "grad_norm": 20.789649854270753, + "learning_rate": 5.155344494526722e-07, + "logits/chosen": -0.2843689024448395, + "logits/rejected": -0.36576539278030396, + "logps/chosen": -402.20001220703125, + "logps/rejected": -434.375, + "loss": 0.0532, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.39653319120407104, + "rewards/margins": 5.133203029632568, + "rewards/rejected": -5.534375190734863, + "step": 6020 + }, + { + "epoch": 1.9408545908103323, + "grad_norm": 76.51625101269835, + "learning_rate": 5.147295556986477e-07, + "logits/chosen": -0.26530760526657104, + "logits/rejected": -0.349478155374527, + "logps/chosen": -371.3500061035156, + "logps/rejected": -407.25, + "loss": 0.1404, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.79144287109375, + "rewards/margins": 5.164648532867432, + "rewards/rejected": -5.953906059265137, + "step": 6030 + }, + { + "epoch": 1.944073388589362, + "grad_norm": 29.68754211541463, + "learning_rate": 5.139246619446233e-07, + "logits/chosen": -0.22160644829273224, + "logits/rejected": -0.3105835020542145, + "logps/chosen": -398.8125, + "logps/rejected": -397.375, + "loss": 0.1148, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7256835699081421, + "rewards/margins": 5.153124809265137, + "rewards/rejected": -5.880859375, + "step": 6040 + }, + { + "epoch": 1.9472921863683914, + "grad_norm": 15.89951403118714, + "learning_rate": 5.131197681905988e-07, + "logits/chosen": -0.24526062607765198, + "logits/rejected": -0.31263428926467896, + "logps/chosen": -414.8500061035156, + "logps/rejected": -429.4125061035156, + "loss": 0.0759, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8549255132675171, + "rewards/margins": 5.357031345367432, + "rewards/rejected": -6.210156440734863, + "step": 6050 + }, + { + "epoch": 1.9505109841474209, + "grad_norm": 59.987317595988294, + "learning_rate": 5.123148744365743e-07, + "logits/chosen": -0.27629393339157104, + "logits/rejected": -0.36912232637405396, + "logps/chosen": -427.04998779296875, + "logps/rejected": -443.04998779296875, + "loss": 0.0702, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.6917968988418579, + "rewards/margins": 5.720312595367432, + "rewards/rejected": -6.412890434265137, + "step": 6060 + }, + { + "epoch": 1.9537297819264505, + "grad_norm": 23.03384349679821, + "learning_rate": 5.115099806825499e-07, + "logits/chosen": -0.2095947265625, + "logits/rejected": -0.299917608499527, + "logps/chosen": -386.875, + "logps/rejected": -396.67498779296875, + "loss": 0.1182, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.38691407442092896, + "rewards/margins": 5.184179782867432, + "rewards/rejected": -5.5732421875, + "step": 6070 + }, + { + "epoch": 1.95694857970548, + "grad_norm": 8.196698718463159, + "learning_rate": 5.107050869285254e-07, + "logits/chosen": -0.0756072998046875, + "logits/rejected": -0.158660888671875, + "logps/chosen": -414.0625, + "logps/rejected": -415.20001220703125, + "loss": 0.0899, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.24333496391773224, + "rewards/margins": 5.342871189117432, + "rewards/rejected": -5.585546970367432, + "step": 6080 + }, + { + "epoch": 1.9601673774845096, + "grad_norm": 45.48152342342879, + "learning_rate": 5.099001931745009e-07, + "logits/chosen": -0.042877197265625, + "logits/rejected": -0.21034851670265198, + "logps/chosen": -398.0874938964844, + "logps/rejected": -390.45001220703125, + "loss": 0.0799, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.24392089247703552, + "rewards/margins": 5.235937595367432, + "rewards/rejected": -5.478515625, + "step": 6090 + }, + { + "epoch": 1.963386175263539, + "grad_norm": 20.61163013690488, + "learning_rate": 5.090952994204765e-07, + "logits/chosen": -0.01190795935690403, + "logits/rejected": -0.0335693359375, + "logps/chosen": -387.3500061035156, + "logps/rejected": -436.95001220703125, + "loss": 0.0654, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.15241698920726776, + "rewards/margins": 5.588671684265137, + "rewards/rejected": -5.739453315734863, + "step": 6100 + }, + { + "epoch": 1.9666049730425685, + "grad_norm": 38.438503610976746, + "learning_rate": 5.08290405666452e-07, + "logits/chosen": 0.010913086123764515, + "logits/rejected": -0.10046996921300888, + "logps/chosen": -420.8500061035156, + "logps/rejected": -408.9750061035156, + "loss": 0.0748, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.08948974311351776, + "rewards/margins": 4.952538967132568, + "rewards/rejected": -4.864062309265137, + "step": 6110 + }, + { + "epoch": 1.969823770821598, + "grad_norm": 23.5695811342812, + "learning_rate": 5.074855119124275e-07, + "logits/chosen": 0.05112915113568306, + "logits/rejected": -0.07587890326976776, + "logps/chosen": -423.75, + "logps/rejected": -463.1499938964844, + "loss": 0.0828, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.5304809808731079, + "rewards/margins": 5.128125190734863, + "rewards/rejected": -5.654296875, + "step": 6120 + }, + { + "epoch": 1.9730425686006277, + "grad_norm": 19.514263089905295, + "learning_rate": 5.06680618158403e-07, + "logits/chosen": -0.14366455376148224, + "logits/rejected": -0.3200927674770355, + "logps/chosen": -397.8125, + "logps/rejected": -390.1000061035156, + "loss": 0.125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3211120665073395, + "rewards/margins": 5.1181640625, + "rewards/rejected": -5.437890529632568, + "step": 6130 + }, + { + "epoch": 1.9762613663796573, + "grad_norm": 27.405193848453475, + "learning_rate": 5.058757244043786e-07, + "logits/chosen": -0.07869720458984375, + "logits/rejected": -0.23430176079273224, + "logps/chosen": -410.7250061035156, + "logps/rejected": -404.07501220703125, + "loss": 0.0912, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.22084960341453552, + "rewards/margins": 5.119140625, + "rewards/rejected": -5.337500095367432, + "step": 6140 + }, + { + "epoch": 1.9794801641586868, + "grad_norm": 5.057869016262051, + "learning_rate": 5.050708306503541e-07, + "logits/chosen": -0.15687866508960724, + "logits/rejected": -0.24101562798023224, + "logps/chosen": -430.9125061035156, + "logps/rejected": -422.75, + "loss": 0.0776, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.11321716010570526, + "rewards/margins": 5.671484470367432, + "rewards/rejected": -5.783203125, + "step": 6150 + }, + { + "epoch": 1.9826989619377162, + "grad_norm": 44.71964302443655, + "learning_rate": 5.042659368963297e-07, + "logits/chosen": -0.04865569993853569, + "logits/rejected": -0.195343017578125, + "logps/chosen": -368.45001220703125, + "logps/rejected": -381.1499938964844, + "loss": 0.1169, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.24116210639476776, + "rewards/margins": 4.916601657867432, + "rewards/rejected": -5.155077934265137, + "step": 6160 + }, + { + "epoch": 1.9859177597167457, + "grad_norm": 92.46781038274051, + "learning_rate": 5.034610431423052e-07, + "logits/chosen": -0.2903808653354645, + "logits/rejected": -0.3939208984375, + "logps/chosen": -432.4750061035156, + "logps/rejected": -421.7250061035156, + "loss": 0.0816, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.3770751953125, + "rewards/margins": 5.705859184265137, + "rewards/rejected": -6.083984375, + "step": 6170 + }, + { + "epoch": 1.9891365574957753, + "grad_norm": 53.711816033988846, + "learning_rate": 5.026561493882807e-07, + "logits/chosen": -0.044342041015625, + "logits/rejected": -0.13478393852710724, + "logps/chosen": -363.42498779296875, + "logps/rejected": -398.4375, + "loss": 0.1277, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11953125149011612, + "rewards/margins": 5.123632907867432, + "rewards/rejected": -5.005859375, + "step": 6180 + }, + { + "epoch": 1.992355355274805, + "grad_norm": 48.1312937811, + "learning_rate": 5.018512556342563e-07, + "logits/chosen": -0.01826171949505806, + "logits/rejected": -0.13971558213233948, + "logps/chosen": -425.70001220703125, + "logps/rejected": -460.8999938964844, + "loss": 0.0803, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.03101806715130806, + "rewards/margins": 5.546484470367432, + "rewards/rejected": -5.519726753234863, + "step": 6190 + }, + { + "epoch": 1.9955741530538345, + "grad_norm": 17.29681341874368, + "learning_rate": 5.010463618802318e-07, + "logits/chosen": -0.05106201022863388, + "logits/rejected": -0.14269104599952698, + "logps/chosen": -424.875, + "logps/rejected": -435.375, + "loss": 0.0849, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.617279052734375, + "rewards/margins": 5.404296875, + "rewards/rejected": -4.788671970367432, + "step": 6200 + }, + { + "epoch": 1.998792950832864, + "grad_norm": 52.19682708757806, + "learning_rate": 5.002414681262072e-07, + "logits/chosen": -0.06317138671875, + "logits/rejected": -0.1997833251953125, + "logps/chosen": -415.5249938964844, + "logps/rejected": -430.4750061035156, + "loss": 0.0994, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.18785400688648224, + "rewards/margins": 5.373437404632568, + "rewards/rejected": -5.189453125, + "step": 6210 + }, + { + "epoch": 2.001931278667418, + "grad_norm": 5.2862057784612055, + "learning_rate": 4.994365743721828e-07, + "logits/chosen": -0.13613970577716827, + "logits/rejected": -0.319091796875, + "logps/chosen": -403.5384521484375, + "logps/rejected": -404.20513916015625, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007011217996478081, + "rewards/margins": 5.90745210647583, + "rewards/rejected": -5.913862228393555, + "step": 6220 + }, + { + "epoch": 2.0051500764464474, + "grad_norm": 1.1087277318941557, + "learning_rate": 4.986316806181583e-07, + "logits/chosen": -0.23941650986671448, + "logits/rejected": -0.25799560546875, + "logps/chosen": -377.1625061035156, + "logps/rejected": -405.17498779296875, + "loss": 0.0203, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4531494081020355, + "rewards/margins": 6.939843654632568, + "rewards/rejected": -6.483984470367432, + "step": 6230 + }, + { + "epoch": 2.008368874225477, + "grad_norm": 40.271292387002774, + "learning_rate": 4.978267868641339e-07, + "logits/chosen": -0.16410216689109802, + "logits/rejected": -0.3857055604457855, + "logps/chosen": -412.7250061035156, + "logps/rejected": -422.5, + "loss": 0.0213, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.240966796875, + "rewards/margins": 7.298828125, + "rewards/rejected": -7.542187690734863, + "step": 6240 + }, + { + "epoch": 2.0115876720045063, + "grad_norm": 32.17054685716859, + "learning_rate": 4.970218931101095e-07, + "logits/chosen": -0.31768798828125, + "logits/rejected": -0.4900756776332855, + "logps/chosen": -424.7749938964844, + "logps/rejected": -452.45001220703125, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5352813601493835, + "rewards/margins": 7.739062309265137, + "rewards/rejected": -8.275781631469727, + "step": 6250 + }, + { + "epoch": 2.0148064697835357, + "grad_norm": 1.1479393923460086, + "learning_rate": 4.96216999356085e-07, + "logits/chosen": -0.4256729185581207, + "logits/rejected": -0.521533191204071, + "logps/chosen": -421.82501220703125, + "logps/rejected": -463.875, + "loss": 0.0144, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.491943359375, + "rewards/margins": 7.634375095367432, + "rewards/rejected": -9.125391006469727, + "step": 6260 + }, + { + "epoch": 2.018025267562565, + "grad_norm": 4.262035820612848, + "learning_rate": 4.954121056020605e-07, + "logits/chosen": -0.5904022455215454, + "logits/rejected": -0.754711925983429, + "logps/chosen": -423.04998779296875, + "logps/rejected": -456.7250061035156, + "loss": 0.0194, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.570703148841858, + "rewards/margins": 8.009374618530273, + "rewards/rejected": -9.580469131469727, + "step": 6270 + }, + { + "epoch": 2.021244065341595, + "grad_norm": 10.087180704621389, + "learning_rate": 4.94607211848036e-07, + "logits/chosen": -0.56317138671875, + "logits/rejected": -0.6486450433731079, + "logps/chosen": -423.6499938964844, + "logps/rejected": -442.25, + "loss": 0.0373, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.490881323814392, + "rewards/margins": 7.346093654632568, + "rewards/rejected": -8.834375381469727, + "step": 6280 + }, + { + "epoch": 2.0244628631206245, + "grad_norm": 6.444764833502953, + "learning_rate": 4.938023180940115e-07, + "logits/chosen": -0.4296936094760895, + "logits/rejected": -0.54046630859375, + "logps/chosen": -416.875, + "logps/rejected": -444.17498779296875, + "loss": 0.0212, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.351904273033142, + "rewards/margins": 7.407031059265137, + "rewards/rejected": -8.762499809265137, + "step": 6290 + }, + { + "epoch": 2.027681660899654, + "grad_norm": 3.6872122915495447, + "learning_rate": 4.929974243399871e-07, + "logits/chosen": -0.5174560546875, + "logits/rejected": -0.657946765422821, + "logps/chosen": -437.95001220703125, + "logps/rejected": -457.1499938964844, + "loss": 0.0134, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.3420898914337158, + "rewards/margins": 7.514843940734863, + "rewards/rejected": -8.860156059265137, + "step": 6300 + }, + { + "epoch": 2.0309004586786834, + "grad_norm": 11.769905580342098, + "learning_rate": 4.921925305859626e-07, + "logits/chosen": -0.44578856229782104, + "logits/rejected": -0.5584991574287415, + "logps/chosen": -431.75, + "logps/rejected": -447.01251220703125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7609618902206421, + "rewards/margins": 7.5234375, + "rewards/rejected": -8.290624618530273, + "step": 6310 + }, + { + "epoch": 2.034119256457713, + "grad_norm": 1.982744998119319, + "learning_rate": 4.913876368319382e-07, + "logits/chosen": -0.3163696229457855, + "logits/rejected": -0.467315673828125, + "logps/chosen": -469.04998779296875, + "logps/rejected": -484.20001220703125, + "loss": 0.0194, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.653076171875, + "rewards/margins": 7.573437690734863, + "rewards/rejected": -8.225781440734863, + "step": 6320 + }, + { + "epoch": 2.0373380542367427, + "grad_norm": 6.486293164174652, + "learning_rate": 4.905827430779137e-07, + "logits/chosen": -0.3734985291957855, + "logits/rejected": -0.46455079317092896, + "logps/chosen": -387.79998779296875, + "logps/rejected": -435.82501220703125, + "loss": 0.0243, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.061254858970642, + "rewards/margins": 7.136328220367432, + "rewards/rejected": -8.202343940734863, + "step": 6330 + }, + { + "epoch": 2.040556852015772, + "grad_norm": 2.339835508904128, + "learning_rate": 4.897778493238892e-07, + "logits/chosen": -0.35206300020217896, + "logits/rejected": -0.5419067144393921, + "logps/chosen": -426.4375, + "logps/rejected": -432.42498779296875, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0043823719024658, + "rewards/margins": 7.2578125, + "rewards/rejected": -8.262499809265137, + "step": 6340 + }, + { + "epoch": 2.0437756497948016, + "grad_norm": 7.665855271716088, + "learning_rate": 4.889729555698648e-07, + "logits/chosen": -0.5026229619979858, + "logits/rejected": -0.639984130859375, + "logps/chosen": -403.3999938964844, + "logps/rejected": -454.57501220703125, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8552795648574829, + "rewards/margins": 8.220312118530273, + "rewards/rejected": -9.074999809265137, + "step": 6350 + }, + { + "epoch": 2.046994447573831, + "grad_norm": 8.60049143098422, + "learning_rate": 4.881680618158403e-07, + "logits/chosen": -0.3955322206020355, + "logits/rejected": -0.515289306640625, + "logps/chosen": -436.1000061035156, + "logps/rejected": -474.8999938964844, + "loss": 0.0179, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2161133289337158, + "rewards/margins": 7.976953029632568, + "rewards/rejected": -9.190625190734863, + "step": 6360 + }, + { + "epoch": 2.0502132453528605, + "grad_norm": 2.4630218309312544, + "learning_rate": 4.873631680618158e-07, + "logits/chosen": -0.42431640625, + "logits/rejected": -0.5663818120956421, + "logps/chosen": -437.4437561035156, + "logps/rejected": -461.29998779296875, + "loss": 0.0089, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.8677002191543579, + "rewards/margins": 7.514062404632568, + "rewards/rejected": -8.379687309265137, + "step": 6370 + }, + { + "epoch": 2.0534320431318904, + "grad_norm": 8.435014062652364, + "learning_rate": 4.865582743077913e-07, + "logits/chosen": -0.39013671875, + "logits/rejected": -0.5655883550643921, + "logps/chosen": -446.57501220703125, + "logps/rejected": -466.0249938964844, + "loss": 0.0158, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.913037121295929, + "rewards/margins": 7.5625, + "rewards/rejected": -8.475000381469727, + "step": 6380 + }, + { + "epoch": 2.05665084091092, + "grad_norm": 40.25718319755989, + "learning_rate": 4.857533805537669e-07, + "logits/chosen": -0.495993047952652, + "logits/rejected": -0.646679699420929, + "logps/chosen": -380.17498779296875, + "logps/rejected": -426.125, + "loss": 0.0267, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.968994140625, + "rewards/margins": 7.446093559265137, + "rewards/rejected": -9.416406631469727, + "step": 6390 + }, + { + "epoch": 2.0598696386899493, + "grad_norm": 2.1736715804785542, + "learning_rate": 4.849484867997424e-07, + "logits/chosen": -0.444488525390625, + "logits/rejected": -0.533935546875, + "logps/chosen": -413.0249938964844, + "logps/rejected": -456.8500061035156, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.045385718345642, + "rewards/margins": 7.971875190734863, + "rewards/rejected": -9.015625, + "step": 6400 + }, + { + "epoch": 2.0630884364689788, + "grad_norm": 5.536168387611534, + "learning_rate": 4.84143593045718e-07, + "logits/chosen": -0.3273864686489105, + "logits/rejected": -0.4566101133823395, + "logps/chosen": -440.5375061035156, + "logps/rejected": -489.29998779296875, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.321496605873108, + "rewards/margins": 7.63671875, + "rewards/rejected": -8.96484375, + "step": 6410 + }, + { + "epoch": 2.066307234248008, + "grad_norm": 4.881813564295855, + "learning_rate": 4.833386992916935e-07, + "logits/chosen": -0.4342590272426605, + "logits/rejected": -0.6829589605331421, + "logps/chosen": -434.07501220703125, + "logps/rejected": -447.79998779296875, + "loss": 0.0146, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.1797363758087158, + "rewards/margins": 8.212499618530273, + "rewards/rejected": -9.393750190734863, + "step": 6420 + }, + { + "epoch": 2.069526032027038, + "grad_norm": 1.9544549295767797, + "learning_rate": 4.825338055376689e-07, + "logits/chosen": -0.4042602479457855, + "logits/rejected": -0.5618896484375, + "logps/chosen": -400.86248779296875, + "logps/rejected": -412.75, + "loss": 0.0228, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.6024901866912842, + "rewards/margins": 7.459374904632568, + "rewards/rejected": -9.061718940734863, + "step": 6430 + }, + { + "epoch": 2.0727448298060676, + "grad_norm": 1.0374814534528007, + "learning_rate": 4.817289117836445e-07, + "logits/chosen": -0.4842773377895355, + "logits/rejected": -0.7022460699081421, + "logps/chosen": -435.5249938964844, + "logps/rejected": -477.1499938964844, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.834252953529358, + "rewards/margins": 8.06640625, + "rewards/rejected": -9.90625, + "step": 6440 + }, + { + "epoch": 2.075963627585097, + "grad_norm": 2.322452905510868, + "learning_rate": 4.809240180296201e-07, + "logits/chosen": -0.24342040717601776, + "logits/rejected": -0.510784924030304, + "logps/chosen": -454.98748779296875, + "logps/rejected": -436.17498779296875, + "loss": 0.0335, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1854248046875, + "rewards/margins": 7.805468559265137, + "rewards/rejected": -8.98828125, + "step": 6450 + }, + { + "epoch": 2.0791824253641265, + "grad_norm": 6.34333890343581, + "learning_rate": 4.801191242755956e-07, + "logits/chosen": -0.352783203125, + "logits/rejected": -0.5975097417831421, + "logps/chosen": -411.0249938964844, + "logps/rejected": -418.54998779296875, + "loss": 0.0215, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.5115234851837158, + "rewards/margins": 7.765625, + "rewards/rejected": -9.27734375, + "step": 6460 + }, + { + "epoch": 2.082401223143156, + "grad_norm": 17.949228473965036, + "learning_rate": 4.793142305215711e-07, + "logits/chosen": -0.43183594942092896, + "logits/rejected": -0.589062511920929, + "logps/chosen": -373.32501220703125, + "logps/rejected": -429.20001220703125, + "loss": 0.0145, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.659460425376892, + "rewards/margins": 7.76171875, + "rewards/rejected": -9.41796875, + "step": 6470 + }, + { + "epoch": 2.0856200209221853, + "grad_norm": 1.5490988232327076, + "learning_rate": 4.785093367675467e-07, + "logits/chosen": -0.2710632383823395, + "logits/rejected": -0.460641473531723, + "logps/chosen": -426.07501220703125, + "logps/rejected": -457.17498779296875, + "loss": 0.0238, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.18896484375, + "rewards/margins": 7.443749904632568, + "rewards/rejected": -8.631250381469727, + "step": 6480 + }, + { + "epoch": 2.0888388187012152, + "grad_norm": 11.295202537887707, + "learning_rate": 4.777044430135222e-07, + "logits/chosen": -0.3290039002895355, + "logits/rejected": -0.5732482671737671, + "logps/chosen": -417.04998779296875, + "logps/rejected": -421.57501220703125, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0994994640350342, + "rewards/margins": 8.0546875, + "rewards/rejected": -9.15625, + "step": 6490 + }, + { + "epoch": 2.0920576164802447, + "grad_norm": 2.3469582943794443, + "learning_rate": 4.768995492594977e-07, + "logits/chosen": -0.362060546875, + "logits/rejected": -0.5155075192451477, + "logps/chosen": -401.6499938964844, + "logps/rejected": -452.625, + "loss": 0.012, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.316137671470642, + "rewards/margins": 7.415625095367432, + "rewards/rejected": -8.734375, + "step": 6500 + }, + { + "epoch": 2.095276414259274, + "grad_norm": 5.92952458465171, + "learning_rate": 4.7609465550547323e-07, + "logits/chosen": -0.323355108499527, + "logits/rejected": -0.46308594942092896, + "logps/chosen": -378.3125, + "logps/rejected": -415.375, + "loss": 0.0309, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.6087524890899658, + "rewards/margins": 7.735156059265137, + "rewards/rejected": -9.349218368530273, + "step": 6510 + }, + { + "epoch": 2.0984952120383036, + "grad_norm": 2.3921642325845327, + "learning_rate": 4.752897617514488e-07, + "logits/chosen": -0.5409164428710938, + "logits/rejected": -0.6092529296875, + "logps/chosen": -391.61248779296875, + "logps/rejected": -428.54998779296875, + "loss": 0.0181, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4998900890350342, + "rewards/margins": 7.907031059265137, + "rewards/rejected": -9.40234375, + "step": 6520 + }, + { + "epoch": 2.101714009817333, + "grad_norm": 18.88228082097493, + "learning_rate": 4.744848679974243e-07, + "logits/chosen": -0.3541259765625, + "logits/rejected": -0.622729480266571, + "logps/chosen": -462.82501220703125, + "logps/rejected": -471.57501220703125, + "loss": 0.0186, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.280175805091858, + "rewards/margins": 8.696874618530273, + "rewards/rejected": -9.970312118530273, + "step": 6530 + }, + { + "epoch": 2.104932807596363, + "grad_norm": 1.3032825202175546, + "learning_rate": 4.7367997424339984e-07, + "logits/chosen": -0.21243897080421448, + "logits/rejected": -0.4118896424770355, + "logps/chosen": -467.32501220703125, + "logps/rejected": -483.8500061035156, + "loss": 0.0385, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.543737769126892, + "rewards/margins": 8.092577934265137, + "rewards/rejected": -9.630859375, + "step": 6540 + }, + { + "epoch": 2.1081516053753924, + "grad_norm": 2.7322994570704195, + "learning_rate": 4.728750804893754e-07, + "logits/chosen": -0.4007507264614105, + "logits/rejected": -0.581439197063446, + "logps/chosen": -394.71875, + "logps/rejected": -440.0, + "loss": 0.0135, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.607153296470642, + "rewards/margins": 7.955468654632568, + "rewards/rejected": -9.568750381469727, + "step": 6550 + }, + { + "epoch": 2.111370403154422, + "grad_norm": 2.4854151522082635, + "learning_rate": 4.720701867353509e-07, + "logits/chosen": -0.44205933809280396, + "logits/rejected": -0.5842254757881165, + "logps/chosen": -392.4750061035156, + "logps/rejected": -453.79998779296875, + "loss": 0.016, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.4242675304412842, + "rewards/margins": 7.196875095367432, + "rewards/rejected": -8.620312690734863, + "step": 6560 + }, + { + "epoch": 2.1145892009334513, + "grad_norm": 12.370829596464986, + "learning_rate": 4.7126529298132645e-07, + "logits/chosen": -0.3435913026332855, + "logits/rejected": -0.548449695110321, + "logps/chosen": -455.6499938964844, + "logps/rejected": -469.6000061035156, + "loss": 0.0243, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0544312000274658, + "rewards/margins": 7.495312690734863, + "rewards/rejected": -8.549219131469727, + "step": 6570 + }, + { + "epoch": 2.1178079987124807, + "grad_norm": 1.314802998002699, + "learning_rate": 4.7046039922730194e-07, + "logits/chosen": -0.2937866151332855, + "logits/rejected": -0.507367730140686, + "logps/chosen": -447.3500061035156, + "logps/rejected": -511.0249938964844, + "loss": 0.0137, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.348626732826233, + "rewards/margins": 8.11328125, + "rewards/rejected": -9.458593368530273, + "step": 6580 + }, + { + "epoch": 2.1210267964915106, + "grad_norm": 1.7779582168758232, + "learning_rate": 4.6965550547327747e-07, + "logits/chosen": -0.43897706270217896, + "logits/rejected": -0.582928478717804, + "logps/chosen": -411.48126220703125, + "logps/rejected": -466.2250061035156, + "loss": 0.0177, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4311645030975342, + "rewards/margins": 8.09375, + "rewards/rejected": -9.521093368530273, + "step": 6590 + }, + { + "epoch": 2.12424559427054, + "grad_norm": 1.5037446162735844, + "learning_rate": 4.6885061171925306e-07, + "logits/chosen": -0.45949095487594604, + "logits/rejected": -0.6290527582168579, + "logps/chosen": -430.6499938964844, + "logps/rejected": -464.92498779296875, + "loss": 0.0156, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.810034155845642, + "rewards/margins": 7.908593654632568, + "rewards/rejected": -9.72265625, + "step": 6600 + }, + { + "epoch": 2.1274643920495695, + "grad_norm": 22.500032060967264, + "learning_rate": 4.6804571796522855e-07, + "logits/chosen": -0.4304138123989105, + "logits/rejected": -0.744494616985321, + "logps/chosen": -424.42498779296875, + "logps/rejected": -447.6499938964844, + "loss": 0.022, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.101855516433716, + "rewards/margins": 8.159375190734863, + "rewards/rejected": -10.260156631469727, + "step": 6610 + }, + { + "epoch": 2.130683189828599, + "grad_norm": 3.5632629344958153, + "learning_rate": 4.672408242112041e-07, + "logits/chosen": -0.49672240018844604, + "logits/rejected": -0.671032726764679, + "logps/chosen": -418.70001220703125, + "logps/rejected": -482.29998779296875, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5508911609649658, + "rewards/margins": 8.33984375, + "rewards/rejected": -9.889062881469727, + "step": 6620 + }, + { + "epoch": 2.1339019876076284, + "grad_norm": 80.67424797263004, + "learning_rate": 4.664359304571796e-07, + "logits/chosen": -0.5140380859375, + "logits/rejected": -0.5865691900253296, + "logps/chosen": -360.01251220703125, + "logps/rejected": -428.2749938964844, + "loss": 0.0288, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.78778076171875, + "rewards/margins": 7.965234279632568, + "rewards/rejected": -9.753125190734863, + "step": 6630 + }, + { + "epoch": 2.1371207853866583, + "grad_norm": 2.8115276149799495, + "learning_rate": 4.6563103670315516e-07, + "logits/chosen": -0.38343507051467896, + "logits/rejected": -0.6332550048828125, + "logps/chosen": -442.70001220703125, + "logps/rejected": -480.5, + "loss": 0.012, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.2860350608825684, + "rewards/margins": 8.053906440734863, + "rewards/rejected": -10.345312118530273, + "step": 6640 + }, + { + "epoch": 2.1403395831656877, + "grad_norm": 5.282101897692132, + "learning_rate": 4.648261429491307e-07, + "logits/chosen": -0.46976011991500854, + "logits/rejected": -0.6455932855606079, + "logps/chosen": -394.2875061035156, + "logps/rejected": -442.5, + "loss": 0.0264, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.9209716320037842, + "rewards/margins": 7.967968940734863, + "rewards/rejected": -9.893750190734863, + "step": 6650 + }, + { + "epoch": 2.143558380944717, + "grad_norm": 1.8166299660938694, + "learning_rate": 4.6402124919510623e-07, + "logits/chosen": -0.343374639749527, + "logits/rejected": -0.6359313726425171, + "logps/chosen": -446.38751220703125, + "logps/rejected": -475.04998779296875, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.512719750404358, + "rewards/margins": 8.516406059265137, + "rewards/rejected": -10.038281440734863, + "step": 6660 + }, + { + "epoch": 2.1467771787237466, + "grad_norm": 2.0038915794637586, + "learning_rate": 4.632163554410817e-07, + "logits/chosen": -0.5651000738143921, + "logits/rejected": -0.686968982219696, + "logps/chosen": -425.6499938964844, + "logps/rejected": -451.54998779296875, + "loss": 0.0143, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.4739990234375, + "rewards/margins": 8.022656440734863, + "rewards/rejected": -9.495312690734863, + "step": 6670 + }, + { + "epoch": 2.149995976502776, + "grad_norm": 5.327334701196706, + "learning_rate": 4.624114616870573e-07, + "logits/chosen": -0.29594725370407104, + "logits/rejected": -0.586138904094696, + "logps/chosen": -412.82501220703125, + "logps/rejected": -397.75, + "loss": 0.0301, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.43182373046875, + "rewards/margins": 7.790625095367432, + "rewards/rejected": -9.224218368530273, + "step": 6680 + }, + { + "epoch": 2.153214774281806, + "grad_norm": 6.563652793607414, + "learning_rate": 4.6160656793303284e-07, + "logits/chosen": -0.3126892149448395, + "logits/rejected": -0.5355468988418579, + "logps/chosen": -441.125, + "logps/rejected": -454.7250061035156, + "loss": 0.0176, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.344793677330017, + "rewards/margins": 7.789843559265137, + "rewards/rejected": -9.135156631469727, + "step": 6690 + }, + { + "epoch": 2.1564335720608354, + "grad_norm": 2.5257347277310034, + "learning_rate": 4.608016741790083e-07, + "logits/chosen": -0.20808258652687073, + "logits/rejected": -0.41701048612594604, + "logps/chosen": -472.625, + "logps/rejected": -479.95001220703125, + "loss": 0.0186, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1083252429962158, + "rewards/margins": 8.517969131469727, + "rewards/rejected": -9.622655868530273, + "step": 6700 + }, + { + "epoch": 2.159652369839865, + "grad_norm": 8.081451493069439, + "learning_rate": 4.5999678042498386e-07, + "logits/chosen": -0.353759765625, + "logits/rejected": -0.566394031047821, + "logps/chosen": -460.2250061035156, + "logps/rejected": -487.57501220703125, + "loss": 0.0144, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.6987793445587158, + "rewards/margins": 8.356249809265137, + "rewards/rejected": -10.053906440734863, + "step": 6710 + }, + { + "epoch": 2.1628711676188943, + "grad_norm": 2.9803888057924692, + "learning_rate": 4.5919188667095945e-07, + "logits/chosen": -0.5055907964706421, + "logits/rejected": -0.641845703125, + "logps/chosen": -434.20001220703125, + "logps/rejected": -459.67498779296875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.071240186691284, + "rewards/margins": 8.150781631469727, + "rewards/rejected": -10.225000381469727, + "step": 6720 + }, + { + "epoch": 2.1660899653979238, + "grad_norm": 1.3161852929853655, + "learning_rate": 4.5838699291693493e-07, + "logits/chosen": -0.45517271757125854, + "logits/rejected": -0.5345489382743835, + "logps/chosen": -436.3500061035156, + "logps/rejected": -510.04998779296875, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2069945335388184, + "rewards/margins": 8.235937118530273, + "rewards/rejected": -10.443750381469727, + "step": 6730 + }, + { + "epoch": 2.169308763176953, + "grad_norm": 1.6383841060756084, + "learning_rate": 4.5758209916291047e-07, + "logits/chosen": -0.47478026151657104, + "logits/rejected": -0.7142699956893921, + "logps/chosen": -426.2250061035156, + "logps/rejected": -463.75, + "loss": 0.024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.494702100753784, + "rewards/margins": 8.260937690734863, + "rewards/rejected": -10.7578125, + "step": 6740 + }, + { + "epoch": 2.172527560955983, + "grad_norm": 5.102466569932819, + "learning_rate": 4.5677720540888596e-07, + "logits/chosen": -0.525256335735321, + "logits/rejected": -0.6061462163925171, + "logps/chosen": -429.25, + "logps/rejected": -483.42498779296875, + "loss": 0.0144, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.65643310546875, + "rewards/margins": 8.112500190734863, + "rewards/rejected": -9.766406059265137, + "step": 6750 + }, + { + "epoch": 2.1757463587350125, + "grad_norm": 3.369516843576246, + "learning_rate": 4.5597231165486154e-07, + "logits/chosen": -0.47481077909469604, + "logits/rejected": -0.690386950969696, + "logps/chosen": -446.1499938964844, + "logps/rejected": -456.04998779296875, + "loss": 0.0139, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.057568311691284, + "rewards/margins": 7.926562309265137, + "rewards/rejected": -9.982812881469727, + "step": 6760 + }, + { + "epoch": 2.178965156514042, + "grad_norm": 3.0613469426656783, + "learning_rate": 4.551674179008371e-07, + "logits/chosen": -0.4301605224609375, + "logits/rejected": -0.628063976764679, + "logps/chosen": -434.95001220703125, + "logps/rejected": -456.67498779296875, + "loss": 0.0133, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.810430884361267, + "rewards/margins": 8.171875, + "rewards/rejected": -9.98046875, + "step": 6770 + }, + { + "epoch": 2.1821839542930714, + "grad_norm": 4.307468931530203, + "learning_rate": 4.5436252414681257e-07, + "logits/chosen": -0.48350220918655396, + "logits/rejected": -0.7245117425918579, + "logps/chosen": -442.2749938964844, + "logps/rejected": -443.45001220703125, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4184386730194092, + "rewards/margins": 7.992968559265137, + "rewards/rejected": -9.418749809265137, + "step": 6780 + }, + { + "epoch": 2.185402752072101, + "grad_norm": 10.160887079800993, + "learning_rate": 4.535576303927881e-07, + "logits/chosen": -0.39216309785842896, + "logits/rejected": -0.6262573003768921, + "logps/chosen": -424.25, + "logps/rejected": -432.79998779296875, + "loss": 0.0122, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.430908203125, + "rewards/margins": 7.848437309265137, + "rewards/rejected": -9.284375190734863, + "step": 6790 + }, + { + "epoch": 2.1886215498511308, + "grad_norm": 0.6896350858753575, + "learning_rate": 4.527527366387637e-07, + "logits/chosen": -0.36879271268844604, + "logits/rejected": -0.5271972417831421, + "logps/chosen": -409.42498779296875, + "logps/rejected": -477.6000061035156, + "loss": 0.0134, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.7334716320037842, + "rewards/margins": 8.059374809265137, + "rewards/rejected": -9.799219131469727, + "step": 6800 + }, + { + "epoch": 2.1918403476301602, + "grad_norm": 3.9758978880958806, + "learning_rate": 4.519478428847392e-07, + "logits/chosen": -0.40953367948532104, + "logits/rejected": -0.5908203125, + "logps/chosen": -417.125, + "logps/rejected": -473.57501220703125, + "loss": 0.0109, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.9662353992462158, + "rewards/margins": 8.309374809265137, + "rewards/rejected": -10.27734375, + "step": 6810 + }, + { + "epoch": 2.1950591454091897, + "grad_norm": 12.863131160665333, + "learning_rate": 4.511429491307147e-07, + "logits/chosen": -0.41864013671875, + "logits/rejected": -0.557666003704071, + "logps/chosen": -455.2749938964844, + "logps/rejected": -477.20001220703125, + "loss": 0.03, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.6106445789337158, + "rewards/margins": 8.711718559265137, + "rewards/rejected": -10.325780868530273, + "step": 6820 + }, + { + "epoch": 2.198277943188219, + "grad_norm": 6.551549963385313, + "learning_rate": 4.5033805537669025e-07, + "logits/chosen": -0.57122802734375, + "logits/rejected": -0.589245617389679, + "logps/chosen": -425.2250061035156, + "logps/rejected": -469.95001220703125, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.960961937904358, + "rewards/margins": 8.095312118530273, + "rewards/rejected": -10.052343368530273, + "step": 6830 + }, + { + "epoch": 2.2014967409672486, + "grad_norm": 2.865914305322498, + "learning_rate": 4.495331616226658e-07, + "logits/chosen": -0.4785003662109375, + "logits/rejected": -0.650103747844696, + "logps/chosen": -425.29998779296875, + "logps/rejected": -462.67498779296875, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8685791492462158, + "rewards/margins": 7.992968559265137, + "rewards/rejected": -9.860156059265137, + "step": 6840 + }, + { + "epoch": 2.2047155387462785, + "grad_norm": 24.972691169807785, + "learning_rate": 4.487282678686413e-07, + "logits/chosen": -0.45643311738967896, + "logits/rejected": -0.599993884563446, + "logps/chosen": -456.4750061035156, + "logps/rejected": -491.20001220703125, + "loss": 0.0155, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.939233422279358, + "rewards/margins": 8.542187690734863, + "rewards/rejected": -10.489062309265137, + "step": 6850 + }, + { + "epoch": 2.207934336525308, + "grad_norm": 3.3081101481009, + "learning_rate": 4.4792337411461686e-07, + "logits/chosen": -0.5624359250068665, + "logits/rejected": -0.719451904296875, + "logps/chosen": -422.70001220703125, + "logps/rejected": -448.45001220703125, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.2920899391174316, + "rewards/margins": 8.030468940734863, + "rewards/rejected": -10.324999809265137, + "step": 6860 + }, + { + "epoch": 2.2111531343043374, + "grad_norm": 7.742675321816513, + "learning_rate": 4.4711848036059234e-07, + "logits/chosen": -0.2821106016635895, + "logits/rejected": -0.5432373285293579, + "logps/chosen": -449.125, + "logps/rejected": -468.42498779296875, + "loss": 0.0195, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.091992139816284, + "rewards/margins": 8.239062309265137, + "rewards/rejected": -10.324999809265137, + "step": 6870 + }, + { + "epoch": 2.214371932083367, + "grad_norm": 2.038012153080145, + "learning_rate": 4.4631358660656793e-07, + "logits/chosen": -0.5516113042831421, + "logits/rejected": -0.7692626714706421, + "logps/chosen": -403.57501220703125, + "logps/rejected": -432.57501220703125, + "loss": 0.0171, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.775488257408142, + "rewards/margins": 8.052343368530273, + "rewards/rejected": -9.830469131469727, + "step": 6880 + }, + { + "epoch": 2.2175907298623962, + "grad_norm": 1.2124659296751392, + "learning_rate": 4.4550869285254347e-07, + "logits/chosen": -0.36259764432907104, + "logits/rejected": -0.44916075468063354, + "logps/chosen": -460.2749938964844, + "logps/rejected": -471.75, + "loss": 0.0303, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.43817138671875, + "rewards/margins": 7.552343845367432, + "rewards/rejected": -8.986719131469727, + "step": 6890 + }, + { + "epoch": 2.2208095276414257, + "grad_norm": 0.6142716813514184, + "learning_rate": 4.4470379909851895e-07, + "logits/chosen": -0.45183104276657104, + "logits/rejected": -0.5668090581893921, + "logps/chosen": -435.875, + "logps/rejected": -488.45001220703125, + "loss": 0.0154, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9280273914337158, + "rewards/margins": 8.278905868530273, + "rewards/rejected": -10.200780868530273, + "step": 6900 + }, + { + "epoch": 2.2240283254204556, + "grad_norm": 3.933702193350116, + "learning_rate": 4.4389890534449454e-07, + "logits/chosen": -0.39494627714157104, + "logits/rejected": -0.651257336139679, + "logps/chosen": -415.7250061035156, + "logps/rejected": -462.79998779296875, + "loss": 0.0192, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0323729515075684, + "rewards/margins": 8.420312881469727, + "rewards/rejected": -10.446093559265137, + "step": 6910 + }, + { + "epoch": 2.227247123199485, + "grad_norm": 2.8388643011278183, + "learning_rate": 4.4309401159047e-07, + "logits/chosen": -0.42948609590530396, + "logits/rejected": -0.6564308404922485, + "logps/chosen": -403.17498779296875, + "logps/rejected": -449.0, + "loss": 0.0093, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.400830030441284, + "rewards/margins": 7.942187309265137, + "rewards/rejected": -10.337499618530273, + "step": 6920 + }, + { + "epoch": 2.2304659209785145, + "grad_norm": 7.208289058830082, + "learning_rate": 4.4228911783644556e-07, + "logits/chosen": -0.5129852294921875, + "logits/rejected": -0.6410506963729858, + "logps/chosen": -403.5375061035156, + "logps/rejected": -448.0249938964844, + "loss": 0.0174, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7335448265075684, + "rewards/margins": 8.119531631469727, + "rewards/rejected": -10.85546875, + "step": 6930 + }, + { + "epoch": 2.233684718757544, + "grad_norm": 8.037517489431174, + "learning_rate": 4.414842240824211e-07, + "logits/chosen": -0.414346307516098, + "logits/rejected": -0.5978149175643921, + "logps/chosen": -388.61248779296875, + "logps/rejected": -428.8500061035156, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0177245140075684, + "rewards/margins": 8.354687690734863, + "rewards/rejected": -10.379687309265137, + "step": 6940 + }, + { + "epoch": 2.2369035165365734, + "grad_norm": 2.4415595543961, + "learning_rate": 4.4067933032839664e-07, + "logits/chosen": -0.579821765422821, + "logits/rejected": -0.674487292766571, + "logps/chosen": -375.4750061035156, + "logps/rejected": -429.07501220703125, + "loss": 0.018, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.206127882003784, + "rewards/margins": 8.067968368530273, + "rewards/rejected": -10.27734375, + "step": 6950 + }, + { + "epoch": 2.2401223143156033, + "grad_norm": 3.5771038085164304, + "learning_rate": 4.3987443657437217e-07, + "logits/chosen": -0.36567384004592896, + "logits/rejected": -0.5666259527206421, + "logps/chosen": -420.67498779296875, + "logps/rejected": -481.2250061035156, + "loss": 0.0191, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.874609351158142, + "rewards/margins": 8.506250381469727, + "rewards/rejected": -10.384374618530273, + "step": 6960 + }, + { + "epoch": 2.2433411120946327, + "grad_norm": 1.6496011541154523, + "learning_rate": 4.390695428203477e-07, + "logits/chosen": -0.386383056640625, + "logits/rejected": -0.6001884341239929, + "logps/chosen": -485.375, + "logps/rejected": -494.6499938964844, + "loss": 0.0147, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.5666992664337158, + "rewards/margins": 8.836718559265137, + "rewards/rejected": -10.393750190734863, + "step": 6970 + }, + { + "epoch": 2.246559909873662, + "grad_norm": 20.85563937605079, + "learning_rate": 4.382646490663232e-07, + "logits/chosen": -0.3747432827949524, + "logits/rejected": -0.642163097858429, + "logps/chosen": -480.8999938964844, + "logps/rejected": -482.70001220703125, + "loss": 0.0136, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.3547852039337158, + "rewards/margins": 8.017969131469727, + "rewards/rejected": -9.375781059265137, + "step": 6980 + }, + { + "epoch": 2.2497787076526916, + "grad_norm": 4.199251270219436, + "learning_rate": 4.374597553122988e-07, + "logits/chosen": -0.33784788846969604, + "logits/rejected": -0.5712555050849915, + "logps/chosen": -447.95001220703125, + "logps/rejected": -452.6000061035156, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7818114757537842, + "rewards/margins": 7.881249904632568, + "rewards/rejected": -9.6640625, + "step": 6990 + }, + { + "epoch": 2.252997505431721, + "grad_norm": 10.683399812010224, + "learning_rate": 4.366548615582743e-07, + "logits/chosen": -0.4280029237270355, + "logits/rejected": -0.639965832233429, + "logps/chosen": -430.875, + "logps/rejected": -436.375, + "loss": 0.0117, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.6315186023712158, + "rewards/margins": 7.53515625, + "rewards/rejected": -9.164843559265137, + "step": 7000 + }, + { + "epoch": 2.256216303210751, + "grad_norm": 2.1259327412472144, + "learning_rate": 4.358499678042498e-07, + "logits/chosen": -0.3499999940395355, + "logits/rejected": -0.583264172077179, + "logps/chosen": -448.95001220703125, + "logps/rejected": -482.95001220703125, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.887963891029358, + "rewards/margins": 8.198437690734863, + "rewards/rejected": -10.096094131469727, + "step": 7010 + }, + { + "epoch": 2.2594351009897804, + "grad_norm": 34.75813639858462, + "learning_rate": 4.3504507405022534e-07, + "logits/chosen": -0.3892883360385895, + "logits/rejected": -0.5100952386856079, + "logps/chosen": -440.1499938964844, + "logps/rejected": -470.54998779296875, + "loss": 0.0145, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.663177490234375, + "rewards/margins": 8.093358993530273, + "rewards/rejected": -10.754687309265137, + "step": 7020 + }, + { + "epoch": 2.26265389876881, + "grad_norm": 12.867954392883277, + "learning_rate": 4.3424018029620093e-07, + "logits/chosen": -0.35641783475875854, + "logits/rejected": -0.5089477300643921, + "logps/chosen": -440.7250061035156, + "logps/rejected": -469.125, + "loss": 0.0221, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.352856397628784, + "rewards/margins": 7.935546875, + "rewards/rejected": -10.300000190734863, + "step": 7030 + }, + { + "epoch": 2.2658726965478393, + "grad_norm": 13.796053001707623, + "learning_rate": 4.334352865421764e-07, + "logits/chosen": -0.3868957459926605, + "logits/rejected": -0.5830017328262329, + "logps/chosen": -444.07501220703125, + "logps/rejected": -490.375, + "loss": 0.0211, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.2838134765625, + "rewards/margins": 8.257031440734863, + "rewards/rejected": -10.543749809265137, + "step": 7040 + }, + { + "epoch": 2.2690914943268687, + "grad_norm": 10.817657929980204, + "learning_rate": 4.3263039278815195e-07, + "logits/chosen": -0.47445982694625854, + "logits/rejected": -0.631579577922821, + "logps/chosen": -413.875, + "logps/rejected": -457.95001220703125, + "loss": 0.0171, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.607421875, + "rewards/margins": 7.644140720367432, + "rewards/rejected": -9.246874809265137, + "step": 7050 + }, + { + "epoch": 2.2723102921058986, + "grad_norm": 0.1986378748395385, + "learning_rate": 4.3182549903412743e-07, + "logits/chosen": -0.42711180448532104, + "logits/rejected": -0.52874755859375, + "logps/chosen": -413.5, + "logps/rejected": -461.29998779296875, + "loss": 0.011, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.8063476085662842, + "rewards/margins": 7.88671875, + "rewards/rejected": -9.689844131469727, + "step": 7060 + }, + { + "epoch": 2.275529089884928, + "grad_norm": 2.0457549768370122, + "learning_rate": 4.31020605280103e-07, + "logits/chosen": -0.384246826171875, + "logits/rejected": -0.5200439691543579, + "logps/chosen": -422.73748779296875, + "logps/rejected": -499.54998779296875, + "loss": 0.0223, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.590429663658142, + "rewards/margins": 8.306249618530273, + "rewards/rejected": -9.900781631469727, + "step": 7070 + }, + { + "epoch": 2.2787478876639575, + "grad_norm": 3.0872161511806406, + "learning_rate": 4.3021571152607856e-07, + "logits/chosen": -0.40046995878219604, + "logits/rejected": -0.6213623285293579, + "logps/chosen": -390.375, + "logps/rejected": -452.04998779296875, + "loss": 0.0081, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.6278502941131592, + "rewards/margins": 7.90625, + "rewards/rejected": -9.54296875, + "step": 7080 + }, + { + "epoch": 2.281966685442987, + "grad_norm": 0.6282470498113244, + "learning_rate": 4.2941081777205404e-07, + "logits/chosen": -0.38303834199905396, + "logits/rejected": -0.5684570074081421, + "logps/chosen": -458.2250061035156, + "logps/rejected": -448.5249938964844, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.560583472251892, + "rewards/margins": 8.1171875, + "rewards/rejected": -9.682031631469727, + "step": 7090 + }, + { + "epoch": 2.2851854832220164, + "grad_norm": 6.127606556892711, + "learning_rate": 4.286059240180296e-07, + "logits/chosen": -0.33380126953125, + "logits/rejected": -0.59716796875, + "logps/chosen": -432.4750061035156, + "logps/rejected": -436.45001220703125, + "loss": 0.0174, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.086840867996216, + "rewards/margins": 7.678124904632568, + "rewards/rejected": -9.760937690734863, + "step": 7100 + }, + { + "epoch": 2.2884042810010463, + "grad_norm": 5.794986173158012, + "learning_rate": 4.2780103026400517e-07, + "logits/chosen": -0.3682495057582855, + "logits/rejected": -0.563232421875, + "logps/chosen": -407.6000061035156, + "logps/rejected": -439.17498779296875, + "loss": 0.0145, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.212133765220642, + "rewards/margins": 7.938281059265137, + "rewards/rejected": -9.154687881469727, + "step": 7110 + }, + { + "epoch": 2.2916230787800758, + "grad_norm": 9.36827391522123, + "learning_rate": 4.2699613650998065e-07, + "logits/chosen": -0.37192994356155396, + "logits/rejected": -0.5132812261581421, + "logps/chosen": -460.45001220703125, + "logps/rejected": -464.82501220703125, + "loss": 0.0255, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.558837890625, + "rewards/margins": 7.618750095367432, + "rewards/rejected": -9.181249618530273, + "step": 7120 + }, + { + "epoch": 2.294841876559105, + "grad_norm": 2.00797735326429, + "learning_rate": 4.261912427559562e-07, + "logits/chosen": -0.289743036031723, + "logits/rejected": -0.44191282987594604, + "logps/chosen": -398.04998779296875, + "logps/rejected": -457.32501220703125, + "loss": 0.0346, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.880957007408142, + "rewards/margins": 7.824609279632568, + "rewards/rejected": -9.705469131469727, + "step": 7130 + }, + { + "epoch": 2.2980606743381347, + "grad_norm": 23.148878001804906, + "learning_rate": 4.2538634900193173e-07, + "logits/chosen": -0.4847778379917145, + "logits/rejected": -0.5170882940292358, + "logps/chosen": -385.9750061035156, + "logps/rejected": -457.79998779296875, + "loss": 0.0272, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.60162353515625, + "rewards/margins": 7.864843845367432, + "rewards/rejected": -9.463281631469727, + "step": 7140 + }, + { + "epoch": 2.301279472117164, + "grad_norm": 3.991288380245251, + "learning_rate": 4.2458145524790727e-07, + "logits/chosen": -0.46466064453125, + "logits/rejected": -0.697338879108429, + "logps/chosen": -439.2749938964844, + "logps/rejected": -459.92498779296875, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.973242163658142, + "rewards/margins": 7.76171875, + "rewards/rejected": -9.734375, + "step": 7150 + }, + { + "epoch": 2.304498269896194, + "grad_norm": 10.149750304584762, + "learning_rate": 4.237765614938828e-07, + "logits/chosen": -0.48176270723342896, + "logits/rejected": -0.7100585699081421, + "logps/chosen": -399.32501220703125, + "logps/rejected": -433.375, + "loss": 0.0314, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.7082030773162842, + "rewards/margins": 8.051172256469727, + "rewards/rejected": -9.759374618530273, + "step": 7160 + }, + { + "epoch": 2.3077170676752234, + "grad_norm": 1.8495107343833674, + "learning_rate": 4.2297166773985834e-07, + "logits/chosen": -0.3635009825229645, + "logits/rejected": -0.56182861328125, + "logps/chosen": -399.8500061035156, + "logps/rejected": -440.5, + "loss": 0.0238, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6438171863555908, + "rewards/margins": 8.005468368530273, + "rewards/rejected": -9.651562690734863, + "step": 7170 + }, + { + "epoch": 2.310935865454253, + "grad_norm": 4.493359902290196, + "learning_rate": 4.221667739858338e-07, + "logits/chosen": -0.36814576387405396, + "logits/rejected": -0.507800281047821, + "logps/chosen": -373.51251220703125, + "logps/rejected": -443.57501220703125, + "loss": 0.0241, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.654394507408142, + "rewards/margins": 7.639062404632568, + "rewards/rejected": -9.292187690734863, + "step": 7180 + }, + { + "epoch": 2.3141546632332823, + "grad_norm": 20.62715876595457, + "learning_rate": 4.213618802318094e-07, + "logits/chosen": -0.290090948343277, + "logits/rejected": -0.46826171875, + "logps/chosen": -433.54998779296875, + "logps/rejected": -479.04998779296875, + "loss": 0.0263, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.597497582435608, + "rewards/margins": 8.1640625, + "rewards/rejected": -9.764062881469727, + "step": 7190 + }, + { + "epoch": 2.317373461012312, + "grad_norm": 25.10654866506488, + "learning_rate": 4.2055698647778495e-07, + "logits/chosen": -0.4768920838832855, + "logits/rejected": -0.630780041217804, + "logps/chosen": -390.54998779296875, + "logps/rejected": -445.17498779296875, + "loss": 0.0089, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.750146508216858, + "rewards/margins": 8.471094131469727, + "rewards/rejected": -10.220312118530273, + "step": 7200 + }, + { + "epoch": 2.3205922587913417, + "grad_norm": 12.4320257655765, + "learning_rate": 4.1975209272376043e-07, + "logits/chosen": -0.520184338092804, + "logits/rejected": -0.702593982219696, + "logps/chosen": -428.0874938964844, + "logps/rejected": -451.1499938964844, + "loss": 0.0312, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.7038817405700684, + "rewards/margins": 8.146875381469727, + "rewards/rejected": -10.8515625, + "step": 7210 + }, + { + "epoch": 2.323811056570371, + "grad_norm": 2.619222534870835, + "learning_rate": 4.1894719896973597e-07, + "logits/chosen": -0.40863037109375, + "logits/rejected": -0.503509521484375, + "logps/chosen": -396.7250061035156, + "logps/rejected": -484.20001220703125, + "loss": 0.0338, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.446972608566284, + "rewards/margins": 8.346094131469727, + "rewards/rejected": -10.79296875, + "step": 7220 + }, + { + "epoch": 2.3270298543494006, + "grad_norm": 2.3922606416802066, + "learning_rate": 4.1814230521571156e-07, + "logits/chosen": -0.5074707269668579, + "logits/rejected": -0.7183593511581421, + "logps/chosen": -438.375, + "logps/rejected": -484.5249938964844, + "loss": 0.0133, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.961743116378784, + "rewards/margins": 8.282031059265137, + "rewards/rejected": -11.252344131469727, + "step": 7230 + }, + { + "epoch": 2.33024865212843, + "grad_norm": 44.933914868561274, + "learning_rate": 4.1733741146168704e-07, + "logits/chosen": -0.3880157470703125, + "logits/rejected": -0.5460052490234375, + "logps/chosen": -451.875, + "logps/rejected": -491.17498779296875, + "loss": 0.034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.677783250808716, + "rewards/margins": 8.067968368530273, + "rewards/rejected": -10.739843368530273, + "step": 7240 + }, + { + "epoch": 2.3334674499074595, + "grad_norm": 0.509398212607731, + "learning_rate": 4.165325177076626e-07, + "logits/chosen": -0.4426025450229645, + "logits/rejected": -0.6255859136581421, + "logps/chosen": -443.45001220703125, + "logps/rejected": -510.20001220703125, + "loss": 0.0176, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.4392333030700684, + "rewards/margins": 8.457812309265137, + "rewards/rejected": -10.899218559265137, + "step": 7250 + }, + { + "epoch": 2.336686247686489, + "grad_norm": 1.3481457686176435, + "learning_rate": 4.1572762395363806e-07, + "logits/chosen": -0.507220447063446, + "logits/rejected": -0.611773669719696, + "logps/chosen": -437.8999938964844, + "logps/rejected": -471.6499938964844, + "loss": 0.0114, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.395987033843994, + "rewards/margins": 8.080469131469727, + "rewards/rejected": -10.477343559265137, + "step": 7260 + }, + { + "epoch": 2.339905045465519, + "grad_norm": 1.630207762564614, + "learning_rate": 4.1492273019961365e-07, + "logits/chosen": -0.38604736328125, + "logits/rejected": -0.584729015827179, + "logps/chosen": -381.875, + "logps/rejected": -458.0249938964844, + "loss": 0.0251, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.1435546875, + "rewards/margins": 7.704687595367432, + "rewards/rejected": -9.846875190734863, + "step": 7270 + }, + { + "epoch": 2.3431238432445483, + "grad_norm": 0.7272799710302364, + "learning_rate": 4.141178364455892e-07, + "logits/chosen": -0.35747069120407104, + "logits/rejected": -0.43262940645217896, + "logps/chosen": -420.04998779296875, + "logps/rejected": -491.25, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6466186046600342, + "rewards/margins": 8.66015625, + "rewards/rejected": -10.307812690734863, + "step": 7280 + }, + { + "epoch": 2.3463426410235777, + "grad_norm": 11.031374206348824, + "learning_rate": 4.1331294269156467e-07, + "logits/chosen": -0.4498657286167145, + "logits/rejected": -0.5535827875137329, + "logps/chosen": -433.5249938964844, + "logps/rejected": -465.7250061035156, + "loss": 0.0199, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8692138195037842, + "rewards/margins": 8.366796493530273, + "rewards/rejected": -10.235156059265137, + "step": 7290 + }, + { + "epoch": 2.349561438802607, + "grad_norm": 30.5763011092367, + "learning_rate": 4.125080489375402e-07, + "logits/chosen": -0.47869873046875, + "logits/rejected": -0.657788097858429, + "logps/chosen": -385.92498779296875, + "logps/rejected": -433.5, + "loss": 0.0158, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.9554443359375, + "rewards/margins": 8.038281440734863, + "rewards/rejected": -9.995312690734863, + "step": 7300 + }, + { + "epoch": 2.3527802365816366, + "grad_norm": 8.90448040805336, + "learning_rate": 4.117031551835158e-07, + "logits/chosen": -0.4921875, + "logits/rejected": -0.639331042766571, + "logps/chosen": -436.125, + "logps/rejected": -488.20001220703125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8939208984375, + "rewards/margins": 8.805468559265137, + "rewards/rejected": -10.704687118530273, + "step": 7310 + }, + { + "epoch": 2.355999034360666, + "grad_norm": 7.850192042388279, + "learning_rate": 4.108982614294913e-07, + "logits/chosen": -0.4437713623046875, + "logits/rejected": -0.5233398675918579, + "logps/chosen": -458.2749938964844, + "logps/rejected": -501.7250061035156, + "loss": 0.0296, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.4422850608825684, + "rewards/margins": 8.215624809265137, + "rewards/rejected": -10.665624618530273, + "step": 7320 + }, + { + "epoch": 2.359217832139696, + "grad_norm": 24.460602831430812, + "learning_rate": 4.100933676754668e-07, + "logits/chosen": -0.5856689214706421, + "logits/rejected": -0.727246105670929, + "logps/chosen": -450.875, + "logps/rejected": -481.04998779296875, + "loss": 0.0104, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.78125, + "rewards/margins": 8.490625381469727, + "rewards/rejected": -11.271875381469727, + "step": 7330 + }, + { + "epoch": 2.3624366299187254, + "grad_norm": 2.8427357599651293, + "learning_rate": 4.0928847392144236e-07, + "logits/chosen": -0.3785400390625, + "logits/rejected": -0.6131225824356079, + "logps/chosen": -493.04998779296875, + "logps/rejected": -502.5, + "loss": 0.0161, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.4375, + "rewards/margins": 8.618749618530273, + "rewards/rejected": -11.063281059265137, + "step": 7340 + }, + { + "epoch": 2.365655427697755, + "grad_norm": 10.535260051583236, + "learning_rate": 4.084835801674179e-07, + "logits/chosen": -0.460845947265625, + "logits/rejected": -0.555682361125946, + "logps/chosen": -395.0, + "logps/rejected": -437.7749938964844, + "loss": 0.0157, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.9033691883087158, + "rewards/margins": 8.114843368530273, + "rewards/rejected": -10.026562690734863, + "step": 7350 + }, + { + "epoch": 2.3688742254767843, + "grad_norm": 2.775725600928586, + "learning_rate": 4.0767868641339343e-07, + "logits/chosen": -0.37542724609375, + "logits/rejected": -0.561047375202179, + "logps/chosen": -438.20001220703125, + "logps/rejected": -454.70001220703125, + "loss": 0.0211, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.4390380382537842, + "rewards/margins": 7.568749904632568, + "rewards/rejected": -9.004687309265137, + "step": 7360 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 19.532719651751307, + "learning_rate": 4.0687379265936897e-07, + "logits/chosen": -0.393798828125, + "logits/rejected": -0.5437988042831421, + "logps/chosen": -405.2749938964844, + "logps/rejected": -434.125, + "loss": 0.0209, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.067578077316284, + "rewards/margins": 7.96875, + "rewards/rejected": -10.041406631469727, + "step": 7370 + }, + { + "epoch": 2.3753118210348436, + "grad_norm": 3.7655752299359926, + "learning_rate": 4.0606889890534445e-07, + "logits/chosen": -0.564318835735321, + "logits/rejected": -0.8094726800918579, + "logps/chosen": -412.7749938964844, + "logps/rejected": -434.1499938964844, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.989038109779358, + "rewards/margins": 8.05078125, + "rewards/rejected": -10.045312881469727, + "step": 7380 + }, + { + "epoch": 2.378530618813873, + "grad_norm": 12.462780936586887, + "learning_rate": 4.0526400515132004e-07, + "logits/chosen": -0.48210448026657104, + "logits/rejected": -0.6438232660293579, + "logps/chosen": -419.23748779296875, + "logps/rejected": -474.9750061035156, + "loss": 0.0382, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.200939893722534, + "rewards/margins": 8.415624618530273, + "rewards/rejected": -10.616406440734863, + "step": 7390 + }, + { + "epoch": 2.3817494165929025, + "grad_norm": 0.9866067264638212, + "learning_rate": 4.044591113972955e-07, + "logits/chosen": -0.555651843547821, + "logits/rejected": -0.721362292766571, + "logps/chosen": -405.2749938964844, + "logps/rejected": -473.04998779296875, + "loss": 0.0095, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.9631836414337158, + "rewards/margins": 8.332812309265137, + "rewards/rejected": -10.29296875, + "step": 7400 + }, + { + "epoch": 2.384968214371932, + "grad_norm": 5.587516733451009, + "learning_rate": 4.0365421764327106e-07, + "logits/chosen": -0.4870666563510895, + "logits/rejected": -0.646563708782196, + "logps/chosen": -414.17498779296875, + "logps/rejected": -500.70001220703125, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7681884765625, + "rewards/margins": 8.62890625, + "rewards/rejected": -10.39453125, + "step": 7410 + }, + { + "epoch": 2.3881870121509614, + "grad_norm": 52.54082572869093, + "learning_rate": 4.028493238892466e-07, + "logits/chosen": -0.48471373319625854, + "logits/rejected": -0.612530529499054, + "logps/chosen": -432.625, + "logps/rejected": -482.6499938964844, + "loss": 0.0183, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1145081520080566, + "rewards/margins": 8.6875, + "rewards/rejected": -10.797656059265137, + "step": 7420 + }, + { + "epoch": 2.3914058099299913, + "grad_norm": 1.695874372698196, + "learning_rate": 4.0204443013522213e-07, + "logits/chosen": -0.4143920838832855, + "logits/rejected": -0.7276245355606079, + "logps/chosen": -440.79998779296875, + "logps/rejected": -441.6499938964844, + "loss": 0.0528, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.1766114234924316, + "rewards/margins": 8.142187118530273, + "rewards/rejected": -10.321874618530273, + "step": 7430 + }, + { + "epoch": 2.3946246077090207, + "grad_norm": 3.7799764663298356, + "learning_rate": 4.0123953638119767e-07, + "logits/chosen": -0.3882293701171875, + "logits/rejected": -0.7183593511581421, + "logps/chosen": -451.125, + "logps/rejected": -464.8500061035156, + "loss": 0.0178, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.2076659202575684, + "rewards/margins": 8.185155868530273, + "rewards/rejected": -10.382031440734863, + "step": 7440 + }, + { + "epoch": 2.39784340548805, + "grad_norm": 10.593231725654986, + "learning_rate": 4.004346426271732e-07, + "logits/chosen": -0.38385009765625, + "logits/rejected": -0.6704971194267273, + "logps/chosen": -443.1000061035156, + "logps/rejected": -452.4750061035156, + "loss": 0.0251, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.281176805496216, + "rewards/margins": 7.9296875, + "rewards/rejected": -10.210156440734863, + "step": 7450 + }, + { + "epoch": 2.4010622032670796, + "grad_norm": 5.633782085408329, + "learning_rate": 3.996297488731487e-07, + "logits/chosen": -0.50567626953125, + "logits/rejected": -0.6635986566543579, + "logps/chosen": -396.04998779296875, + "logps/rejected": -431.04998779296875, + "loss": 0.0304, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.341662645339966, + "rewards/margins": 7.609375, + "rewards/rejected": -9.952343940734863, + "step": 7460 + }, + { + "epoch": 2.404281001046109, + "grad_norm": 2.3011887348070617, + "learning_rate": 3.988248551191243e-07, + "logits/chosen": -0.3563232421875, + "logits/rejected": -0.581341564655304, + "logps/chosen": -518.875, + "logps/rejected": -519.7249755859375, + "loss": 0.0216, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.747167944908142, + "rewards/margins": 8.603906631469727, + "rewards/rejected": -10.346094131469727, + "step": 7470 + }, + { + "epoch": 2.407499798825139, + "grad_norm": 3.9864877437955935, + "learning_rate": 3.980199613650998e-07, + "logits/chosen": -0.3430542051792145, + "logits/rejected": -0.531970202922821, + "logps/chosen": -441.23748779296875, + "logps/rejected": -436.92498779296875, + "loss": 0.0324, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.623864769935608, + "rewards/margins": 8.206250190734863, + "rewards/rejected": -9.824999809265137, + "step": 7480 + }, + { + "epoch": 2.4107185966041684, + "grad_norm": 2.4677492549992124, + "learning_rate": 3.972150676110753e-07, + "logits/chosen": -0.5662902593612671, + "logits/rejected": -0.7221008539199829, + "logps/chosen": -418.17498779296875, + "logps/rejected": -415.625, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.815209984779358, + "rewards/margins": 7.750781059265137, + "rewards/rejected": -9.5703125, + "step": 7490 + }, + { + "epoch": 2.413937394383198, + "grad_norm": 7.117879445296361, + "learning_rate": 3.9641017385705084e-07, + "logits/chosen": -0.3782592713832855, + "logits/rejected": -0.5306060910224915, + "logps/chosen": -411.42498779296875, + "logps/rejected": -455.07501220703125, + "loss": 0.0274, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.837744116783142, + "rewards/margins": 8.269140243530273, + "rewards/rejected": -10.106249809265137, + "step": 7500 + }, + { + "epoch": 2.4171561921622273, + "grad_norm": 7.21464967134009, + "learning_rate": 3.9560528010302643e-07, + "logits/chosen": -0.4977706968784332, + "logits/rejected": -0.6375366449356079, + "logps/chosen": -429.1499938964844, + "logps/rejected": -472.57501220703125, + "loss": 0.0086, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.040966749191284, + "rewards/margins": 8.668749809265137, + "rewards/rejected": -10.70703125, + "step": 7510 + }, + { + "epoch": 2.4203749899412568, + "grad_norm": 2.11332947869915, + "learning_rate": 3.948003863490019e-07, + "logits/chosen": -0.4921096861362457, + "logits/rejected": -0.6949462890625, + "logps/chosen": -408.67498779296875, + "logps/rejected": -445.625, + "loss": 0.0167, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.092041015625, + "rewards/margins": 7.992968559265137, + "rewards/rejected": -10.08203125, + "step": 7520 + }, + { + "epoch": 2.4235937877202867, + "grad_norm": 6.644341191342752, + "learning_rate": 3.9399549259497745e-07, + "logits/chosen": -0.4420166015625, + "logits/rejected": -0.547778308391571, + "logps/chosen": -425.79998779296875, + "logps/rejected": -452.7749938964844, + "loss": 0.0236, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.015673875808716, + "rewards/margins": 7.813281059265137, + "rewards/rejected": -9.831250190734863, + "step": 7530 + }, + { + "epoch": 2.426812585499316, + "grad_norm": 2.024208172177011, + "learning_rate": 3.9319059884095293e-07, + "logits/chosen": -0.4680725038051605, + "logits/rejected": -0.670483410358429, + "logps/chosen": -424.54998779296875, + "logps/rejected": -483.17498779296875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.267285108566284, + "rewards/margins": 8.867968559265137, + "rewards/rejected": -11.129687309265137, + "step": 7540 + }, + { + "epoch": 2.4300313832783456, + "grad_norm": 1.8652440655400429, + "learning_rate": 3.923857050869285e-07, + "logits/chosen": -0.518725574016571, + "logits/rejected": -0.6346435546875, + "logps/chosen": -431.2749938964844, + "logps/rejected": -488.79998779296875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.823583960533142, + "rewards/margins": 8.642969131469727, + "rewards/rejected": -10.46875, + "step": 7550 + }, + { + "epoch": 2.433250181057375, + "grad_norm": 4.531675059774994, + "learning_rate": 3.9158081133290406e-07, + "logits/chosen": -0.49607545137405396, + "logits/rejected": -0.6980682611465454, + "logps/chosen": -442.92498779296875, + "logps/rejected": -465.8999938964844, + "loss": 0.0165, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.838525414466858, + "rewards/margins": 8.175000190734863, + "rewards/rejected": -10.015625, + "step": 7560 + }, + { + "epoch": 2.4364689788364045, + "grad_norm": 1.4652218895650413, + "learning_rate": 3.9077591757887954e-07, + "logits/chosen": -0.448934942483902, + "logits/rejected": -0.540203869342804, + "logps/chosen": -427.9624938964844, + "logps/rejected": -475.25, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9446289539337158, + "rewards/margins": 8.234375, + "rewards/rejected": -10.178906440734863, + "step": 7570 + }, + { + "epoch": 2.4396877766154343, + "grad_norm": 20.567812812207183, + "learning_rate": 3.899710238248551e-07, + "logits/chosen": -0.4322700500488281, + "logits/rejected": -0.5925964117050171, + "logps/chosen": -447.2749938964844, + "logps/rejected": -455.20001220703125, + "loss": 0.0226, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5669677257537842, + "rewards/margins": 8.107030868530273, + "rewards/rejected": -9.677343368530273, + "step": 7580 + }, + { + "epoch": 2.442906574394464, + "grad_norm": 1.326433049325909, + "learning_rate": 3.8916613007083067e-07, + "logits/chosen": -0.45035094022750854, + "logits/rejected": -0.69818115234375, + "logps/chosen": -479.9750061035156, + "logps/rejected": -490.5, + "loss": 0.007, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.253015160560608, + "rewards/margins": 8.867968559265137, + "rewards/rejected": -10.123437881469727, + "step": 7590 + }, + { + "epoch": 2.4461253721734932, + "grad_norm": 74.5154646267558, + "learning_rate": 3.8836123631680615e-07, + "logits/chosen": -0.29710692167282104, + "logits/rejected": -0.45646971464157104, + "logps/chosen": -463.0, + "logps/rejected": -498.8500061035156, + "loss": 0.0243, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.7626221179962158, + "rewards/margins": 8.042187690734863, + "rewards/rejected": -9.810155868530273, + "step": 7600 + }, + { + "epoch": 2.4493441699525227, + "grad_norm": 1.9476971958907818, + "learning_rate": 3.875563425627817e-07, + "logits/chosen": -0.44923096895217896, + "logits/rejected": -0.629589855670929, + "logps/chosen": -463.45001220703125, + "logps/rejected": -500.70001220703125, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.585888624191284, + "rewards/margins": 8.08203125, + "rewards/rejected": -10.671093940734863, + "step": 7610 + }, + { + "epoch": 2.452562967731552, + "grad_norm": 3.7723014721212165, + "learning_rate": 3.867514488087572e-07, + "logits/chosen": -0.562939465045929, + "logits/rejected": -0.66937255859375, + "logps/chosen": -395.54998779296875, + "logps/rejected": -463.1499938964844, + "loss": 0.013, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.590039014816284, + "rewards/margins": 8.197656631469727, + "rewards/rejected": -10.789843559265137, + "step": 7620 + }, + { + "epoch": 2.455781765510582, + "grad_norm": 3.829233556127988, + "learning_rate": 3.8594655505473276e-07, + "logits/chosen": -0.4648193418979645, + "logits/rejected": -0.6399170160293579, + "logps/chosen": -429.70001220703125, + "logps/rejected": -464.45001220703125, + "loss": 0.0187, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.6805663108825684, + "rewards/margins": 8.875781059265137, + "rewards/rejected": -11.551562309265137, + "step": 7630 + }, + { + "epoch": 2.4590005632896115, + "grad_norm": 11.023401521353337, + "learning_rate": 3.851416613007083e-07, + "logits/chosen": -0.4936889708042145, + "logits/rejected": -0.667041003704071, + "logps/chosen": -427.79998779296875, + "logps/rejected": -462.7749938964844, + "loss": 0.0205, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.357177734375, + "rewards/margins": 8.589062690734863, + "rewards/rejected": -10.947656631469727, + "step": 7640 + }, + { + "epoch": 2.462219361068641, + "grad_norm": 1.7699479743684028, + "learning_rate": 3.8433676754668384e-07, + "logits/chosen": -0.6031860113143921, + "logits/rejected": -0.7352660894393921, + "logps/chosen": -440.25, + "logps/rejected": -491.75, + "loss": 0.009, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.7478270530700684, + "rewards/margins": 8.086718559265137, + "rewards/rejected": -10.831250190734863, + "step": 7650 + }, + { + "epoch": 2.4654381588476704, + "grad_norm": 7.372694640733425, + "learning_rate": 3.835318737926593e-07, + "logits/chosen": -0.500903308391571, + "logits/rejected": -0.560620129108429, + "logps/chosen": -400.04998779296875, + "logps/rejected": -481.0249938964844, + "loss": 0.032, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.0959715843200684, + "rewards/margins": 7.91015625, + "rewards/rejected": -11.0078125, + "step": 7660 + }, + { + "epoch": 2.4686569566267, + "grad_norm": 2.381811684738555, + "learning_rate": 3.827269800386349e-07, + "logits/chosen": -0.515332043170929, + "logits/rejected": -0.6778808832168579, + "logps/chosen": -432.7250061035156, + "logps/rejected": -463.75, + "loss": 0.0142, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.3944334983825684, + "rewards/margins": 8.845312118530273, + "rewards/rejected": -11.243749618530273, + "step": 7670 + }, + { + "epoch": 2.4718757544057297, + "grad_norm": 0.595200207298219, + "learning_rate": 3.8192208628461045e-07, + "logits/chosen": -0.6296844482421875, + "logits/rejected": -0.746960461139679, + "logps/chosen": -417.1499938964844, + "logps/rejected": -486.3999938964844, + "loss": 0.0184, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0169677734375, + "rewards/margins": 9.264062881469727, + "rewards/rejected": -11.279687881469727, + "step": 7680 + }, + { + "epoch": 2.475094552184759, + "grad_norm": 1.6407925231451088, + "learning_rate": 3.8111719253058593e-07, + "logits/chosen": -0.45265501737594604, + "logits/rejected": -0.5492919683456421, + "logps/chosen": -394.5874938964844, + "logps/rejected": -464.17498779296875, + "loss": 0.0332, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.9798095226287842, + "rewards/margins": 7.899218559265137, + "rewards/rejected": -9.879687309265137, + "step": 7690 + }, + { + "epoch": 2.4783133499637886, + "grad_norm": 0.29973353016715554, + "learning_rate": 3.8031229877656147e-07, + "logits/chosen": -0.5266479253768921, + "logits/rejected": -0.6513916254043579, + "logps/chosen": -417.29998779296875, + "logps/rejected": -502.79998779296875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.858129858970642, + "rewards/margins": 8.807812690734863, + "rewards/rejected": -10.663281440734863, + "step": 7700 + }, + { + "epoch": 2.481532147742818, + "grad_norm": 8.932263424039375, + "learning_rate": 3.7950740502253706e-07, + "logits/chosen": -0.6021728515625, + "logits/rejected": -0.7928466796875, + "logps/chosen": -435.32501220703125, + "logps/rejected": -461.5, + "loss": 0.0113, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.649011254310608, + "rewards/margins": 8.3984375, + "rewards/rejected": -10.046093940734863, + "step": 7710 + }, + { + "epoch": 2.4847509455218475, + "grad_norm": 35.65779369800102, + "learning_rate": 3.7870251126851254e-07, + "logits/chosen": -0.5266968011856079, + "logits/rejected": -0.7280517816543579, + "logps/chosen": -429.0249938964844, + "logps/rejected": -479.17498779296875, + "loss": 0.0223, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.249438524246216, + "rewards/margins": 8.149999618530273, + "rewards/rejected": -10.392187118530273, + "step": 7720 + }, + { + "epoch": 2.487969743300877, + "grad_norm": 16.811844207599407, + "learning_rate": 3.778976175144881e-07, + "logits/chosen": -0.5251404047012329, + "logits/rejected": -0.6989990472793579, + "logps/chosen": -438.0249938964844, + "logps/rejected": -460.375, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1163573265075684, + "rewards/margins": 8.41796875, + "rewards/rejected": -10.52734375, + "step": 7730 + }, + { + "epoch": 2.491188541079907, + "grad_norm": 63.44575925224733, + "learning_rate": 3.7709272376046356e-07, + "logits/chosen": -0.6395004391670227, + "logits/rejected": -0.7576538324356079, + "logps/chosen": -398.5249938964844, + "logps/rejected": -461.8999938964844, + "loss": 0.0117, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.325146436691284, + "rewards/margins": 8.056249618530273, + "rewards/rejected": -10.385156631469727, + "step": 7740 + }, + { + "epoch": 2.4944073388589363, + "grad_norm": 0.8302862601454284, + "learning_rate": 3.7628783000643915e-07, + "logits/chosen": -0.505755603313446, + "logits/rejected": -0.6180175542831421, + "logps/chosen": -441.625, + "logps/rejected": -460.92498779296875, + "loss": 0.0286, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.736242651939392, + "rewards/margins": 8.061718940734863, + "rewards/rejected": -9.792577743530273, + "step": 7750 + }, + { + "epoch": 2.4976261366379657, + "grad_norm": 1.1149977090299577, + "learning_rate": 3.754829362524147e-07, + "logits/chosen": -0.4617553651332855, + "logits/rejected": -0.690106213092804, + "logps/chosen": -438.7875061035156, + "logps/rejected": -475.20001220703125, + "loss": 0.0099, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.865576148033142, + "rewards/margins": 8.221094131469727, + "rewards/rejected": -10.091405868530273, + "step": 7760 + }, + { + "epoch": 2.500844934416995, + "grad_norm": 8.596955295508069, + "learning_rate": 3.7467804249839017e-07, + "logits/chosen": -0.712170422077179, + "logits/rejected": -0.780914306640625, + "logps/chosen": -372.38751220703125, + "logps/rejected": -451.1000061035156, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.080273389816284, + "rewards/margins": 7.880468845367432, + "rewards/rejected": -9.962499618530273, + "step": 7770 + }, + { + "epoch": 2.5040637321960246, + "grad_norm": 0.722078391660882, + "learning_rate": 3.738731487443657e-07, + "logits/chosen": -0.46213990449905396, + "logits/rejected": -0.634533703327179, + "logps/chosen": -420.6499938964844, + "logps/rejected": -471.45001220703125, + "loss": 0.0236, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.94244384765625, + "rewards/margins": 8.267187118530273, + "rewards/rejected": -10.204687118530273, + "step": 7780 + }, + { + "epoch": 2.507282529975054, + "grad_norm": 2.219943439280917, + "learning_rate": 3.730682549903413e-07, + "logits/chosen": -0.5098022222518921, + "logits/rejected": -0.686846911907196, + "logps/chosen": -440.0249938964844, + "logps/rejected": -472.75, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.026623487472534, + "rewards/margins": 8.435155868530273, + "rewards/rejected": -10.453906059265137, + "step": 7790 + }, + { + "epoch": 2.510501327754084, + "grad_norm": 59.83744254862081, + "learning_rate": 3.722633612363168e-07, + "logits/chosen": -0.4461669921875, + "logits/rejected": -0.5809173583984375, + "logps/chosen": -406.8500061035156, + "logps/rejected": -486.0, + "loss": 0.0143, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5107665061950684, + "rewards/margins": 8.496874809265137, + "rewards/rejected": -11.006250381469727, + "step": 7800 + }, + { + "epoch": 2.5137201255331134, + "grad_norm": 3.7033401045098993, + "learning_rate": 3.714584674822923e-07, + "logits/chosen": -0.46177977323532104, + "logits/rejected": -0.5803161859512329, + "logps/chosen": -417.79998779296875, + "logps/rejected": -464.4750061035156, + "loss": 0.0327, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.0412840843200684, + "rewards/margins": 8.092968940734863, + "rewards/rejected": -10.137499809265137, + "step": 7810 + }, + { + "epoch": 2.516938923312143, + "grad_norm": 9.816849798833326, + "learning_rate": 3.7065357372826785e-07, + "logits/chosen": -0.616894543170929, + "logits/rejected": -0.794262707233429, + "logps/chosen": -433.75, + "logps/rejected": -493.92498779296875, + "loss": 0.0108, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.97412109375, + "rewards/margins": 8.196874618530273, + "rewards/rejected": -11.176562309265137, + "step": 7820 + }, + { + "epoch": 2.5201577210911723, + "grad_norm": 2.5361428899747906, + "learning_rate": 3.698486799742434e-07, + "logits/chosen": -0.584729015827179, + "logits/rejected": -0.777972400188446, + "logps/chosen": -475.6499938964844, + "logps/rejected": -480.70001220703125, + "loss": 0.0127, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.3856201171875, + "rewards/margins": 8.266406059265137, + "rewards/rejected": -10.658594131469727, + "step": 7830 + }, + { + "epoch": 2.5233765188702018, + "grad_norm": 2.2709610255827735, + "learning_rate": 3.6904378622021893e-07, + "logits/chosen": -0.3979858458042145, + "logits/rejected": -0.601116955280304, + "logps/chosen": -432.875, + "logps/rejected": -479.1499938964844, + "loss": 0.0334, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.16455078125, + "rewards/margins": 8.407812118530273, + "rewards/rejected": -10.575780868530273, + "step": 7840 + }, + { + "epoch": 2.5265953166492316, + "grad_norm": 1.0069481961307984, + "learning_rate": 3.6823889246619447e-07, + "logits/chosen": -0.47033387422561646, + "logits/rejected": -0.6783813238143921, + "logps/chosen": -370.125, + "logps/rejected": -445.7250061035156, + "loss": 0.0201, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.966821312904358, + "rewards/margins": 8.600781440734863, + "rewards/rejected": -10.57421875, + "step": 7850 + }, + { + "epoch": 2.529814114428261, + "grad_norm": 32.55214650863085, + "learning_rate": 3.6743399871216995e-07, + "logits/chosen": -0.521954357624054, + "logits/rejected": -0.6884124875068665, + "logps/chosen": -460.9750061035156, + "logps/rejected": -509.1000061035156, + "loss": 0.0254, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.751538038253784, + "rewards/margins": 9.061718940734863, + "rewards/rejected": -11.815625190734863, + "step": 7860 + }, + { + "epoch": 2.5330329122072905, + "grad_norm": 44.795954130588434, + "learning_rate": 3.6662910495814554e-07, + "logits/chosen": -0.5592285394668579, + "logits/rejected": -0.701922595500946, + "logps/chosen": -429.2749938964844, + "logps/rejected": -471.3500061035156, + "loss": 0.0259, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.759033203125, + "rewards/margins": 8.557812690734863, + "rewards/rejected": -11.317187309265137, + "step": 7870 + }, + { + "epoch": 2.53625170998632, + "grad_norm": 12.782550318276964, + "learning_rate": 3.65824211204121e-07, + "logits/chosen": -0.520416259765625, + "logits/rejected": -0.686840832233429, + "logps/chosen": -424.07501220703125, + "logps/rejected": -493.5249938964844, + "loss": 0.0087, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.611132860183716, + "rewards/margins": 8.776562690734863, + "rewards/rejected": -11.384374618530273, + "step": 7880 + }, + { + "epoch": 2.5394705077653494, + "grad_norm": 19.13814045542479, + "learning_rate": 3.6501931745009656e-07, + "logits/chosen": -0.3374877870082855, + "logits/rejected": -0.6713714599609375, + "logps/chosen": -432.11248779296875, + "logps/rejected": -432.875, + "loss": 0.0298, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.501879930496216, + "rewards/margins": 8.036718368530273, + "rewards/rejected": -10.532812118530273, + "step": 7890 + }, + { + "epoch": 2.5426893055443793, + "grad_norm": 2.152569759085124, + "learning_rate": 3.642144236960721e-07, + "logits/chosen": -0.48200684785842896, + "logits/rejected": -0.722582995891571, + "logps/chosen": -405.48748779296875, + "logps/rejected": -456.54998779296875, + "loss": 0.0248, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.7294678688049316, + "rewards/margins": 8.253125190734863, + "rewards/rejected": -10.982030868530273, + "step": 7900 + }, + { + "epoch": 2.5459081033234088, + "grad_norm": 1.0691068006467928, + "learning_rate": 3.6340952994204763e-07, + "logits/chosen": -0.6143859624862671, + "logits/rejected": -0.7857666015625, + "logps/chosen": -449.25, + "logps/rejected": -463.54998779296875, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.408447265625, + "rewards/margins": 8.776562690734863, + "rewards/rejected": -11.19140625, + "step": 7910 + }, + { + "epoch": 2.5491269011024382, + "grad_norm": 2.114873147073925, + "learning_rate": 3.6260463618802317e-07, + "logits/chosen": -0.5562744140625, + "logits/rejected": -0.729510486125946, + "logps/chosen": -433.6000061035156, + "logps/rejected": -468.45001220703125, + "loss": 0.0192, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8245606422424316, + "rewards/margins": 8.603124618530273, + "rewards/rejected": -11.431249618530273, + "step": 7920 + }, + { + "epoch": 2.5523456988814677, + "grad_norm": 2.9139758063958996, + "learning_rate": 3.617997424339987e-07, + "logits/chosen": -0.544848620891571, + "logits/rejected": -0.6393371820449829, + "logps/chosen": -418.95001220703125, + "logps/rejected": -472.70001220703125, + "loss": 0.0174, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.081005811691284, + "rewards/margins": 8.25390625, + "rewards/rejected": -10.3359375, + "step": 7930 + }, + { + "epoch": 2.555564496660497, + "grad_norm": 1.6986593056209538, + "learning_rate": 3.609948486799742e-07, + "logits/chosen": -0.5699707269668579, + "logits/rejected": -0.7229980230331421, + "logps/chosen": -454.82501220703125, + "logps/rejected": -506.3500061035156, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.527026414871216, + "rewards/margins": 8.739062309265137, + "rewards/rejected": -11.267969131469727, + "step": 7940 + }, + { + "epoch": 2.558783294439527, + "grad_norm": 10.959363856854024, + "learning_rate": 3.601899549259498e-07, + "logits/chosen": -0.533764660358429, + "logits/rejected": -0.75018310546875, + "logps/chosen": -442.75, + "logps/rejected": -449.6499938964844, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2103514671325684, + "rewards/margins": 8.139062881469727, + "rewards/rejected": -10.3515625, + "step": 7950 + }, + { + "epoch": 2.5620020922185565, + "grad_norm": 3.810352094910954, + "learning_rate": 3.593850611719253e-07, + "logits/chosen": -0.4163452088832855, + "logits/rejected": -0.618701159954071, + "logps/chosen": -424.75, + "logps/rejected": -466.54998779296875, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.562829613685608, + "rewards/margins": 8.646093368530273, + "rewards/rejected": -10.205469131469727, + "step": 7960 + }, + { + "epoch": 2.565220889997586, + "grad_norm": 4.199196483072507, + "learning_rate": 3.585801674179008e-07, + "logits/chosen": -0.560803234577179, + "logits/rejected": -0.5821533203125, + "logps/chosen": -405.0, + "logps/rejected": -464.75, + "loss": 0.012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6627929210662842, + "rewards/margins": 8.603906631469727, + "rewards/rejected": -10.26953125, + "step": 7970 + }, + { + "epoch": 2.5684396877766154, + "grad_norm": 0.5622130767360719, + "learning_rate": 3.5777527366387634e-07, + "logits/chosen": -0.580029308795929, + "logits/rejected": -0.7919677495956421, + "logps/chosen": -426.7749938964844, + "logps/rejected": -480.8500061035156, + "loss": 0.012, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.6148865222930908, + "rewards/margins": 8.899999618530273, + "rewards/rejected": -10.5078125, + "step": 7980 + }, + { + "epoch": 2.571658485555645, + "grad_norm": 6.684098280470894, + "learning_rate": 3.569703799098519e-07, + "logits/chosen": -0.48609620332717896, + "logits/rejected": -0.587664783000946, + "logps/chosen": -399.07501220703125, + "logps/rejected": -460.82501220703125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3758118152618408, + "rewards/margins": 8.55859375, + "rewards/rejected": -9.944531440734863, + "step": 7990 + }, + { + "epoch": 2.5748772833346747, + "grad_norm": 0.5645176023866494, + "learning_rate": 3.561654861558274e-07, + "logits/chosen": -0.4273437559604645, + "logits/rejected": -0.6278320550918579, + "logps/chosen": -425.57501220703125, + "logps/rejected": -458.0, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.649682641029358, + "rewards/margins": 8.348437309265137, + "rewards/rejected": -10.0, + "step": 8000 + }, + { + "epoch": 2.578096081113704, + "grad_norm": 2.1269415625420125, + "learning_rate": 3.5536059240180295e-07, + "logits/chosen": -0.49260252714157104, + "logits/rejected": -0.659820556640625, + "logps/chosen": -435.9750061035156, + "logps/rejected": -481.92498779296875, + "loss": 0.0152, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.094531297683716, + "rewards/margins": 8.21875, + "rewards/rejected": -10.310155868530273, + "step": 8010 + }, + { + "epoch": 2.5813148788927336, + "grad_norm": 3.634194804625366, + "learning_rate": 3.5455569864777843e-07, + "logits/chosen": -0.572399914264679, + "logits/rejected": -0.6343017816543579, + "logps/chosen": -419.1875, + "logps/rejected": -497.375, + "loss": 0.0117, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.106579542160034, + "rewards/margins": 8.764843940734863, + "rewards/rejected": -10.871874809265137, + "step": 8020 + }, + { + "epoch": 2.584533676671763, + "grad_norm": 2.0647810893019583, + "learning_rate": 3.53750804893754e-07, + "logits/chosen": -0.5850585699081421, + "logits/rejected": -0.763378918170929, + "logps/chosen": -406.8999938964844, + "logps/rejected": -451.70001220703125, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3123536109924316, + "rewards/margins": 8.714062690734863, + "rewards/rejected": -11.0234375, + "step": 8030 + }, + { + "epoch": 2.5877524744507925, + "grad_norm": 1.1031934549992526, + "learning_rate": 3.5294591113972956e-07, + "logits/chosen": -0.674731433391571, + "logits/rejected": -0.6859496831893921, + "logps/chosen": -446.67498779296875, + "logps/rejected": -487.8999938964844, + "loss": 0.0183, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.095288038253784, + "rewards/margins": 8.533594131469727, + "rewards/rejected": -11.6328125, + "step": 8040 + }, + { + "epoch": 2.5909712722298224, + "grad_norm": 7.617571887756976, + "learning_rate": 3.5214101738570504e-07, + "logits/chosen": -0.5128113031387329, + "logits/rejected": -0.6702941656112671, + "logps/chosen": -412.3125, + "logps/rejected": -472.1000061035156, + "loss": 0.0335, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.515063524246216, + "rewards/margins": 8.571874618530273, + "rewards/rejected": -12.089062690734863, + "step": 8050 + }, + { + "epoch": 2.594190070008852, + "grad_norm": 2.320510511086523, + "learning_rate": 3.513361236316806e-07, + "logits/chosen": -0.6319824457168579, + "logits/rejected": -0.808667004108429, + "logps/chosen": -435.625, + "logps/rejected": -479.45001220703125, + "loss": 0.0198, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.2982177734375, + "rewards/margins": 8.6953125, + "rewards/rejected": -12.00390625, + "step": 8060 + }, + { + "epoch": 2.5974088677878813, + "grad_norm": 104.66437525921206, + "learning_rate": 3.5053122987765617e-07, + "logits/chosen": -0.6006225347518921, + "logits/rejected": -0.7987060546875, + "logps/chosen": -452.6499938964844, + "logps/rejected": -504.1499938964844, + "loss": 0.0184, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.148852586746216, + "rewards/margins": 8.819531440734863, + "rewards/rejected": -11.96484375, + "step": 8070 + }, + { + "epoch": 2.6006276655669107, + "grad_norm": 3.512273500768897, + "learning_rate": 3.4972633612363165e-07, + "logits/chosen": -0.6493285894393921, + "logits/rejected": -0.669384777545929, + "logps/chosen": -401.95001220703125, + "logps/rejected": -487.0, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7943358421325684, + "rewards/margins": 9.000781059265137, + "rewards/rejected": -11.796093940734863, + "step": 8080 + }, + { + "epoch": 2.60384646334594, + "grad_norm": 22.943067629389756, + "learning_rate": 3.489214423696072e-07, + "logits/chosen": -0.61773681640625, + "logits/rejected": -0.8504699468612671, + "logps/chosen": -485.1875, + "logps/rejected": -509.3999938964844, + "loss": 0.0146, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.434600830078125, + "rewards/margins": 8.667187690734863, + "rewards/rejected": -11.106249809265137, + "step": 8090 + }, + { + "epoch": 2.60706526112497, + "grad_norm": 1.974581190440241, + "learning_rate": 3.481165486155827e-07, + "logits/chosen": -0.491293340921402, + "logits/rejected": -0.712207019329071, + "logps/chosen": -406.07501220703125, + "logps/rejected": -467.8500061035156, + "loss": 0.0161, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.681445360183716, + "rewards/margins": 8.06640625, + "rewards/rejected": -10.755468368530273, + "step": 8100 + }, + { + "epoch": 2.6102840589039995, + "grad_norm": 1.876793591256155, + "learning_rate": 3.4731165486155826e-07, + "logits/chosen": -0.657696545124054, + "logits/rejected": -0.775927722454071, + "logps/chosen": -430.17498779296875, + "logps/rejected": -513.0999755859375, + "loss": 0.0105, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.9898924827575684, + "rewards/margins": 8.839062690734863, + "rewards/rejected": -11.827343940734863, + "step": 8110 + }, + { + "epoch": 2.613502856683029, + "grad_norm": 1.4886231399844456, + "learning_rate": 3.465067611075338e-07, + "logits/chosen": -0.5089675784111023, + "logits/rejected": -0.8973633050918579, + "logps/chosen": -474.67498779296875, + "logps/rejected": -461.3500061035156, + "loss": 0.0255, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.92236328125, + "rewards/margins": 8.681249618530273, + "rewards/rejected": -11.610937118530273, + "step": 8120 + }, + { + "epoch": 2.6167216544620584, + "grad_norm": 6.157500800295626, + "learning_rate": 3.4570186735350933e-07, + "logits/chosen": -0.560620129108429, + "logits/rejected": -0.778027355670929, + "logps/chosen": -434.32501220703125, + "logps/rejected": -493.5249938964844, + "loss": 0.0156, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.02398681640625, + "rewards/margins": 9.140625, + "rewards/rejected": -12.162500381469727, + "step": 8130 + }, + { + "epoch": 2.619940452241088, + "grad_norm": 1.343431760933771, + "learning_rate": 3.448969735994848e-07, + "logits/chosen": -0.6454010009765625, + "logits/rejected": -0.8866943120956421, + "logps/chosen": -432.3999938964844, + "logps/rejected": -489.29998779296875, + "loss": 0.0141, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.140576124191284, + "rewards/margins": 8.750781059265137, + "rewards/rejected": -11.891406059265137, + "step": 8140 + }, + { + "epoch": 2.6231592500201177, + "grad_norm": 1.7842789058043438, + "learning_rate": 3.440920798454604e-07, + "logits/chosen": -0.504589855670929, + "logits/rejected": -0.7514709234237671, + "logps/chosen": -467.7749938964844, + "logps/rejected": -509.95001220703125, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1382079124450684, + "rewards/margins": 9.15234375, + "rewards/rejected": -12.285937309265137, + "step": 8150 + }, + { + "epoch": 2.6263780477991467, + "grad_norm": 73.101266881093, + "learning_rate": 3.4328718609143594e-07, + "logits/chosen": -0.5422118902206421, + "logits/rejected": -0.7138671875, + "logps/chosen": -453.6499938964844, + "logps/rejected": -479.8500061035156, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6849608421325684, + "rewards/margins": 8.496874809265137, + "rewards/rejected": -11.182031631469727, + "step": 8160 + }, + { + "epoch": 2.6295968455781766, + "grad_norm": 9.994503774640693, + "learning_rate": 3.4248229233741143e-07, + "logits/chosen": -0.558850109577179, + "logits/rejected": -0.6455230712890625, + "logps/chosen": -423.7749938964844, + "logps/rejected": -450.375, + "loss": 0.0177, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.3751463890075684, + "rewards/margins": 8.033594131469727, + "rewards/rejected": -10.401562690734863, + "step": 8170 + }, + { + "epoch": 2.632815643357206, + "grad_norm": 36.06669209272162, + "learning_rate": 3.4167739858338697e-07, + "logits/chosen": -0.49628907442092896, + "logits/rejected": -0.742785632610321, + "logps/chosen": -456.625, + "logps/rejected": -467.125, + "loss": 0.0231, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.848046898841858, + "rewards/margins": 8.142187118530273, + "rewards/rejected": -9.995312690734863, + "step": 8180 + }, + { + "epoch": 2.6360344411362355, + "grad_norm": 24.726166195620888, + "learning_rate": 3.4087250482936255e-07, + "logits/chosen": -0.42640382051467896, + "logits/rejected": -0.635937511920929, + "logps/chosen": -435.17498779296875, + "logps/rejected": -445.25, + "loss": 0.0154, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.778173804283142, + "rewards/margins": 8.672656059265137, + "rewards/rejected": -10.453906059265137, + "step": 8190 + }, + { + "epoch": 2.6392532389152654, + "grad_norm": 2.8559182753079932, + "learning_rate": 3.4006761107533804e-07, + "logits/chosen": -0.42985838651657104, + "logits/rejected": -0.7065674066543579, + "logps/chosen": -446.8500061035156, + "logps/rejected": -429.6000061035156, + "loss": 0.0434, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.390722632408142, + "rewards/margins": 7.698437690734863, + "rewards/rejected": -9.092187881469727, + "step": 8200 + }, + { + "epoch": 2.6424720366942944, + "grad_norm": 12.955672644113362, + "learning_rate": 3.392627173213136e-07, + "logits/chosen": -0.34336549043655396, + "logits/rejected": -0.528698742389679, + "logps/chosen": -416.8999938964844, + "logps/rejected": -463.04998779296875, + "loss": 0.0142, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.4265258312225342, + "rewards/margins": 7.947656154632568, + "rewards/rejected": -9.369531631469727, + "step": 8210 + }, + { + "epoch": 2.6456908344733243, + "grad_norm": 0.5900805910030795, + "learning_rate": 3.3845782356728906e-07, + "logits/chosen": -0.4888916015625, + "logits/rejected": -0.6089233160018921, + "logps/chosen": -415.8999938964844, + "logps/rejected": -462.04998779296875, + "loss": 0.0142, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.3256103992462158, + "rewards/margins": 8.723437309265137, + "rewards/rejected": -10.046875, + "step": 8220 + }, + { + "epoch": 2.6489096322523538, + "grad_norm": 1.2644874663730514, + "learning_rate": 3.3765292981326465e-07, + "logits/chosen": -0.36361390352249146, + "logits/rejected": -0.618847668170929, + "logps/chosen": -426.42498779296875, + "logps/rejected": -460.92498779296875, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.601049780845642, + "rewards/margins": 8.654687881469727, + "rewards/rejected": -10.26171875, + "step": 8230 + }, + { + "epoch": 2.652128430031383, + "grad_norm": 2.5408732151132023, + "learning_rate": 3.368480360592402e-07, + "logits/chosen": -0.44062501192092896, + "logits/rejected": -0.5842529535293579, + "logps/chosen": -456.70001220703125, + "logps/rejected": -513.125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.763238549232483, + "rewards/margins": 8.96875, + "rewards/rejected": -10.731249809265137, + "step": 8240 + }, + { + "epoch": 2.6553472278104127, + "grad_norm": 21.805564164085833, + "learning_rate": 3.3604314230521567e-07, + "logits/chosen": -0.556182861328125, + "logits/rejected": -0.6896415948867798, + "logps/chosen": -390.1000061035156, + "logps/rejected": -451.8500061035156, + "loss": 0.0114, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.3851318359375, + "rewards/margins": 8.38671875, + "rewards/rejected": -10.76953125, + "step": 8250 + }, + { + "epoch": 2.658566025589442, + "grad_norm": 2.4671381908820966, + "learning_rate": 3.352382485511912e-07, + "logits/chosen": -0.692675769329071, + "logits/rejected": -0.7585204839706421, + "logps/chosen": -450.1000061035156, + "logps/rejected": -484.5249938964844, + "loss": 0.0296, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.003173828125, + "rewards/margins": 8.1484375, + "rewards/rejected": -11.154687881469727, + "step": 8260 + }, + { + "epoch": 2.661784823368472, + "grad_norm": 8.488439000187316, + "learning_rate": 3.344333547971668e-07, + "logits/chosen": -0.5413573980331421, + "logits/rejected": -0.69281005859375, + "logps/chosen": -423.125, + "logps/rejected": -474.45001220703125, + "loss": 0.0141, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.382617235183716, + "rewards/margins": 8.5859375, + "rewards/rejected": -10.964062690734863, + "step": 8270 + }, + { + "epoch": 2.6650036211475014, + "grad_norm": 1.4826391181368745, + "learning_rate": 3.336284610431423e-07, + "logits/chosen": -0.4659790098667145, + "logits/rejected": -0.7813965082168579, + "logps/chosen": -443.4750061035156, + "logps/rejected": -425.82501220703125, + "loss": 0.0232, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.9694702625274658, + "rewards/margins": 8.426562309265137, + "rewards/rejected": -10.395312309265137, + "step": 8280 + }, + { + "epoch": 2.668222418926531, + "grad_norm": 0.36501463372601917, + "learning_rate": 3.328235672891178e-07, + "logits/chosen": -0.4102783203125, + "logits/rejected": -0.654125988483429, + "logps/chosen": -472.07501220703125, + "logps/rejected": -511.7250061035156, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.996984839439392, + "rewards/margins": 8.485937118530273, + "rewards/rejected": -10.489062309265137, + "step": 8290 + }, + { + "epoch": 2.6714412167055603, + "grad_norm": 38.0521803378753, + "learning_rate": 3.3201867353509335e-07, + "logits/chosen": -0.481964111328125, + "logits/rejected": -0.6931213140487671, + "logps/chosen": -437.7749938964844, + "logps/rejected": -435.82501220703125, + "loss": 0.0207, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9174072742462158, + "rewards/margins": 7.692968845367432, + "rewards/rejected": -9.603906631469727, + "step": 8300 + }, + { + "epoch": 2.67466001448459, + "grad_norm": 9.333674476823607, + "learning_rate": 3.312137797810689e-07, + "logits/chosen": -0.539019763469696, + "logits/rejected": -0.5897262692451477, + "logps/chosen": -431.25, + "logps/rejected": -491.54998779296875, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.188854932785034, + "rewards/margins": 8.423437118530273, + "rewards/rejected": -10.624218940734863, + "step": 8310 + }, + { + "epoch": 2.6778788122636197, + "grad_norm": 0.7996043374107201, + "learning_rate": 3.304088860270444e-07, + "logits/chosen": -0.622967541217804, + "logits/rejected": -0.7970947027206421, + "logps/chosen": -433.0, + "logps/rejected": -482.6000061035156, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.634082078933716, + "rewards/margins": 8.715624809265137, + "rewards/rejected": -11.353124618530273, + "step": 8320 + }, + { + "epoch": 2.681097610042649, + "grad_norm": 48.054673629910226, + "learning_rate": 3.2960399227301996e-07, + "logits/chosen": -0.44554442167282104, + "logits/rejected": -0.6994873285293579, + "logps/chosen": -413.75, + "logps/rejected": -460.3999938964844, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.313037157058716, + "rewards/margins": 8.514062881469727, + "rewards/rejected": -10.822656631469727, + "step": 8330 + }, + { + "epoch": 2.6843164078216786, + "grad_norm": 1.3233166741596534, + "learning_rate": 3.2879909851899545e-07, + "logits/chosen": -0.4901367127895355, + "logits/rejected": -0.6405884027481079, + "logps/chosen": -459.7250061035156, + "logps/rejected": -501.8500061035156, + "loss": 0.013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1915283203125, + "rewards/margins": 8.809374809265137, + "rewards/rejected": -10.998437881469727, + "step": 8340 + }, + { + "epoch": 2.687535205600708, + "grad_norm": 3.144370110566875, + "learning_rate": 3.2799420476497104e-07, + "logits/chosen": -0.6291748285293579, + "logits/rejected": -0.811145007610321, + "logps/chosen": -429.5249938964844, + "logps/rejected": -460.42498779296875, + "loss": 0.0129, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.2546629905700684, + "rewards/margins": 8.59765625, + "rewards/rejected": -10.853906631469727, + "step": 8350 + }, + { + "epoch": 2.6907540033797375, + "grad_norm": 18.935874604400677, + "learning_rate": 3.271893110109465e-07, + "logits/chosen": -0.477996826171875, + "logits/rejected": -0.6242736577987671, + "logps/chosen": -433.7749938964844, + "logps/rejected": -455.0249938964844, + "loss": 0.0179, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9734618663787842, + "rewards/margins": 8.533594131469727, + "rewards/rejected": -10.497655868530273, + "step": 8360 + }, + { + "epoch": 2.6939728011587674, + "grad_norm": 2.79230255354177, + "learning_rate": 3.2638441725692206e-07, + "logits/chosen": -0.521899402141571, + "logits/rejected": -0.6948608160018921, + "logps/chosen": -429.07501220703125, + "logps/rejected": -459.1000061035156, + "loss": 0.0325, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.510205030441284, + "rewards/margins": 8.150781631469727, + "rewards/rejected": -10.660937309265137, + "step": 8370 + }, + { + "epoch": 2.697191598937797, + "grad_norm": 27.571770368055994, + "learning_rate": 3.255795235028976e-07, + "logits/chosen": -0.528027355670929, + "logits/rejected": -0.734405517578125, + "logps/chosen": -396.6000061035156, + "logps/rejected": -437.2250061035156, + "loss": 0.0153, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8783202171325684, + "rewards/margins": 8.719531059265137, + "rewards/rejected": -11.596875190734863, + "step": 8380 + }, + { + "epoch": 2.7004103967168263, + "grad_norm": 9.620995712389421, + "learning_rate": 3.2477462974887313e-07, + "logits/chosen": -0.41260987520217896, + "logits/rejected": -0.656970202922821, + "logps/chosen": -491.1000061035156, + "logps/rejected": -482.625, + "loss": 0.0405, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.1207518577575684, + "rewards/margins": 8.37109375, + "rewards/rejected": -10.4921875, + "step": 8390 + }, + { + "epoch": 2.7036291944958557, + "grad_norm": 9.12122740252555, + "learning_rate": 3.2396973599484867e-07, + "logits/chosen": -0.537792980670929, + "logits/rejected": -0.728625476360321, + "logps/chosen": -460.7124938964844, + "logps/rejected": -488.92498779296875, + "loss": 0.028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.068554639816284, + "rewards/margins": 8.186718940734863, + "rewards/rejected": -10.2578125, + "step": 8400 + }, + { + "epoch": 2.706847992274885, + "grad_norm": 0.6254781088329263, + "learning_rate": 3.231648422408242e-07, + "logits/chosen": -0.5377289056777954, + "logits/rejected": -0.6257781982421875, + "logps/chosen": -401.29998779296875, + "logps/rejected": -466.5, + "loss": 0.0185, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.0992920398712158, + "rewards/margins": 8.516406059265137, + "rewards/rejected": -9.617968559265137, + "step": 8410 + }, + { + "epoch": 2.710066790053915, + "grad_norm": 1.2327825324396697, + "learning_rate": 3.223599484867997e-07, + "logits/chosen": -0.2990966737270355, + "logits/rejected": -0.5748291015625, + "logps/chosen": -454.79998779296875, + "logps/rejected": -438.70001220703125, + "loss": 0.0342, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.612280249595642, + "rewards/margins": 8.103906631469727, + "rewards/rejected": -9.72265625, + "step": 8420 + }, + { + "epoch": 2.7132855878329445, + "grad_norm": 1.63589187825038, + "learning_rate": 3.215550547327753e-07, + "logits/chosen": -0.619915783405304, + "logits/rejected": -0.797119140625, + "logps/chosen": -431.0249938964844, + "logps/rejected": -458.4750061035156, + "loss": 0.0143, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.4884276390075684, + "rewards/margins": 8.171093940734863, + "rewards/rejected": -10.661718368530273, + "step": 8430 + }, + { + "epoch": 2.716504385611974, + "grad_norm": 3.7542140327575684, + "learning_rate": 3.207501609787508e-07, + "logits/chosen": -0.5867065191268921, + "logits/rejected": -0.745928943157196, + "logps/chosen": -431.70001220703125, + "logps/rejected": -442.04998779296875, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.227294921875, + "rewards/margins": 8.157031059265137, + "rewards/rejected": -10.37890625, + "step": 8440 + }, + { + "epoch": 2.7197231833910034, + "grad_norm": 7.1121765625822775, + "learning_rate": 3.199452672247263e-07, + "logits/chosen": -0.5148559808731079, + "logits/rejected": -0.66259765625, + "logps/chosen": -387.2749938964844, + "logps/rejected": -449.25, + "loss": 0.0149, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9755859375, + "rewards/margins": 8.339062690734863, + "rewards/rejected": -11.321093559265137, + "step": 8450 + }, + { + "epoch": 2.722941981170033, + "grad_norm": 13.804249491866164, + "learning_rate": 3.1914037347070183e-07, + "logits/chosen": -0.536358654499054, + "logits/rejected": -0.681811511516571, + "logps/chosen": -438.625, + "logps/rejected": -489.04998779296875, + "loss": 0.0241, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.3481688499450684, + "rewards/margins": 8.206250190734863, + "rewards/rejected": -10.553125381469727, + "step": 8460 + }, + { + "epoch": 2.7261607789490627, + "grad_norm": 2.9012710110598015, + "learning_rate": 3.183354797166774e-07, + "logits/chosen": -0.622052013874054, + "logits/rejected": -0.7821289300918579, + "logps/chosen": -425.32501220703125, + "logps/rejected": -471.6499938964844, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6226439476013184, + "rewards/margins": 8.857812881469727, + "rewards/rejected": -11.482812881469727, + "step": 8470 + }, + { + "epoch": 2.729379576728092, + "grad_norm": 1.5842527234468031, + "learning_rate": 3.175305859626529e-07, + "logits/chosen": -0.48438721895217896, + "logits/rejected": -0.651354968547821, + "logps/chosen": -445.7749938964844, + "logps/rejected": -511.8999938964844, + "loss": 0.0191, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9916014671325684, + "rewards/margins": 8.211718559265137, + "rewards/rejected": -11.201562881469727, + "step": 8480 + }, + { + "epoch": 2.7325983745071216, + "grad_norm": 1.6779187417670443, + "learning_rate": 3.1672569220862844e-07, + "logits/chosen": -0.574267566204071, + "logits/rejected": -0.7593628168106079, + "logps/chosen": -451.82501220703125, + "logps/rejected": -498.25, + "loss": 0.0119, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.945507764816284, + "rewards/margins": 8.509374618530273, + "rewards/rejected": -11.456250190734863, + "step": 8490 + }, + { + "epoch": 2.735817172286151, + "grad_norm": 7.11794510727111, + "learning_rate": 3.1592079845460393e-07, + "logits/chosen": -0.5959808230400085, + "logits/rejected": -0.7844390869140625, + "logps/chosen": -423.54998779296875, + "logps/rejected": -487.82501220703125, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2540040016174316, + "rewards/margins": 8.537500381469727, + "rewards/rejected": -11.791406631469727, + "step": 8500 + }, + { + "epoch": 2.7390359700651805, + "grad_norm": 5.215099437495332, + "learning_rate": 3.151159047005795e-07, + "logits/chosen": -0.5923217535018921, + "logits/rejected": -0.763745129108429, + "logps/chosen": -435.82501220703125, + "logps/rejected": -481.6000061035156, + "loss": 0.0102, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.7843260765075684, + "rewards/margins": 8.700780868530273, + "rewards/rejected": -11.485937118530273, + "step": 8510 + }, + { + "epoch": 2.7422547678442104, + "grad_norm": 0.7191336218195, + "learning_rate": 3.1431101094655505e-07, + "logits/chosen": -0.3760131895542145, + "logits/rejected": -0.5795654058456421, + "logps/chosen": -478.32501220703125, + "logps/rejected": -505.5249938964844, + "loss": 0.0149, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9448120594024658, + "rewards/margins": 9.137499809265137, + "rewards/rejected": -11.082812309265137, + "step": 8520 + }, + { + "epoch": 2.74547356562324, + "grad_norm": 3.1726324234820096, + "learning_rate": 3.1350611719253054e-07, + "logits/chosen": -0.46818238496780396, + "logits/rejected": -0.6493682861328125, + "logps/chosen": -381.5249938964844, + "logps/rejected": -425.82501220703125, + "loss": 0.032, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.711230516433716, + "rewards/margins": 8.895312309265137, + "rewards/rejected": -11.607812881469727, + "step": 8530 + }, + { + "epoch": 2.7486923634022693, + "grad_norm": 3.2615815611088514, + "learning_rate": 3.127012234385061e-07, + "logits/chosen": -0.5801025629043579, + "logits/rejected": -0.593737781047821, + "logps/chosen": -404.82501220703125, + "logps/rejected": -484.6499938964844, + "loss": 0.0183, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.493847608566284, + "rewards/margins": 8.248437881469727, + "rewards/rejected": -10.742968559265137, + "step": 8540 + }, + { + "epoch": 2.7519111611812987, + "grad_norm": 4.573830656765686, + "learning_rate": 3.1189632968448166e-07, + "logits/chosen": -0.5987793207168579, + "logits/rejected": -0.833844006061554, + "logps/chosen": -424.75, + "logps/rejected": -435.20001220703125, + "loss": 0.0361, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.314868211746216, + "rewards/margins": 8.61328125, + "rewards/rejected": -10.931249618530273, + "step": 8550 + }, + { + "epoch": 2.755129958960328, + "grad_norm": 11.110687287483483, + "learning_rate": 3.1109143593045715e-07, + "logits/chosen": -0.549816906452179, + "logits/rejected": -0.657183825969696, + "logps/chosen": -434.4750061035156, + "logps/rejected": -504.75, + "loss": 0.0283, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.1998047828674316, + "rewards/margins": 8.172266006469727, + "rewards/rejected": -11.369531631469727, + "step": 8560 + }, + { + "epoch": 2.758348756739358, + "grad_norm": 3.1689090426740814, + "learning_rate": 3.102865421764327e-07, + "logits/chosen": -0.4038757383823395, + "logits/rejected": -0.6594268679618835, + "logps/chosen": -426.375, + "logps/rejected": -450.92498779296875, + "loss": 0.0359, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.4631104469299316, + "rewards/margins": 8.046875, + "rewards/rejected": -10.507031440734863, + "step": 8570 + }, + { + "epoch": 2.7615675545183875, + "grad_norm": 0.7995018252498313, + "learning_rate": 3.094816484224082e-07, + "logits/chosen": -0.583984375, + "logits/rejected": -0.746868908405304, + "logps/chosen": -424.8500061035156, + "logps/rejected": -486.95001220703125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.912207007408142, + "rewards/margins": 9.107030868530273, + "rewards/rejected": -11.015625, + "step": 8580 + }, + { + "epoch": 2.764786352297417, + "grad_norm": 1.0268547293131225, + "learning_rate": 3.0867675466838376e-07, + "logits/chosen": -0.3205810487270355, + "logits/rejected": -0.4852539002895355, + "logps/chosen": -416.8500061035156, + "logps/rejected": -491.1000061035156, + "loss": 0.0175, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.711145043373108, + "rewards/margins": 8.699999809265137, + "rewards/rejected": -10.412500381469727, + "step": 8590 + }, + { + "epoch": 2.7680051500764464, + "grad_norm": 1.546069672466597, + "learning_rate": 3.078718609143593e-07, + "logits/chosen": -0.59295654296875, + "logits/rejected": -0.797253429889679, + "logps/chosen": -373.0625, + "logps/rejected": -434.79998779296875, + "loss": 0.0115, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.2570557594299316, + "rewards/margins": 8.641406059265137, + "rewards/rejected": -10.899218559265137, + "step": 8600 + }, + { + "epoch": 2.771223947855476, + "grad_norm": 5.161613267774413, + "learning_rate": 3.0706696716033483e-07, + "logits/chosen": -0.573657214641571, + "logits/rejected": -0.572131335735321, + "logps/chosen": -433.95001220703125, + "logps/rejected": -510.8999938964844, + "loss": 0.0151, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.2726073265075684, + "rewards/margins": 8.409375190734863, + "rewards/rejected": -10.682812690734863, + "step": 8610 + }, + { + "epoch": 2.7744427456345058, + "grad_norm": 8.4923273772325, + "learning_rate": 3.062620734063103e-07, + "logits/chosen": -0.43731385469436646, + "logits/rejected": -0.6338866949081421, + "logps/chosen": -448.9750061035156, + "logps/rejected": -486.1499938964844, + "loss": 0.02, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.496411085128784, + "rewards/margins": 8.474218368530273, + "rewards/rejected": -10.970312118530273, + "step": 8620 + }, + { + "epoch": 2.7776615434135348, + "grad_norm": 5.0999923865911425, + "learning_rate": 3.054571796522859e-07, + "logits/chosen": -0.528369128704071, + "logits/rejected": -0.7850402593612671, + "logps/chosen": -443.5249938964844, + "logps/rejected": -447.7749938964844, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.766699194908142, + "rewards/margins": 8.755468368530273, + "rewards/rejected": -10.5234375, + "step": 8630 + }, + { + "epoch": 2.7808803411925647, + "grad_norm": 3.1997066139500783, + "learning_rate": 3.0465228589826144e-07, + "logits/chosen": -0.5848388671875, + "logits/rejected": -0.719866931438446, + "logps/chosen": -414.7749938964844, + "logps/rejected": -466.0249938964844, + "loss": 0.0198, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.4039673805236816, + "rewards/margins": 8.331250190734863, + "rewards/rejected": -10.735156059265137, + "step": 8640 + }, + { + "epoch": 2.784099138971594, + "grad_norm": 26.362163813218196, + "learning_rate": 3.038473921442369e-07, + "logits/chosen": -0.5826050043106079, + "logits/rejected": -0.8090454339981079, + "logps/chosen": -413.8500061035156, + "logps/rejected": -460.79998779296875, + "loss": 0.0272, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.5023436546325684, + "rewards/margins": 7.940625190734863, + "rewards/rejected": -10.442968368530273, + "step": 8650 + }, + { + "epoch": 2.7873179367506236, + "grad_norm": 9.570755084954543, + "learning_rate": 3.0304249839021246e-07, + "logits/chosen": -0.4937744140625, + "logits/rejected": -0.7602752447128296, + "logps/chosen": -425.7749938964844, + "logps/rejected": -458.04998779296875, + "loss": 0.0119, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.9621093273162842, + "rewards/margins": 8.744531631469727, + "rewards/rejected": -10.703125, + "step": 8660 + }, + { + "epoch": 2.7905367345296535, + "grad_norm": 2.1047783422866098, + "learning_rate": 3.02237604636188e-07, + "logits/chosen": -0.493338018655777, + "logits/rejected": -0.733489990234375, + "logps/chosen": -425.6499938964844, + "logps/rejected": -468.1000061035156, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4959473609924316, + "rewards/margins": 8.421875, + "rewards/rejected": -10.916406631469727, + "step": 8670 + }, + { + "epoch": 2.7937555323086825, + "grad_norm": 0.8781020528749314, + "learning_rate": 3.0143271088216354e-07, + "logits/chosen": -0.4128051698207855, + "logits/rejected": -0.5542709231376648, + "logps/chosen": -399.57501220703125, + "logps/rejected": -455.7749938964844, + "loss": 0.0186, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.654541015625, + "rewards/margins": 8.348437309265137, + "rewards/rejected": -11.004687309265137, + "step": 8680 + }, + { + "epoch": 2.7969743300877123, + "grad_norm": 0.9793720980769169, + "learning_rate": 3.0062781712813907e-07, + "logits/chosen": -0.42121583223342896, + "logits/rejected": -0.6047607660293579, + "logps/chosen": -460.5249938964844, + "logps/rejected": -489.79998779296875, + "loss": 0.011, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.454083204269409, + "rewards/margins": 8.614062309265137, + "rewards/rejected": -11.071093559265137, + "step": 8690 + }, + { + "epoch": 2.800193127866742, + "grad_norm": 0.3261975073412854, + "learning_rate": 2.9982292337411456e-07, + "logits/chosen": -0.500805675983429, + "logits/rejected": -0.6803222894668579, + "logps/chosen": -389.8125, + "logps/rejected": -450.8500061035156, + "loss": 0.0149, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.4573731422424316, + "rewards/margins": 8.592577934265137, + "rewards/rejected": -11.056249618530273, + "step": 8700 + }, + { + "epoch": 2.8034119256457712, + "grad_norm": 2.3748488962732854, + "learning_rate": 2.9901802962009015e-07, + "logits/chosen": -0.48431396484375, + "logits/rejected": -0.751757800579071, + "logps/chosen": -465.2250061035156, + "logps/rejected": -499.6000061035156, + "loss": 0.0088, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.4757080078125, + "rewards/margins": 9.038281440734863, + "rewards/rejected": -11.51171875, + "step": 8710 + }, + { + "epoch": 2.8066307234248007, + "grad_norm": 1.5381044466016567, + "learning_rate": 2.982131358660657e-07, + "logits/chosen": -0.3299194276332855, + "logits/rejected": -0.547253429889679, + "logps/chosen": -462.92498779296875, + "logps/rejected": -479.20001220703125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.241406202316284, + "rewards/margins": 8.485937118530273, + "rewards/rejected": -10.732030868530273, + "step": 8720 + }, + { + "epoch": 2.80984952120383, + "grad_norm": 3.254200157680842, + "learning_rate": 2.9740824211204117e-07, + "logits/chosen": -0.5397049188613892, + "logits/rejected": -0.6824706792831421, + "logps/chosen": -412.6000061035156, + "logps/rejected": -480.45001220703125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3084473609924316, + "rewards/margins": 8.603906631469727, + "rewards/rejected": -10.907031059265137, + "step": 8730 + }, + { + "epoch": 2.81306831898286, + "grad_norm": 5.993139643280078, + "learning_rate": 2.966033483580167e-07, + "logits/chosen": -0.545452892780304, + "logits/rejected": -0.778552234172821, + "logps/chosen": -471.0, + "logps/rejected": -458.6499938964844, + "loss": 0.0288, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.6180176734924316, + "rewards/margins": 8.143750190734863, + "rewards/rejected": -10.770312309265137, + "step": 8740 + }, + { + "epoch": 2.8162871167618895, + "grad_norm": 6.0524780894978045, + "learning_rate": 2.957984546039923e-07, + "logits/chosen": -0.47309571504592896, + "logits/rejected": -0.6326843500137329, + "logps/chosen": -424.0, + "logps/rejected": -466.95001220703125, + "loss": 0.0284, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.3062500953674316, + "rewards/margins": 8.276562690734863, + "rewards/rejected": -10.579687118530273, + "step": 8750 + }, + { + "epoch": 2.819505914540919, + "grad_norm": 9.778876729223859, + "learning_rate": 2.949935608499678e-07, + "logits/chosen": -0.3948730528354645, + "logits/rejected": -0.4809814393520355, + "logps/chosen": -416.0249938964844, + "logps/rejected": -470.07501220703125, + "loss": 0.0141, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.7916381359100342, + "rewards/margins": 8.526562690734863, + "rewards/rejected": -10.322656631469727, + "step": 8760 + }, + { + "epoch": 2.8227247123199484, + "grad_norm": 2.8249468225013, + "learning_rate": 2.941886670959433e-07, + "logits/chosen": -0.576733410358429, + "logits/rejected": -0.863085925579071, + "logps/chosen": -436.67498779296875, + "logps/rejected": -443.04998779296875, + "loss": 0.0177, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.4273681640625, + "rewards/margins": 8.279687881469727, + "rewards/rejected": -10.705469131469727, + "step": 8770 + }, + { + "epoch": 2.825943510098978, + "grad_norm": 62.5054084122427, + "learning_rate": 2.9338377334191885e-07, + "logits/chosen": -0.47079163789749146, + "logits/rejected": -0.61279296875, + "logps/chosen": -421.67498779296875, + "logps/rejected": -468.625, + "loss": 0.0104, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.2734375, + "rewards/margins": 8.478124618530273, + "rewards/rejected": -10.746874809265137, + "step": 8780 + }, + { + "epoch": 2.8291623078780077, + "grad_norm": 2.8183147537088344, + "learning_rate": 2.925788795878944e-07, + "logits/chosen": -0.6253662109375, + "logits/rejected": -0.8195556402206421, + "logps/chosen": -414.4750061035156, + "logps/rejected": -458.1499938964844, + "loss": 0.0173, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.6517090797424316, + "rewards/margins": 8.53125, + "rewards/rejected": -11.185937881469727, + "step": 8790 + }, + { + "epoch": 2.832381105657037, + "grad_norm": 1.3907307890258152, + "learning_rate": 2.917739858338699e-07, + "logits/chosen": -0.6145569086074829, + "logits/rejected": -0.737506091594696, + "logps/chosen": -375.38751220703125, + "logps/rejected": -447.79998779296875, + "loss": 0.0094, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.4078125953674316, + "rewards/margins": 8.90234375, + "rewards/rejected": -11.315625190734863, + "step": 8800 + }, + { + "epoch": 2.8355999034360666, + "grad_norm": 28.852897989676844, + "learning_rate": 2.9096909207984546e-07, + "logits/chosen": -0.5480011105537415, + "logits/rejected": -0.6608215570449829, + "logps/chosen": -395.7749938964844, + "logps/rejected": -488.5, + "loss": 0.0427, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.399761915206909, + "rewards/margins": 8.458593368530273, + "rewards/rejected": -10.862500190734863, + "step": 8810 + }, + { + "epoch": 2.838818701215096, + "grad_norm": 28.14453879913012, + "learning_rate": 2.9016419832582094e-07, + "logits/chosen": -0.6476074457168579, + "logits/rejected": -0.85284423828125, + "logps/chosen": -428.1000061035156, + "logps/rejected": -467.92498779296875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2303466796875, + "rewards/margins": 8.48046875, + "rewards/rejected": -10.71484375, + "step": 8820 + }, + { + "epoch": 2.8420374989941255, + "grad_norm": 0.6121081846551913, + "learning_rate": 2.8935930457179653e-07, + "logits/chosen": -0.5587524175643921, + "logits/rejected": -0.772961437702179, + "logps/chosen": -404.0249938964844, + "logps/rejected": -443.95001220703125, + "loss": 0.0205, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.7379150390625, + "rewards/margins": 8.419530868530273, + "rewards/rejected": -10.161718368530273, + "step": 8830 + }, + { + "epoch": 2.8452562967731554, + "grad_norm": 1.1686424937173134, + "learning_rate": 2.88554410817772e-07, + "logits/chosen": -0.49064940214157104, + "logits/rejected": -0.622802734375, + "logps/chosen": -456.3999938964844, + "logps/rejected": -514.9000244140625, + "loss": 0.0173, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0718994140625, + "rewards/margins": 8.548437118530273, + "rewards/rejected": -10.62109375, + "step": 8840 + }, + { + "epoch": 2.848475094552185, + "grad_norm": 23.570162721899703, + "learning_rate": 2.8774951706374755e-07, + "logits/chosen": -0.5179244875907898, + "logits/rejected": -0.595703125, + "logps/chosen": -400.0, + "logps/rejected": -477.75, + "loss": 0.0281, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.404345750808716, + "rewards/margins": 8.467187881469727, + "rewards/rejected": -10.8671875, + "step": 8850 + }, + { + "epoch": 2.8516938923312143, + "grad_norm": 5.153054957327474, + "learning_rate": 2.869446233097231e-07, + "logits/chosen": -0.5641723871231079, + "logits/rejected": -0.72235107421875, + "logps/chosen": -442.95001220703125, + "logps/rejected": -441.625, + "loss": 0.0119, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.1817383766174316, + "rewards/margins": 8.068750381469727, + "rewards/rejected": -10.25, + "step": 8860 + }, + { + "epoch": 2.8549126901102437, + "grad_norm": 19.482493987305013, + "learning_rate": 2.8613972955569863e-07, + "logits/chosen": -0.5357605218887329, + "logits/rejected": -0.7522827386856079, + "logps/chosen": -400.79998779296875, + "logps/rejected": -470.95001220703125, + "loss": 0.0108, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.3101806640625, + "rewards/margins": 8.171875, + "rewards/rejected": -10.479687690734863, + "step": 8870 + }, + { + "epoch": 2.858131487889273, + "grad_norm": 0.8450314126029047, + "learning_rate": 2.8533483580167416e-07, + "logits/chosen": -0.482177734375, + "logits/rejected": -0.64556884765625, + "logps/chosen": -436.125, + "logps/rejected": -511.0, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8815186023712158, + "rewards/margins": 8.823437690734863, + "rewards/rejected": -10.709375381469727, + "step": 8880 + }, + { + "epoch": 2.861350285668303, + "grad_norm": 2.073862810386649, + "learning_rate": 2.845299420476497e-07, + "logits/chosen": -0.537518322467804, + "logits/rejected": -0.751538097858429, + "logps/chosen": -421.25, + "logps/rejected": -449.29998779296875, + "loss": 0.0149, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.837426781654358, + "rewards/margins": 8.735156059265137, + "rewards/rejected": -10.568750381469727, + "step": 8890 + }, + { + "epoch": 2.8645690834473325, + "grad_norm": 1.3704860739357017, + "learning_rate": 2.837250482936252e-07, + "logits/chosen": -0.521466076374054, + "logits/rejected": -0.7205139398574829, + "logps/chosen": -435.375, + "logps/rejected": -438.17498779296875, + "loss": 0.0162, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.310131788253784, + "rewards/margins": 7.928906440734863, + "rewards/rejected": -10.239843368530273, + "step": 8900 + }, + { + "epoch": 2.867787881226362, + "grad_norm": 13.267220134510168, + "learning_rate": 2.829201545396008e-07, + "logits/chosen": -0.50494384765625, + "logits/rejected": -0.654388427734375, + "logps/chosen": -440.1875, + "logps/rejected": -470.9750061035156, + "loss": 0.0153, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.972283959388733, + "rewards/margins": 8.684765815734863, + "rewards/rejected": -10.664843559265137, + "step": 8910 + }, + { + "epoch": 2.8710066790053914, + "grad_norm": 8.756897072737045, + "learning_rate": 2.821152607855763e-07, + "logits/chosen": -0.5036681890487671, + "logits/rejected": -0.567553699016571, + "logps/chosen": -409.25, + "logps/rejected": -476.20001220703125, + "loss": 0.0308, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.123095750808716, + "rewards/margins": 8.219531059265137, + "rewards/rejected": -10.337499618530273, + "step": 8920 + }, + { + "epoch": 2.874225476784421, + "grad_norm": 32.10043343294096, + "learning_rate": 2.813103670315518e-07, + "logits/chosen": -0.539581298828125, + "logits/rejected": -0.6648315191268921, + "logps/chosen": -379.7250061035156, + "logps/rejected": -437.95001220703125, + "loss": 0.036, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.09716796875, + "rewards/margins": 8.443750381469727, + "rewards/rejected": -10.539843559265137, + "step": 8930 + }, + { + "epoch": 2.8774442745634508, + "grad_norm": 10.238221740492197, + "learning_rate": 2.8050547327752733e-07, + "logits/chosen": -0.397613525390625, + "logits/rejected": -0.5490051507949829, + "logps/chosen": -466.63751220703125, + "logps/rejected": -510.7749938964844, + "loss": 0.0195, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.843359351158142, + "rewards/margins": 8.350781440734863, + "rewards/rejected": -10.1953125, + "step": 8940 + }, + { + "epoch": 2.88066307234248, + "grad_norm": 9.94763373827978, + "learning_rate": 2.797005795235029e-07, + "logits/chosen": -0.669329822063446, + "logits/rejected": -0.75634765625, + "logps/chosen": -426.0, + "logps/rejected": -467.375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1550536155700684, + "rewards/margins": 8.612890243530273, + "rewards/rejected": -10.762499809265137, + "step": 8950 + }, + { + "epoch": 2.8838818701215096, + "grad_norm": 0.5402334210782758, + "learning_rate": 2.788956857694784e-07, + "logits/chosen": -0.550610363483429, + "logits/rejected": -0.6672118902206421, + "logps/chosen": -434.625, + "logps/rejected": -503.04998779296875, + "loss": 0.0083, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.3369383811950684, + "rewards/margins": 8.684374809265137, + "rewards/rejected": -11.022656440734863, + "step": 8960 + }, + { + "epoch": 2.887100667900539, + "grad_norm": 1.9562417275497384, + "learning_rate": 2.7809079201545394e-07, + "logits/chosen": -0.558520495891571, + "logits/rejected": -0.841601550579071, + "logps/chosen": -485.95001220703125, + "logps/rejected": -469.4750061035156, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.616015672683716, + "rewards/margins": 8.549219131469727, + "rewards/rejected": -11.16015625, + "step": 8970 + }, + { + "epoch": 2.8903194656795685, + "grad_norm": 1.610302470615962, + "learning_rate": 2.772858982614294e-07, + "logits/chosen": -0.5040283203125, + "logits/rejected": -0.7112182378768921, + "logps/chosen": -441.2250061035156, + "logps/rejected": -485.1000061035156, + "loss": 0.0166, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7052369117736816, + "rewards/margins": 8.771875381469727, + "rewards/rejected": -11.471875190734863, + "step": 8980 + }, + { + "epoch": 2.8935382634585984, + "grad_norm": 2.545393504201997, + "learning_rate": 2.76481004507405e-07, + "logits/chosen": -0.5203857421875, + "logits/rejected": -0.7049499750137329, + "logps/chosen": -481.8500061035156, + "logps/rejected": -506.20001220703125, + "loss": 0.0191, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7027831077575684, + "rewards/margins": 8.559374809265137, + "rewards/rejected": -11.265625, + "step": 8990 + }, + { + "epoch": 2.896757061237628, + "grad_norm": 3.917419325827331, + "learning_rate": 2.7567611075338055e-07, + "logits/chosen": -0.6212981939315796, + "logits/rejected": -0.906115710735321, + "logps/chosen": -482.1499938964844, + "logps/rejected": -487.29998779296875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0948243141174316, + "rewards/margins": 8.990625381469727, + "rewards/rejected": -12.0859375, + "step": 9000 + }, + { + "epoch": 2.8999758590166573, + "grad_norm": 30.331228622492244, + "learning_rate": 2.7487121699935604e-07, + "logits/chosen": -0.35914915800094604, + "logits/rejected": -0.575451672077179, + "logps/chosen": -455.04998779296875, + "logps/rejected": -492.7749938964844, + "loss": 0.0275, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.0831055641174316, + "rewards/margins": 8.586718559265137, + "rewards/rejected": -11.668749809265137, + "step": 9010 + }, + { + "epoch": 2.903194656795687, + "grad_norm": 7.790621201456291, + "learning_rate": 2.7406632324533157e-07, + "logits/chosen": -0.5698608160018921, + "logits/rejected": -0.741943359375, + "logps/chosen": -459.04998779296875, + "logps/rejected": -513.0999755859375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.40234375, + "rewards/margins": 8.494531631469727, + "rewards/rejected": -11.901562690734863, + "step": 9020 + }, + { + "epoch": 2.9064134545747162, + "grad_norm": 8.205122836352547, + "learning_rate": 2.7326142949130716e-07, + "logits/chosen": -0.7016357183456421, + "logits/rejected": -0.9016784429550171, + "logps/chosen": -459.5, + "logps/rejected": -496.5249938964844, + "loss": 0.0221, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4059081077575684, + "rewards/margins": 8.759374618530273, + "rewards/rejected": -12.159375190734863, + "step": 9030 + }, + { + "epoch": 2.909632252353746, + "grad_norm": 2.124640222047891, + "learning_rate": 2.7245653573728265e-07, + "logits/chosen": -0.506970226764679, + "logits/rejected": -0.7533935308456421, + "logps/chosen": -430.8125, + "logps/rejected": -469.0249938964844, + "loss": 0.0148, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.8310546875, + "rewards/margins": 8.543749809265137, + "rewards/rejected": -11.37890625, + "step": 9040 + }, + { + "epoch": 2.9128510501327756, + "grad_norm": 8.820401245482193, + "learning_rate": 2.716516419832582e-07, + "logits/chosen": -0.5192092657089233, + "logits/rejected": -0.8327423334121704, + "logps/chosen": -461.54998779296875, + "logps/rejected": -492.8500061035156, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.566699266433716, + "rewards/margins": 8.682812690734863, + "rewards/rejected": -11.251562118530273, + "step": 9050 + }, + { + "epoch": 2.916069847911805, + "grad_norm": 15.781023899256146, + "learning_rate": 2.708467482292337e-07, + "logits/chosen": -0.43986207246780396, + "logits/rejected": -0.6464751958847046, + "logps/chosen": -443.1499938964844, + "logps/rejected": -466.875, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9370605945587158, + "rewards/margins": 8.189062118530273, + "rewards/rejected": -10.124218940734863, + "step": 9060 + }, + { + "epoch": 2.9192886456908345, + "grad_norm": 16.653935515908447, + "learning_rate": 2.7004185447520926e-07, + "logits/chosen": -0.5531250238418579, + "logits/rejected": -0.736956775188446, + "logps/chosen": -404.67498779296875, + "logps/rejected": -429.57501220703125, + "loss": 0.0211, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.497363328933716, + "rewards/margins": 8.301562309265137, + "rewards/rejected": -10.807812690734863, + "step": 9070 + }, + { + "epoch": 2.922507443469864, + "grad_norm": 126.36016616899639, + "learning_rate": 2.692369607211848e-07, + "logits/chosen": -0.473876953125, + "logits/rejected": -0.7089782953262329, + "logps/chosen": -450.54998779296875, + "logps/rejected": -486.07501220703125, + "loss": 0.0315, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.539599657058716, + "rewards/margins": 9.036718368530273, + "rewards/rejected": -11.578125, + "step": 9080 + }, + { + "epoch": 2.925726241248894, + "grad_norm": 1.1914755597213176, + "learning_rate": 2.6843206696716033e-07, + "logits/chosen": -0.597729504108429, + "logits/rejected": -0.601910412311554, + "logps/chosen": -432.04998779296875, + "logps/rejected": -531.4500122070312, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.522900342941284, + "rewards/margins": 8.9765625, + "rewards/rejected": -11.490625381469727, + "step": 9090 + }, + { + "epoch": 2.928945039027923, + "grad_norm": 2.5144395948039144, + "learning_rate": 2.6762717321313587e-07, + "logits/chosen": -0.5958496332168579, + "logits/rejected": -0.7978881597518921, + "logps/chosen": -460.75, + "logps/rejected": -481.0249938964844, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6287598609924316, + "rewards/margins": 8.587499618530273, + "rewards/rejected": -11.213281631469727, + "step": 9100 + }, + { + "epoch": 2.9321638368069527, + "grad_norm": 29.476530358291328, + "learning_rate": 2.668222794591114e-07, + "logits/chosen": -0.6678222417831421, + "logits/rejected": -0.805676281452179, + "logps/chosen": -385.88751220703125, + "logps/rejected": -457.17498779296875, + "loss": 0.0209, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.462231397628784, + "rewards/margins": 8.529687881469727, + "rewards/rejected": -10.992968559265137, + "step": 9110 + }, + { + "epoch": 2.935382634585982, + "grad_norm": 2.3397496121586943, + "learning_rate": 2.6601738570508694e-07, + "logits/chosen": -0.477224737405777, + "logits/rejected": -0.68975830078125, + "logps/chosen": -468.67498779296875, + "logps/rejected": -492.79998779296875, + "loss": 0.0123, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.182360887527466, + "rewards/margins": 8.688281059265137, + "rewards/rejected": -11.869531631469727, + "step": 9120 + }, + { + "epoch": 2.9386014323650116, + "grad_norm": 6.62504755959387, + "learning_rate": 2.652124919510624e-07, + "logits/chosen": -0.5084472894668579, + "logits/rejected": -0.6562134027481079, + "logps/chosen": -470.42498779296875, + "logps/rejected": -505.8500061035156, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5522704124450684, + "rewards/margins": 8.528905868530273, + "rewards/rejected": -11.082812309265137, + "step": 9130 + }, + { + "epoch": 2.941820230144041, + "grad_norm": 3.749295965168506, + "learning_rate": 2.64407598197038e-07, + "logits/chosen": -0.5556396245956421, + "logits/rejected": -0.8018798828125, + "logps/chosen": -456.1000061035156, + "logps/rejected": -479.82501220703125, + "loss": 0.019, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5176758766174316, + "rewards/margins": 8.68359375, + "rewards/rejected": -11.192968368530273, + "step": 9140 + }, + { + "epoch": 2.9450390279230705, + "grad_norm": 14.434338741785277, + "learning_rate": 2.636027044430135e-07, + "logits/chosen": -0.4456420838832855, + "logits/rejected": -0.7585204839706421, + "logps/chosen": -451.25, + "logps/rejected": -467.5249938964844, + "loss": 0.019, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.3312010765075684, + "rewards/margins": 8.607812881469727, + "rewards/rejected": -10.943750381469727, + "step": 9150 + }, + { + "epoch": 2.9482578257021004, + "grad_norm": 2.209628026396337, + "learning_rate": 2.6279781068898903e-07, + "logits/chosen": -0.520458996295929, + "logits/rejected": -0.736865222454071, + "logps/chosen": -484.9750061035156, + "logps/rejected": -491.9750061035156, + "loss": 0.0162, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.054443359375, + "rewards/margins": 8.362500190734863, + "rewards/rejected": -10.419530868530273, + "step": 9160 + }, + { + "epoch": 2.95147662348113, + "grad_norm": 0.9420077698749816, + "learning_rate": 2.6199291693496457e-07, + "logits/chosen": -0.5956481695175171, + "logits/rejected": -0.716723620891571, + "logps/chosen": -416.6000061035156, + "logps/rejected": -463.07501220703125, + "loss": 0.0175, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.730712890625, + "rewards/margins": 8.470312118530273, + "rewards/rejected": -10.194531440734863, + "step": 9170 + }, + { + "epoch": 2.9546954212601593, + "grad_norm": 2.312670752387843, + "learning_rate": 2.611880231809401e-07, + "logits/chosen": -0.4604232907295227, + "logits/rejected": -0.7019561529159546, + "logps/chosen": -452.6000061035156, + "logps/rejected": -460.6000061035156, + "loss": 0.0085, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.5531189441680908, + "rewards/margins": 8.536718368530273, + "rewards/rejected": -10.096094131469727, + "step": 9180 + }, + { + "epoch": 2.9579142190391887, + "grad_norm": 0.7305338230954745, + "learning_rate": 2.6038312942691564e-07, + "logits/chosen": -0.5050758123397827, + "logits/rejected": -0.6465820074081421, + "logps/chosen": -432.11248779296875, + "logps/rejected": -477.1875, + "loss": 0.0251, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.112353563308716, + "rewards/margins": 8.236913681030273, + "rewards/rejected": -10.3515625, + "step": 9190 + }, + { + "epoch": 2.961133016818218, + "grad_norm": 3.913366487542373, + "learning_rate": 2.595782356728912e-07, + "logits/chosen": -0.37353819608688354, + "logits/rejected": -0.6171630620956421, + "logps/chosen": -414.17498779296875, + "logps/rejected": -442.82501220703125, + "loss": 0.0188, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.989965796470642, + "rewards/margins": 8.616406440734863, + "rewards/rejected": -10.603124618530273, + "step": 9200 + }, + { + "epoch": 2.964351814597248, + "grad_norm": 1.433550470063184, + "learning_rate": 2.5877334191886666e-07, + "logits/chosen": -0.4478759765625, + "logits/rejected": -0.6420532464981079, + "logps/chosen": -443.70001220703125, + "logps/rejected": -491.1000061035156, + "loss": 0.0196, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.3563475608825684, + "rewards/margins": 8.874218940734863, + "rewards/rejected": -11.224218368530273, + "step": 9210 + }, + { + "epoch": 2.9675706123762775, + "grad_norm": 1.9548222103074737, + "learning_rate": 2.5796844816484225e-07, + "logits/chosen": -0.47431641817092896, + "logits/rejected": -0.7542785406112671, + "logps/chosen": -439.7124938964844, + "logps/rejected": -474.4750061035156, + "loss": 0.0117, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.052233934402466, + "rewards/margins": 9.383593559265137, + "rewards/rejected": -11.442968368530273, + "step": 9220 + }, + { + "epoch": 2.970789410155307, + "grad_norm": 10.547318996612269, + "learning_rate": 2.571635544108178e-07, + "logits/chosen": -0.5974060297012329, + "logits/rejected": -0.7191406488418579, + "logps/chosen": -477.29998779296875, + "logps/rejected": -481.67498779296875, + "loss": 0.023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5707030296325684, + "rewards/margins": 8.5, + "rewards/rejected": -11.071093559265137, + "step": 9230 + }, + { + "epoch": 2.9740082079343364, + "grad_norm": 25.01723770599662, + "learning_rate": 2.563586606567933e-07, + "logits/chosen": -0.6593047976493835, + "logits/rejected": -0.811230480670929, + "logps/chosen": -393.2749938964844, + "logps/rejected": -449.3500061035156, + "loss": 0.0239, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.691943407058716, + "rewards/margins": 8.528905868530273, + "rewards/rejected": -11.228906631469727, + "step": 9240 + }, + { + "epoch": 2.977227005713366, + "grad_norm": 71.44915226288786, + "learning_rate": 2.555537669027688e-07, + "logits/chosen": -0.5322036743164062, + "logits/rejected": -0.8440185785293579, + "logps/chosen": -450.45001220703125, + "logps/rejected": -484.5, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.988574266433716, + "rewards/margins": 8.817968368530273, + "rewards/rejected": -11.811718940734863, + "step": 9250 + }, + { + "epoch": 2.9804458034923957, + "grad_norm": 0.8989977602738716, + "learning_rate": 2.547488731487444e-07, + "logits/chosen": -0.5390564203262329, + "logits/rejected": -0.7166748046875, + "logps/chosen": -437.2875061035156, + "logps/rejected": -479.67498779296875, + "loss": 0.023, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.766064405441284, + "rewards/margins": 8.93359375, + "rewards/rejected": -11.694531440734863, + "step": 9260 + }, + { + "epoch": 2.983664601271425, + "grad_norm": 4.719198839411514, + "learning_rate": 2.539439793947199e-07, + "logits/chosen": -0.5741729736328125, + "logits/rejected": -0.6941772699356079, + "logps/chosen": -406.3500061035156, + "logps/rejected": -492.1000061035156, + "loss": 0.0087, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.7017579078674316, + "rewards/margins": 9.003125190734863, + "rewards/rejected": -11.704687118530273, + "step": 9270 + }, + { + "epoch": 2.9868833990504546, + "grad_norm": 26.11813297352473, + "learning_rate": 2.531390856406954e-07, + "logits/chosen": -0.6433349847793579, + "logits/rejected": -0.787670910358429, + "logps/chosen": -429.375, + "logps/rejected": -460.7250061035156, + "loss": 0.0291, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.4124999046325684, + "rewards/margins": 8.318750381469727, + "rewards/rejected": -11.728124618530273, + "step": 9280 + }, + { + "epoch": 2.990102196829484, + "grad_norm": 0.6396375986439401, + "learning_rate": 2.5233419188667096e-07, + "logits/chosen": -0.527172863483429, + "logits/rejected": -0.5692504644393921, + "logps/chosen": -403.8999938964844, + "logps/rejected": -482.7250061035156, + "loss": 0.0477, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.890331983566284, + "rewards/margins": 8.180468559265137, + "rewards/rejected": -11.0703125, + "step": 9290 + }, + { + "epoch": 2.9933209946085135, + "grad_norm": 14.970231446001907, + "learning_rate": 2.515292981326465e-07, + "logits/chosen": -0.622692883014679, + "logits/rejected": -0.810351550579071, + "logps/chosen": -411.8500061035156, + "logps/rejected": -441.6499938964844, + "loss": 0.0101, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.6986327171325684, + "rewards/margins": 8.055468559265137, + "rewards/rejected": -10.753125190734863, + "step": 9300 + }, + { + "epoch": 2.9965397923875434, + "grad_norm": 7.208148741224667, + "learning_rate": 2.5072440437862203e-07, + "logits/chosen": -0.58734130859375, + "logits/rejected": -0.79010009765625, + "logps/chosen": -442.42498779296875, + "logps/rejected": -490.0, + "loss": 0.0086, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.3633790016174316, + "rewards/margins": 8.750781059265137, + "rewards/rejected": -11.103124618530273, + "step": 9310 + }, + { + "epoch": 2.999758590166573, + "grad_norm": 2.2928292485669615, + "learning_rate": 2.499195106245975e-07, + "logits/chosen": -0.48567503690719604, + "logits/rejected": -0.720629870891571, + "logps/chosen": -457.2749938964844, + "logps/rejected": -458.5249938964844, + "loss": 0.0138, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.2429137229919434, + "rewards/margins": 8.7265625, + "rewards/rejected": -10.969531059265137, + "step": 9320 + }, + { + "epoch": 3.0028969180011265, + "grad_norm": 2.158045328889, + "learning_rate": 2.491146168705731e-07, + "logits/chosen": -0.5152055621147156, + "logits/rejected": -0.6566882133483887, + "logps/chosen": -382.4615478515625, + "logps/rejected": -425.9230651855469, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6290220022201538, + "rewards/margins": 9.599358558654785, + "rewards/rejected": -11.224358558654785, + "step": 9330 + }, + { + "epoch": 3.006115715780156, + "grad_norm": 1.0346149456740623, + "learning_rate": 2.483097231165486e-07, + "logits/chosen": -0.489105224609375, + "logits/rejected": -0.7758544683456421, + "logps/chosen": -419.17498779296875, + "logps/rejected": -452.25, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1817994117736816, + "rewards/margins": 9.671875, + "rewards/rejected": -11.852343559265137, + "step": 9340 + }, + { + "epoch": 3.009334513559186, + "grad_norm": 0.3463932714112916, + "learning_rate": 2.475048293625241e-07, + "logits/chosen": -0.5008789300918579, + "logits/rejected": -0.6536865234375, + "logps/chosen": -455.2749938964844, + "logps/rejected": -531.6749877929688, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.978271484375, + "rewards/margins": 10.0390625, + "rewards/rejected": -12.016406059265137, + "step": 9350 + }, + { + "epoch": 3.0125533113382152, + "grad_norm": 0.1469467173837905, + "learning_rate": 2.4669993560849966e-07, + "logits/chosen": -0.547894299030304, + "logits/rejected": -0.883593738079071, + "logps/chosen": -441.3999938964844, + "logps/rejected": -461.2749938964844, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2674803733825684, + "rewards/margins": 10.11328125, + "rewards/rejected": -12.373437881469727, + "step": 9360 + }, + { + "epoch": 3.0157721091172447, + "grad_norm": 3.0962059641886546, + "learning_rate": 2.458950418544752e-07, + "logits/chosen": -0.6479126214981079, + "logits/rejected": -0.785168468952179, + "logps/chosen": -421.125, + "logps/rejected": -473.95001220703125, + "loss": 0.0058, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.382128953933716, + "rewards/margins": 9.521093368530273, + "rewards/rejected": -12.91015625, + "step": 9370 + }, + { + "epoch": 3.018990906896274, + "grad_norm": 7.444272505391699, + "learning_rate": 2.4509014810045074e-07, + "logits/chosen": -0.683197021484375, + "logits/rejected": -0.7616027593612671, + "logps/chosen": -416.79998779296875, + "logps/rejected": -480.1499938964844, + "loss": 0.0062, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.773345947265625, + "rewards/margins": 9.78515625, + "rewards/rejected": -12.559374809265137, + "step": 9380 + }, + { + "epoch": 3.0222097046753036, + "grad_norm": 1.1032816720066805, + "learning_rate": 2.4428525434642627e-07, + "logits/chosen": -0.550122082233429, + "logits/rejected": -0.7519286870956421, + "logps/chosen": -389.2749938964844, + "logps/rejected": -490.70001220703125, + "loss": 0.0076, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.331835985183716, + "rewards/margins": 10.089062690734863, + "rewards/rejected": -13.417187690734863, + "step": 9390 + }, + { + "epoch": 3.0254285024543335, + "grad_norm": 0.7685357387672501, + "learning_rate": 2.434803605924018e-07, + "logits/chosen": -0.73944091796875, + "logits/rejected": -0.8756103515625, + "logps/chosen": -434.57501220703125, + "logps/rejected": -488.75, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1324219703674316, + "rewards/margins": 10.449999809265137, + "rewards/rejected": -13.576562881469727, + "step": 9400 + }, + { + "epoch": 3.028647300233363, + "grad_norm": 1.5083563844412575, + "learning_rate": 2.4267546683837735e-07, + "logits/chosen": -0.4747512936592102, + "logits/rejected": -0.7348388433456421, + "logps/chosen": -429.88751220703125, + "logps/rejected": -490.75, + "loss": 0.0183, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2865233421325684, + "rewards/margins": 10.254687309265137, + "rewards/rejected": -13.534375190734863, + "step": 9410 + }, + { + "epoch": 3.0318660980123924, + "grad_norm": 0.2438029673578744, + "learning_rate": 2.4187057308435283e-07, + "logits/chosen": -0.5634826421737671, + "logits/rejected": -0.744769275188446, + "logps/chosen": -439.6000061035156, + "logps/rejected": -527.5499877929688, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9981446266174316, + "rewards/margins": 10.192187309265137, + "rewards/rejected": -13.201562881469727, + "step": 9420 + }, + { + "epoch": 3.035084895791422, + "grad_norm": 0.46559699145151706, + "learning_rate": 2.410656793303284e-07, + "logits/chosen": -0.506970226764679, + "logits/rejected": -0.7862609624862671, + "logps/chosen": -424.63751220703125, + "logps/rejected": -485.7250061035156, + "loss": 0.0061, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.4305419921875, + "rewards/margins": 10.776562690734863, + "rewards/rejected": -13.204687118530273, + "step": 9430 + }, + { + "epoch": 3.0383036935704513, + "grad_norm": 1.852728457723093, + "learning_rate": 2.402607855763039e-07, + "logits/chosen": -0.5943664312362671, + "logits/rejected": -0.7294067144393921, + "logps/chosen": -430.1499938964844, + "logps/rejected": -495.95001220703125, + "loss": 0.018, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.108569383621216, + "rewards/margins": 10.11328125, + "rewards/rejected": -13.223437309265137, + "step": 9440 + }, + { + "epoch": 3.041522491349481, + "grad_norm": 0.6442019365812937, + "learning_rate": 2.3945589182227944e-07, + "logits/chosen": -0.5627075433731079, + "logits/rejected": -0.750720202922821, + "logps/chosen": -457.7749938964844, + "logps/rejected": -500.29998779296875, + "loss": 0.0056, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.276318311691284, + "rewards/margins": 10.055468559265137, + "rewards/rejected": -13.326562881469727, + "step": 9450 + }, + { + "epoch": 3.0447412891285106, + "grad_norm": 1.4804318948915793, + "learning_rate": 2.38650998068255e-07, + "logits/chosen": -0.5351623296737671, + "logits/rejected": -0.808026134967804, + "logps/chosen": -431.3500061035156, + "logps/rejected": -491.8500061035156, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.12109375, + "rewards/margins": 9.84765625, + "rewards/rejected": -12.96875, + "step": 9460 + }, + { + "epoch": 3.04796008690754, + "grad_norm": 0.6655982993059596, + "learning_rate": 2.3784610431423051e-07, + "logits/chosen": -0.757031261920929, + "logits/rejected": -0.7741058468818665, + "logps/chosen": -398.79998779296875, + "logps/rejected": -522.125, + "loss": 0.0055, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.2174315452575684, + "rewards/margins": 9.921093940734863, + "rewards/rejected": -13.139062881469727, + "step": 9470 + }, + { + "epoch": 3.0511788846865695, + "grad_norm": 0.21319915694041477, + "learning_rate": 2.3704121056020605e-07, + "logits/chosen": -0.52374267578125, + "logits/rejected": -0.7463531494140625, + "logps/chosen": -509.57501220703125, + "logps/rejected": -552.0499877929688, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9771485328674316, + "rewards/margins": 10.038281440734863, + "rewards/rejected": -13.021875381469727, + "step": 9480 + }, + { + "epoch": 3.054397682465599, + "grad_norm": 0.4571798267092712, + "learning_rate": 2.362363168061816e-07, + "logits/chosen": -0.53662109375, + "logits/rejected": -0.8059066534042358, + "logps/chosen": -478.26251220703125, + "logps/rejected": -509.79998779296875, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.9927978515625, + "rewards/margins": 10.699999809265137, + "rewards/rejected": -12.693750381469727, + "step": 9490 + }, + { + "epoch": 3.057616480244629, + "grad_norm": 1.632218143247566, + "learning_rate": 2.354314230521571e-07, + "logits/chosen": -0.38300782442092896, + "logits/rejected": -0.658679187297821, + "logps/chosen": -465.875, + "logps/rejected": -525.1500244140625, + "loss": 0.0055, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.747021436691284, + "rewards/margins": 9.686718940734863, + "rewards/rejected": -12.432812690734863, + "step": 9500 + }, + { + "epoch": 3.0608352780236583, + "grad_norm": 0.5548616984705896, + "learning_rate": 2.3462652929813263e-07, + "logits/chosen": -0.575488269329071, + "logits/rejected": -0.7869201898574829, + "logps/chosen": -457.6499938964844, + "logps/rejected": -501.79998779296875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7891602516174316, + "rewards/margins": 10.08984375, + "rewards/rejected": -12.879687309265137, + "step": 9510 + }, + { + "epoch": 3.0640540758026877, + "grad_norm": 0.5420092085772109, + "learning_rate": 2.3382163554410817e-07, + "logits/chosen": -0.747601330280304, + "logits/rejected": -0.9070190191268921, + "logps/chosen": -399.92498779296875, + "logps/rejected": -456.92498779296875, + "loss": 0.0168, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.1890625953674316, + "rewards/margins": 10.251562118530273, + "rewards/rejected": -13.435937881469727, + "step": 9520 + }, + { + "epoch": 3.067272873581717, + "grad_norm": 0.559288325925879, + "learning_rate": 2.330167417900837e-07, + "logits/chosen": -0.557812511920929, + "logits/rejected": -0.742877185344696, + "logps/chosen": -464.25, + "logps/rejected": -529.75, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.418652296066284, + "rewards/margins": 10.384374618530273, + "rewards/rejected": -13.801562309265137, + "step": 9530 + }, + { + "epoch": 3.0704916713607466, + "grad_norm": 3.0771477696834992, + "learning_rate": 2.3221184803605922e-07, + "logits/chosen": -0.5746704339981079, + "logits/rejected": -0.71771240234375, + "logps/chosen": -432.45001220703125, + "logps/rejected": -505.4750061035156, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4659180641174316, + "rewards/margins": 10.1015625, + "rewards/rejected": -13.576562881469727, + "step": 9540 + }, + { + "epoch": 3.073710469139776, + "grad_norm": 0.7778629128354831, + "learning_rate": 2.3140695428203478e-07, + "logits/chosen": -0.5536773800849915, + "logits/rejected": -0.7110961675643921, + "logps/chosen": -422.17498779296875, + "logps/rejected": -489.8999938964844, + "loss": 0.0052, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3165283203125, + "rewards/margins": 10.331250190734863, + "rewards/rejected": -13.642187118530273, + "step": 9550 + }, + { + "epoch": 3.076929266918806, + "grad_norm": 0.34299926260604724, + "learning_rate": 2.306020605280103e-07, + "logits/chosen": -0.530773937702179, + "logits/rejected": -0.7767120599746704, + "logps/chosen": -464.25, + "logps/rejected": -489.0, + "loss": 0.0052, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.376953125, + "rewards/margins": 10.235937118530273, + "rewards/rejected": -13.606249809265137, + "step": 9560 + }, + { + "epoch": 3.0801480646978354, + "grad_norm": 0.5295264743971226, + "learning_rate": 2.2979716677398583e-07, + "logits/chosen": -0.7320312261581421, + "logits/rejected": -0.8402343988418579, + "logps/chosen": -397.75, + "logps/rejected": -477.5, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3349609375, + "rewards/margins": 10.659375190734863, + "rewards/rejected": -13.987500190734863, + "step": 9570 + }, + { + "epoch": 3.083366862476865, + "grad_norm": 0.563277742358812, + "learning_rate": 2.2899227301996136e-07, + "logits/chosen": -0.610504150390625, + "logits/rejected": -0.792004406452179, + "logps/chosen": -396.2749938964844, + "logps/rejected": -492.57501220703125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9084715843200684, + "rewards/margins": 10.607812881469727, + "rewards/rejected": -13.520312309265137, + "step": 9580 + }, + { + "epoch": 3.0865856602558943, + "grad_norm": 6.7605425406463455, + "learning_rate": 2.281873792659369e-07, + "logits/chosen": -0.6882079839706421, + "logits/rejected": -0.785839855670929, + "logps/chosen": -428.54998779296875, + "logps/rejected": -524.4000244140625, + "loss": 0.0055, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.2113280296325684, + "rewards/margins": 10.999218940734863, + "rewards/rejected": -14.215624809265137, + "step": 9590 + }, + { + "epoch": 3.0898044580349238, + "grad_norm": 0.11920662578826767, + "learning_rate": 2.273824855119124e-07, + "logits/chosen": -0.45330810546875, + "logits/rejected": -0.6261261105537415, + "logps/chosen": -454.54998779296875, + "logps/rejected": -515.2000122070312, + "loss": 0.0181, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.478515625, + "rewards/margins": 10.545312881469727, + "rewards/rejected": -14.0234375, + "step": 9600 + }, + { + "epoch": 3.0930232558139537, + "grad_norm": 0.6486881862034146, + "learning_rate": 2.2657759175788795e-07, + "logits/chosen": -0.7178710699081421, + "logits/rejected": -0.846728503704071, + "logps/chosen": -428.75, + "logps/rejected": -504.6000061035156, + "loss": 0.008, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.8609375953674316, + "rewards/margins": 10.410937309265137, + "rewards/rejected": -14.265625, + "step": 9610 + }, + { + "epoch": 3.096242053592983, + "grad_norm": 0.4953143519928256, + "learning_rate": 2.2577269800386349e-07, + "logits/chosen": -0.659106433391571, + "logits/rejected": -0.879650890827179, + "logps/chosen": -439.17498779296875, + "logps/rejected": -497.8999938964844, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.796142578125, + "rewards/margins": 10.032031059265137, + "rewards/rejected": -13.824999809265137, + "step": 9620 + }, + { + "epoch": 3.0994608513720125, + "grad_norm": 0.4008861615245404, + "learning_rate": 2.2496780424983902e-07, + "logits/chosen": -0.6470702886581421, + "logits/rejected": -0.8326660394668579, + "logps/chosen": -439.20001220703125, + "logps/rejected": -475.3999938964844, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.08203125, + "rewards/margins": 10.014062881469727, + "rewards/rejected": -13.092187881469727, + "step": 9630 + }, + { + "epoch": 3.102679649151042, + "grad_norm": 0.41653952122956345, + "learning_rate": 2.2416291049581453e-07, + "logits/chosen": -0.522106945514679, + "logits/rejected": -0.7438110113143921, + "logps/chosen": -475.79998779296875, + "logps/rejected": -515.7000122070312, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4017333984375, + "rewards/margins": 10.232812881469727, + "rewards/rejected": -13.639843940734863, + "step": 9640 + }, + { + "epoch": 3.1058984469300714, + "grad_norm": 0.414647301031093, + "learning_rate": 2.233580167417901e-07, + "logits/chosen": -0.6600402593612671, + "logits/rejected": -0.910961925983429, + "logps/chosen": -439.1499938964844, + "logps/rejected": -476.45001220703125, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2710938453674316, + "rewards/margins": 10.544530868530273, + "rewards/rejected": -13.8203125, + "step": 9650 + }, + { + "epoch": 3.1091172447091013, + "grad_norm": 8.183217560618623, + "learning_rate": 2.225531229877656e-07, + "logits/chosen": -0.53265380859375, + "logits/rejected": -0.6930176019668579, + "logps/chosen": -446.125, + "logps/rejected": -518.5250244140625, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.57421875, + "rewards/margins": 10.34375, + "rewards/rejected": -13.921875, + "step": 9660 + }, + { + "epoch": 3.112336042488131, + "grad_norm": 0.4238057852822602, + "learning_rate": 2.2174822923374114e-07, + "logits/chosen": -0.648211658000946, + "logits/rejected": -0.828442394733429, + "logps/chosen": -431.5, + "logps/rejected": -508.29998779296875, + "loss": 0.0058, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.472973585128784, + "rewards/margins": 10.821874618530273, + "rewards/rejected": -14.301562309265137, + "step": 9670 + }, + { + "epoch": 3.1155548402671602, + "grad_norm": 0.7934957364804466, + "learning_rate": 2.2094333547971665e-07, + "logits/chosen": -0.619854748249054, + "logits/rejected": -0.8575683832168579, + "logps/chosen": -435.4750061035156, + "logps/rejected": -517.9500122070312, + "loss": 0.0056, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.5755858421325684, + "rewards/margins": 10.767969131469727, + "rewards/rejected": -14.340624809265137, + "step": 9680 + }, + { + "epoch": 3.1187736380461897, + "grad_norm": 2.3419315812557526, + "learning_rate": 2.2013844172569222e-07, + "logits/chosen": -0.6262420415878296, + "logits/rejected": -0.7595459222793579, + "logps/chosen": -470.29998779296875, + "logps/rejected": -555.2000122070312, + "loss": 0.006, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.7930665016174316, + "rewards/margins": 11.039843559265137, + "rewards/rejected": -14.837499618530273, + "step": 9690 + }, + { + "epoch": 3.121992435825219, + "grad_norm": 1.7154108922237712, + "learning_rate": 2.1933354797166773e-07, + "logits/chosen": -0.686999499797821, + "logits/rejected": -0.8966308832168579, + "logps/chosen": -480.5, + "logps/rejected": -528.6500244140625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.77569580078125, + "rewards/margins": 10.699999809265137, + "rewards/rejected": -14.481249809265137, + "step": 9700 + }, + { + "epoch": 3.125211233604249, + "grad_norm": 0.2798420942428041, + "learning_rate": 2.1852865421764326e-07, + "logits/chosen": -0.6624084711074829, + "logits/rejected": -0.8705078363418579, + "logps/chosen": -430.07501220703125, + "logps/rejected": -499.45001220703125, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.440624952316284, + "rewards/margins": 10.446874618530273, + "rewards/rejected": -13.879687309265137, + "step": 9710 + }, + { + "epoch": 3.1284300313832785, + "grad_norm": 1.1900188363462758, + "learning_rate": 2.177237604636188e-07, + "logits/chosen": -0.668780505657196, + "logits/rejected": -0.8663574457168579, + "logps/chosen": -427.1499938964844, + "logps/rejected": -472.25, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.632031202316284, + "rewards/margins": 10.166406631469727, + "rewards/rejected": -13.800000190734863, + "step": 9720 + }, + { + "epoch": 3.131648829162308, + "grad_norm": 0.46108754982269295, + "learning_rate": 2.1691886670959434e-07, + "logits/chosen": -0.4460441470146179, + "logits/rejected": -0.594805896282196, + "logps/chosen": -436.9750061035156, + "logps/rejected": -478.25, + "loss": 0.0063, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.96142578125, + "rewards/margins": 10.041406631469727, + "rewards/rejected": -12.996874809265137, + "step": 9730 + }, + { + "epoch": 3.1348676269413374, + "grad_norm": 0.5053653967471706, + "learning_rate": 2.1611397295556985e-07, + "logits/chosen": -0.4989013671875, + "logits/rejected": -0.711077868938446, + "logps/chosen": -440.1499938964844, + "logps/rejected": -493.54998779296875, + "loss": 0.0052, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.7865233421325684, + "rewards/margins": 10.354687690734863, + "rewards/rejected": -13.140625, + "step": 9740 + }, + { + "epoch": 3.138086424720367, + "grad_norm": 1.9600780824922184, + "learning_rate": 2.1530907920154538e-07, + "logits/chosen": -0.6235595941543579, + "logits/rejected": -0.8115234375, + "logps/chosen": -421.8500061035156, + "logps/rejected": -458.07501220703125, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.1842284202575684, + "rewards/margins": 9.88671875, + "rewards/rejected": -13.067187309265137, + "step": 9750 + }, + { + "epoch": 3.1413052224993967, + "grad_norm": 0.8753319371954136, + "learning_rate": 2.1450418544752092e-07, + "logits/chosen": -0.639508068561554, + "logits/rejected": -0.8695923089981079, + "logps/chosen": -457.8500061035156, + "logps/rejected": -499.45001220703125, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.2394042015075684, + "rewards/margins": 10.190625190734863, + "rewards/rejected": -13.426562309265137, + "step": 9760 + }, + { + "epoch": 3.144524020278426, + "grad_norm": 0.7638953982949837, + "learning_rate": 2.1369929169349646e-07, + "logits/chosen": -0.691479504108429, + "logits/rejected": -0.864514172077179, + "logps/chosen": -496.7250061035156, + "logps/rejected": -529.0, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.395214796066284, + "rewards/margins": 10.015625, + "rewards/rejected": -13.420312881469727, + "step": 9770 + }, + { + "epoch": 3.1477428180574556, + "grad_norm": 3.228799074008784, + "learning_rate": 2.1289439793947197e-07, + "logits/chosen": -0.596057116985321, + "logits/rejected": -0.9015136957168579, + "logps/chosen": -447.375, + "logps/rejected": -499.6499938964844, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3106932640075684, + "rewards/margins": 10.345312118530273, + "rewards/rejected": -13.665624618530273, + "step": 9780 + }, + { + "epoch": 3.150961615836485, + "grad_norm": 1.9882381719439326, + "learning_rate": 2.1208950418544753e-07, + "logits/chosen": -0.5967956781387329, + "logits/rejected": -0.8252197504043579, + "logps/chosen": -432.67498779296875, + "logps/rejected": -506.82501220703125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4041991233825684, + "rewards/margins": 11.286718368530273, + "rewards/rejected": -14.699999809265137, + "step": 9790 + }, + { + "epoch": 3.1541804136155145, + "grad_norm": 0.7818270599283283, + "learning_rate": 2.1128461043142304e-07, + "logits/chosen": -0.719268798828125, + "logits/rejected": -0.89251708984375, + "logps/chosen": -436.5, + "logps/rejected": -505.375, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.644238233566284, + "rewards/margins": 9.900781631469727, + "rewards/rejected": -13.545312881469727, + "step": 9800 + }, + { + "epoch": 3.157399211394544, + "grad_norm": 0.3707411867512116, + "learning_rate": 2.1047971667739858e-07, + "logits/chosen": -0.49852293729782104, + "logits/rejected": -0.6614745855331421, + "logps/chosen": -433.7749938964844, + "logps/rejected": -500.0, + "loss": 0.0064, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3831787109375, + "rewards/margins": 10.5078125, + "rewards/rejected": -13.896875381469727, + "step": 9810 + }, + { + "epoch": 3.160618009173574, + "grad_norm": 0.9566607075681659, + "learning_rate": 2.0967482292337411e-07, + "logits/chosen": -0.705059826374054, + "logits/rejected": -0.8375610113143921, + "logps/chosen": -400.45001220703125, + "logps/rejected": -507.45001220703125, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.610790967941284, + "rewards/margins": 10.850000381469727, + "rewards/rejected": -14.449999809265137, + "step": 9820 + }, + { + "epoch": 3.1638368069526033, + "grad_norm": 0.1878684041404509, + "learning_rate": 2.0886992916934965e-07, + "logits/chosen": -0.6789093017578125, + "logits/rejected": -0.8298095464706421, + "logps/chosen": -430.8999938964844, + "logps/rejected": -517.4500122070312, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.319140672683716, + "rewards/margins": 10.560937881469727, + "rewards/rejected": -13.8828125, + "step": 9830 + }, + { + "epoch": 3.1670556047316327, + "grad_norm": 18.464900779334258, + "learning_rate": 2.0806503541532516e-07, + "logits/chosen": -0.5073486566543579, + "logits/rejected": -0.6837524175643921, + "logps/chosen": -443.8500061035156, + "logps/rejected": -499.54998779296875, + "loss": 0.008, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.0209717750549316, + "rewards/margins": 10.873437881469727, + "rewards/rejected": -13.896093368530273, + "step": 9840 + }, + { + "epoch": 3.170274402510662, + "grad_norm": 3.886331330341584, + "learning_rate": 2.072601416613007e-07, + "logits/chosen": -0.5392211675643921, + "logits/rejected": -0.802490234375, + "logps/chosen": -459.9750061035156, + "logps/rejected": -526.2000122070312, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.1478514671325684, + "rewards/margins": 10.466405868530273, + "rewards/rejected": -13.6171875, + "step": 9850 + }, + { + "epoch": 3.1734932002896916, + "grad_norm": 25.05005958551861, + "learning_rate": 2.0645524790727623e-07, + "logits/chosen": -0.631927490234375, + "logits/rejected": -0.8154296875, + "logps/chosen": -426.1499938964844, + "logps/rejected": -503.45001220703125, + "loss": 0.0072, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3809571266174316, + "rewards/margins": 10.03515625, + "rewards/rejected": -13.419530868530273, + "step": 9860 + }, + { + "epoch": 3.1767119980687215, + "grad_norm": 0.18398364189611238, + "learning_rate": 2.0565035415325177e-07, + "logits/chosen": -0.68524169921875, + "logits/rejected": -0.870849609375, + "logps/chosen": -476.57501220703125, + "logps/rejected": -543.9500122070312, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5990967750549316, + "rewards/margins": 10.737500190734863, + "rewards/rejected": -14.334375381469727, + "step": 9870 + }, + { + "epoch": 3.179930795847751, + "grad_norm": 0.15882007463569636, + "learning_rate": 2.0484546039922728e-07, + "logits/chosen": -0.633105456829071, + "logits/rejected": -0.8441101312637329, + "logps/chosen": -425.3500061035156, + "logps/rejected": -469.8999938964844, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.330859422683716, + "rewards/margins": 10.325780868530273, + "rewards/rejected": -13.654687881469727, + "step": 9880 + }, + { + "epoch": 3.1831495936267804, + "grad_norm": 0.6962324434615712, + "learning_rate": 2.0404056664520284e-07, + "logits/chosen": -0.791015625, + "logits/rejected": -0.89776611328125, + "logps/chosen": -430.2250061035156, + "logps/rejected": -511.95001220703125, + "loss": 0.0061, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.358593702316284, + "rewards/margins": 10.426562309265137, + "rewards/rejected": -13.796875, + "step": 9890 + }, + { + "epoch": 3.18636839140581, + "grad_norm": 0.4454876661958816, + "learning_rate": 2.0323567289117835e-07, + "logits/chosen": -0.693103015422821, + "logits/rejected": -0.9295654296875, + "logps/chosen": -445.375, + "logps/rejected": -545.5499877929688, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4009766578674316, + "rewards/margins": 10.828125, + "rewards/rejected": -14.235937118530273, + "step": 9900 + }, + { + "epoch": 3.1895871891848393, + "grad_norm": 0.22360998173953425, + "learning_rate": 2.024307791371539e-07, + "logits/chosen": -0.50469970703125, + "logits/rejected": -0.697924792766571, + "logps/chosen": -453.75, + "logps/rejected": -522.0499877929688, + "loss": 0.0177, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.267071485519409, + "rewards/margins": 10.88671875, + "rewards/rejected": -14.160937309265137, + "step": 9910 + }, + { + "epoch": 3.192805986963869, + "grad_norm": 0.23740401616840204, + "learning_rate": 2.016258853831294e-07, + "logits/chosen": -0.575714111328125, + "logits/rejected": -0.854418933391571, + "logps/chosen": -439.9750061035156, + "logps/rejected": -485.25, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.1605467796325684, + "rewards/margins": 10.456250190734863, + "rewards/rejected": -13.625, + "step": 9920 + }, + { + "epoch": 3.1960247847428986, + "grad_norm": 0.6102746818448548, + "learning_rate": 2.0082099162910496e-07, + "logits/chosen": -0.6001037359237671, + "logits/rejected": -0.8821777105331421, + "logps/chosen": -480.1000061035156, + "logps/rejected": -501.0, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.485913038253784, + "rewards/margins": 10.294530868530273, + "rewards/rejected": -13.778124809265137, + "step": 9930 + }, + { + "epoch": 3.199243582521928, + "grad_norm": 0.1484868845317087, + "learning_rate": 2.0001609787508047e-07, + "logits/chosen": -0.5502403378486633, + "logits/rejected": -0.6889709234237671, + "logps/chosen": -442.6499938964844, + "logps/rejected": -545.5, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.667724609375, + "rewards/margins": 10.661718368530273, + "rewards/rejected": -13.322656631469727, + "step": 9940 + }, + { + "epoch": 3.2024623803009575, + "grad_norm": 1.6100210541023798, + "learning_rate": 1.99211204121056e-07, + "logits/chosen": -0.810455322265625, + "logits/rejected": -0.9979492425918579, + "logps/chosen": -374.3999938964844, + "logps/rejected": -462.1499938964844, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.531054735183716, + "rewards/margins": 10.021875381469727, + "rewards/rejected": -13.551562309265137, + "step": 9950 + }, + { + "epoch": 3.205681178079987, + "grad_norm": 2.2132581457629366, + "learning_rate": 1.9840631036703155e-07, + "logits/chosen": -0.619519054889679, + "logits/rejected": -0.865527331829071, + "logps/chosen": -490.625, + "logps/rejected": -552.6749877929688, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6207032203674316, + "rewards/margins": 10.8671875, + "rewards/rejected": -14.481249809265137, + "step": 9960 + }, + { + "epoch": 3.2088999758590164, + "grad_norm": 0.4208133728721226, + "learning_rate": 1.9760141661300709e-07, + "logits/chosen": -0.509661853313446, + "logits/rejected": -0.8161880373954773, + "logps/chosen": -490.375, + "logps/rejected": -511.5, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.777148485183716, + "rewards/margins": 10.165624618530273, + "rewards/rejected": -13.9453125, + "step": 9970 + }, + { + "epoch": 3.2121187736380463, + "grad_norm": 0.3860757910755741, + "learning_rate": 1.967965228589826e-07, + "logits/chosen": -0.5793914794921875, + "logits/rejected": -0.935778796672821, + "logps/chosen": -441.3999938964844, + "logps/rejected": -499.20001220703125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.261340379714966, + "rewards/margins": 10.454687118530273, + "rewards/rejected": -13.720312118530273, + "step": 9980 + }, + { + "epoch": 3.2153375714170758, + "grad_norm": 3.0521339077682024, + "learning_rate": 1.9599162910495813e-07, + "logits/chosen": -0.6557296514511108, + "logits/rejected": -0.816540539264679, + "logps/chosen": -431.6499938964844, + "logps/rejected": -530.1749877929688, + "loss": 0.0139, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.641308546066284, + "rewards/margins": 10.1484375, + "rewards/rejected": -13.782031059265137, + "step": 9990 + }, + { + "epoch": 3.218556369196105, + "grad_norm": 0.19294479969109818, + "learning_rate": 1.9518673535093367e-07, + "logits/chosen": -0.6134887933731079, + "logits/rejected": -0.712170422077179, + "logps/chosen": -463.45001220703125, + "logps/rejected": -533.4000244140625, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.707714796066284, + "rewards/margins": 10.485937118530273, + "rewards/rejected": -14.198437690734863, + "step": 10000 + }, + { + "epoch": 3.2217751669751347, + "grad_norm": 0.4444451832874659, + "learning_rate": 1.943818415969092e-07, + "logits/chosen": -0.6468505859375, + "logits/rejected": -0.835705578327179, + "logps/chosen": -463.32501220703125, + "logps/rejected": -526.1500244140625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7408204078674316, + "rewards/margins": 10.845312118530273, + "rewards/rejected": -14.587499618530273, + "step": 10010 + }, + { + "epoch": 3.224993964754164, + "grad_norm": 0.08004133210925865, + "learning_rate": 1.9357694784288472e-07, + "logits/chosen": -0.774035632610321, + "logits/rejected": -1.029272437095642, + "logps/chosen": -402.04998779296875, + "logps/rejected": -451.625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7364745140075684, + "rewards/margins": 10.8046875, + "rewards/rejected": -14.537500381469727, + "step": 10020 + }, + { + "epoch": 3.228212762533194, + "grad_norm": 0.2527144509695844, + "learning_rate": 1.9277205408886028e-07, + "logits/chosen": -0.8017333745956421, + "logits/rejected": -1.055078148841858, + "logps/chosen": -426.8999938964844, + "logps/rejected": -488.92498779296875, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.398486375808716, + "rewards/margins": 10.356249809265137, + "rewards/rejected": -13.756250381469727, + "step": 10030 + }, + { + "epoch": 3.2314315603122234, + "grad_norm": 3.975927338649442, + "learning_rate": 1.919671603348358e-07, + "logits/chosen": -0.7174438238143921, + "logits/rejected": -0.942797839641571, + "logps/chosen": -475.7250061035156, + "logps/rejected": -504.04998779296875, + "loss": 0.0102, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.5804200172424316, + "rewards/margins": 10.30859375, + "rewards/rejected": -13.890625, + "step": 10040 + }, + { + "epoch": 3.234650358091253, + "grad_norm": 0.6329483798829164, + "learning_rate": 1.9116226658081133e-07, + "logits/chosen": -0.6496948003768921, + "logits/rejected": -0.9426025152206421, + "logps/chosen": -439.45001220703125, + "logps/rejected": -486.1499938964844, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.267040967941284, + "rewards/margins": 10.038281440734863, + "rewards/rejected": -13.3046875, + "step": 10050 + }, + { + "epoch": 3.2378691558702823, + "grad_norm": 0.2956001188105742, + "learning_rate": 1.9035737282678686e-07, + "logits/chosen": -0.4580078125, + "logits/rejected": -0.7880004644393921, + "logps/chosen": -464.75, + "logps/rejected": -504.2250061035156, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.963061571121216, + "rewards/margins": 10.147656440734863, + "rewards/rejected": -13.107812881469727, + "step": 10060 + }, + { + "epoch": 3.241087953649312, + "grad_norm": 0.11484882041846013, + "learning_rate": 1.895524790727624e-07, + "logits/chosen": -0.7980712652206421, + "logits/rejected": -0.9283508062362671, + "logps/chosen": -435.3500061035156, + "logps/rejected": -536.7000122070312, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.531543016433716, + "rewards/margins": 11.225781440734863, + "rewards/rejected": -14.756250381469727, + "step": 10070 + }, + { + "epoch": 3.2443067514283417, + "grad_norm": 0.5264107174048275, + "learning_rate": 1.887475853187379e-07, + "logits/chosen": -0.5923217535018921, + "logits/rejected": -0.88262939453125, + "logps/chosen": -489.82501220703125, + "logps/rejected": -508.625, + "loss": 0.0149, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.6783204078674316, + "rewards/margins": 10.564844131469727, + "rewards/rejected": -14.243749618530273, + "step": 10080 + }, + { + "epoch": 3.247525549207371, + "grad_norm": 0.43023288505702095, + "learning_rate": 1.8794269156471345e-07, + "logits/chosen": -0.688946545124054, + "logits/rejected": -1.0232055187225342, + "logps/chosen": -422.25, + "logps/rejected": -441.42498779296875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.423290967941284, + "rewards/margins": 9.883593559265137, + "rewards/rejected": -13.309374809265137, + "step": 10090 + }, + { + "epoch": 3.2507443469864006, + "grad_norm": 4.077183860312291, + "learning_rate": 1.8713779781068898e-07, + "logits/chosen": -0.6689788699150085, + "logits/rejected": -0.8819580078125, + "logps/chosen": -424.1499938964844, + "logps/rejected": -495.8999938964844, + "loss": 0.0099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.200439453125, + "rewards/margins": 10.266406059265137, + "rewards/rejected": -13.473437309265137, + "step": 10100 + }, + { + "epoch": 3.25396314476543, + "grad_norm": 1.3434292284728664, + "learning_rate": 1.8633290405666452e-07, + "logits/chosen": -0.613189697265625, + "logits/rejected": -0.961718738079071, + "logps/chosen": -430.2875061035156, + "logps/rejected": -495.1499938964844, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.7041993141174316, + "rewards/margins": 10.80078125, + "rewards/rejected": -14.518750190734863, + "step": 10110 + }, + { + "epoch": 3.2571819425444595, + "grad_norm": 2.874014046351652, + "learning_rate": 1.8552801030264003e-07, + "logits/chosen": -0.47099608182907104, + "logits/rejected": -0.8485199213027954, + "logps/chosen": -477.04998779296875, + "logps/rejected": -485.07501220703125, + "loss": 0.0096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.937206983566284, + "rewards/margins": 10.127344131469727, + "rewards/rejected": -13.052343368530273, + "step": 10120 + }, + { + "epoch": 3.2604007403234894, + "grad_norm": 0.22165236681266165, + "learning_rate": 1.847231165486156e-07, + "logits/chosen": -0.7222534418106079, + "logits/rejected": -0.8905029296875, + "logps/chosen": -447.5, + "logps/rejected": -510.1000061035156, + "loss": 0.0133, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.6376953125, + "rewards/margins": 10.243749618530273, + "rewards/rejected": -13.890625, + "step": 10130 + }, + { + "epoch": 3.263619538102519, + "grad_norm": 0.07247264585321504, + "learning_rate": 1.839182227945911e-07, + "logits/chosen": -0.88525390625, + "logits/rejected": -1.061181664466858, + "logps/chosen": -423.54998779296875, + "logps/rejected": -520.7750244140625, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.555712938308716, + "rewards/margins": 10.489062309265137, + "rewards/rejected": -14.0390625, + "step": 10140 + }, + { + "epoch": 3.2668383358815483, + "grad_norm": 6.728627214197773, + "learning_rate": 1.8311332904056664e-07, + "logits/chosen": -0.6941467523574829, + "logits/rejected": -0.9289596676826477, + "logps/chosen": -421.3999938964844, + "logps/rejected": -483.25, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.2021484375, + "rewards/margins": 9.607030868530273, + "rewards/rejected": -13.809374809265137, + "step": 10150 + }, + { + "epoch": 3.2700571336605777, + "grad_norm": 0.5647648700258849, + "learning_rate": 1.8230843528654215e-07, + "logits/chosen": -0.7457519769668579, + "logits/rejected": -1.0149414539337158, + "logps/chosen": -420.4750061035156, + "logps/rejected": -475.32501220703125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.385937452316284, + "rewards/margins": 10.41015625, + "rewards/rejected": -13.796875, + "step": 10160 + }, + { + "epoch": 3.273275931439607, + "grad_norm": 0.19145970281840807, + "learning_rate": 1.8150354153251771e-07, + "logits/chosen": -0.6227051019668579, + "logits/rejected": -0.777496337890625, + "logps/chosen": -454.1499938964844, + "logps/rejected": -489.8999938964844, + "loss": 0.0136, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.87548828125, + "rewards/margins": 10.126562118530273, + "rewards/rejected": -14.001562118530273, + "step": 10170 + }, + { + "epoch": 3.276494729218637, + "grad_norm": 0.217562853697584, + "learning_rate": 1.8069864777849322e-07, + "logits/chosen": -0.704724133014679, + "logits/rejected": -1.000146508216858, + "logps/chosen": -390.0249938964844, + "logps/rejected": -474.1499938964844, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4862303733825684, + "rewards/margins": 10.387499809265137, + "rewards/rejected": -13.875, + "step": 10180 + }, + { + "epoch": 3.2797135269976665, + "grad_norm": 1.952978151173215, + "learning_rate": 1.7989375402446876e-07, + "logits/chosen": -0.6843322515487671, + "logits/rejected": -0.848376452922821, + "logps/chosen": -436.0625, + "logps/rejected": -474.3999938964844, + "loss": 0.0135, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.2029387950897217, + "rewards/margins": 10.260156631469727, + "rewards/rejected": -13.462499618530273, + "step": 10190 + }, + { + "epoch": 3.282932324776696, + "grad_norm": 0.7603816986237905, + "learning_rate": 1.790888602704443e-07, + "logits/chosen": -0.840954601764679, + "logits/rejected": -1.0157349109649658, + "logps/chosen": -456.2250061035156, + "logps/rejected": -496.70001220703125, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.6214842796325684, + "rewards/margins": 10.603124618530273, + "rewards/rejected": -14.220312118530273, + "step": 10200 + }, + { + "epoch": 3.2861511225557254, + "grad_norm": 0.3370895649578437, + "learning_rate": 1.7828396651641983e-07, + "logits/chosen": -0.589019775390625, + "logits/rejected": -0.874072253704071, + "logps/chosen": -417.3500061035156, + "logps/rejected": -481.3500061035156, + "loss": 0.0222, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.6640625, + "rewards/margins": 10.239062309265137, + "rewards/rejected": -13.90625, + "step": 10210 + }, + { + "epoch": 3.289369920334755, + "grad_norm": 5.768377933562274, + "learning_rate": 1.7747907276239534e-07, + "logits/chosen": -0.775103747844696, + "logits/rejected": -0.9190918207168579, + "logps/chosen": -435.82501220703125, + "logps/rejected": -505.70001220703125, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.674023389816284, + "rewards/margins": 10.774218559265137, + "rewards/rejected": -14.446874618530273, + "step": 10220 + }, + { + "epoch": 3.2925887181137847, + "grad_norm": 1.0789393688639668, + "learning_rate": 1.7667417900837088e-07, + "logits/chosen": -0.857495129108429, + "logits/rejected": -1.0554687976837158, + "logps/chosen": -425.125, + "logps/rejected": -469.92498779296875, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.796191453933716, + "rewards/margins": 10.8203125, + "rewards/rejected": -14.6171875, + "step": 10230 + }, + { + "epoch": 3.295807515892814, + "grad_norm": 0.30526481217261636, + "learning_rate": 1.7586928525434642e-07, + "logits/chosen": -0.7433837652206421, + "logits/rejected": -0.9485839605331421, + "logps/chosen": -448.2875061035156, + "logps/rejected": -494.32501220703125, + "loss": 0.0177, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.8388671875, + "rewards/margins": 10.964062690734863, + "rewards/rejected": -14.793749809265137, + "step": 10240 + }, + { + "epoch": 3.2990263136718436, + "grad_norm": 1.7133050990723089, + "learning_rate": 1.7506439150032195e-07, + "logits/chosen": -0.6616607904434204, + "logits/rejected": -0.849926769733429, + "logps/chosen": -477.04998779296875, + "logps/rejected": -526.4249877929688, + "loss": 0.0101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2853026390075684, + "rewards/margins": 10.44921875, + "rewards/rejected": -13.7265625, + "step": 10250 + }, + { + "epoch": 3.302245111450873, + "grad_norm": 0.3979696823271677, + "learning_rate": 1.7425949774629746e-07, + "logits/chosen": -0.6968749761581421, + "logits/rejected": -0.86004638671875, + "logps/chosen": -433.875, + "logps/rejected": -503.875, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.592529296875, + "rewards/margins": 10.446093559265137, + "rewards/rejected": -14.043749809265137, + "step": 10260 + }, + { + "epoch": 3.3054639092299025, + "grad_norm": 1.3945418506002913, + "learning_rate": 1.7345460399227303e-07, + "logits/chosen": -0.6935499310493469, + "logits/rejected": -0.958666980266571, + "logps/chosen": -467.20001220703125, + "logps/rejected": -477.45001220703125, + "loss": 0.006, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.622753858566284, + "rewards/margins": 9.917187690734863, + "rewards/rejected": -13.537500381469727, + "step": 10270 + }, + { + "epoch": 3.3086827070089324, + "grad_norm": 14.229046594552312, + "learning_rate": 1.7264971023824854e-07, + "logits/chosen": -0.627978503704071, + "logits/rejected": -0.885363757610321, + "logps/chosen": -460.7250061035156, + "logps/rejected": -537.5250244140625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.813281297683716, + "rewards/margins": 10.603124618530273, + "rewards/rejected": -14.409375190734863, + "step": 10280 + }, + { + "epoch": 3.311901504787962, + "grad_norm": 0.06357262527986461, + "learning_rate": 1.7184481648422407e-07, + "logits/chosen": -0.6296142339706421, + "logits/rejected": -0.8638855218887329, + "logps/chosen": -415.1000061035156, + "logps/rejected": -489.04998779296875, + "loss": 0.0059, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.5985350608825684, + "rewards/margins": 9.995312690734863, + "rewards/rejected": -13.584375381469727, + "step": 10290 + }, + { + "epoch": 3.3151203025669913, + "grad_norm": 0.2400711945550553, + "learning_rate": 1.710399227301996e-07, + "logits/chosen": -0.6626952886581421, + "logits/rejected": -0.861956775188446, + "logps/chosen": -422.32501220703125, + "logps/rejected": -485.125, + "loss": 0.0137, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.4297852516174316, + "rewards/margins": 10.51171875, + "rewards/rejected": -13.9453125, + "step": 10300 + }, + { + "epoch": 3.3183391003460208, + "grad_norm": 0.12231092835002914, + "learning_rate": 1.7023502897617515e-07, + "logits/chosen": -0.4934326112270355, + "logits/rejected": -0.794970691204071, + "logps/chosen": -459.75, + "logps/rejected": -515.9000244140625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1568360328674316, + "rewards/margins": 11.102343559265137, + "rewards/rejected": -14.260937690734863, + "step": 10310 + }, + { + "epoch": 3.32155789812505, + "grad_norm": 1.8330437329689664, + "learning_rate": 1.6943013522215066e-07, + "logits/chosen": -0.570483386516571, + "logits/rejected": -0.7862793207168579, + "logps/chosen": -472.3500061035156, + "logps/rejected": -539.0750122070312, + "loss": 0.0101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8121581077575684, + "rewards/margins": 10.13671875, + "rewards/rejected": -12.946874618530273, + "step": 10320 + }, + { + "epoch": 3.3247766959040796, + "grad_norm": 1.4718767916059177, + "learning_rate": 1.686252414681262e-07, + "logits/chosen": -0.4749206602573395, + "logits/rejected": -0.8125976324081421, + "logps/chosen": -454.5249938964844, + "logps/rejected": -486.42498779296875, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.7914185523986816, + "rewards/margins": 10.413281440734863, + "rewards/rejected": -13.20703125, + "step": 10330 + }, + { + "epoch": 3.3279954936831095, + "grad_norm": 0.14385879222654616, + "learning_rate": 1.6782034771410173e-07, + "logits/chosen": -0.574017345905304, + "logits/rejected": -0.812304675579071, + "logps/chosen": -453.7250061035156, + "logps/rejected": -525.5250244140625, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1026854515075684, + "rewards/margins": 10.291406631469727, + "rewards/rejected": -13.393750190734863, + "step": 10340 + }, + { + "epoch": 3.331214291462139, + "grad_norm": 0.4817397502444063, + "learning_rate": 1.6701545396007727e-07, + "logits/chosen": -0.719573974609375, + "logits/rejected": -0.9063965082168579, + "logps/chosen": -403.92498779296875, + "logps/rejected": -500.29998779296875, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.215429782867432, + "rewards/margins": 10.660937309265137, + "rewards/rejected": -14.879687309265137, + "step": 10350 + }, + { + "epoch": 3.3344330892411684, + "grad_norm": 0.12328072039228925, + "learning_rate": 1.6621056020605278e-07, + "logits/chosen": -0.7245117425918579, + "logits/rejected": -0.9367920160293579, + "logps/chosen": -438.92498779296875, + "logps/rejected": -491.0, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1252503395080566, + "rewards/margins": 10.73046875, + "rewards/rejected": -13.854687690734863, + "step": 10360 + }, + { + "epoch": 3.337651887020198, + "grad_norm": 0.23574189719939692, + "learning_rate": 1.6540566645202834e-07, + "logits/chosen": -0.700390636920929, + "logits/rejected": -0.944531261920929, + "logps/chosen": -433.4750061035156, + "logps/rejected": -490.25, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.36083984375, + "rewards/margins": 10.681249618530273, + "rewards/rejected": -14.046875, + "step": 10370 + }, + { + "epoch": 3.3408706847992273, + "grad_norm": 0.09139571813198634, + "learning_rate": 1.6460077269800385e-07, + "logits/chosen": -0.707592785358429, + "logits/rejected": -0.750457763671875, + "logps/chosen": -439.32501220703125, + "logps/rejected": -516.4500122070312, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.974072217941284, + "rewards/margins": 10.357030868530273, + "rewards/rejected": -14.337499618530273, + "step": 10380 + }, + { + "epoch": 3.3440894825782568, + "grad_norm": 0.13970373809774145, + "learning_rate": 1.637958789439794e-07, + "logits/chosen": -0.761645495891571, + "logits/rejected": -1.069726586341858, + "logps/chosen": -419.1000061035156, + "logps/rejected": -479.8999938964844, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.557910203933716, + "rewards/margins": 10.268750190734863, + "rewards/rejected": -13.834375381469727, + "step": 10390 + }, + { + "epoch": 3.3473082803572867, + "grad_norm": 0.39102161824363035, + "learning_rate": 1.629909851899549e-07, + "logits/chosen": -0.775378406047821, + "logits/rejected": -0.9048827886581421, + "logps/chosen": -462.8500061035156, + "logps/rejected": -525.2999877929688, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.8248534202575684, + "rewards/margins": 10.565625190734863, + "rewards/rejected": -14.390625, + "step": 10400 + }, + { + "epoch": 3.350527078136316, + "grad_norm": 0.33551742493013886, + "learning_rate": 1.6218609143593046e-07, + "logits/chosen": -0.68133544921875, + "logits/rejected": -0.780896008014679, + "logps/chosen": -432.125, + "logps/rejected": -474.70001220703125, + "loss": 0.0137, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.340625047683716, + "rewards/margins": 10.125781059265137, + "rewards/rejected": -13.477343559265137, + "step": 10410 + }, + { + "epoch": 3.3537458759153456, + "grad_norm": 2.4574492254559117, + "learning_rate": 1.6138119768190597e-07, + "logits/chosen": -0.6961425542831421, + "logits/rejected": -0.8793700933456421, + "logps/chosen": -471.2250061035156, + "logps/rejected": -527.9000244140625, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.993847608566284, + "rewards/margins": 10.540624618530273, + "rewards/rejected": -14.537500381469727, + "step": 10420 + }, + { + "epoch": 3.356964673694375, + "grad_norm": 0.6997201296564323, + "learning_rate": 1.605763039278815e-07, + "logits/chosen": -0.672039806842804, + "logits/rejected": -0.8965820074081421, + "logps/chosen": -430.7749938964844, + "logps/rejected": -498.57501220703125, + "loss": 0.0135, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.227832078933716, + "rewards/margins": 10.286718368530273, + "rewards/rejected": -13.518750190734863, + "step": 10430 + }, + { + "epoch": 3.3601834714734045, + "grad_norm": 0.1344462483647303, + "learning_rate": 1.5977141017385705e-07, + "logits/chosen": -0.6821746826171875, + "logits/rejected": -0.88336181640625, + "logps/chosen": -462.7250061035156, + "logps/rejected": -524.0250244140625, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.498046875, + "rewards/margins": 11.008593559265137, + "rewards/rejected": -14.504687309265137, + "step": 10440 + }, + { + "epoch": 3.3634022692524344, + "grad_norm": 3.845438717660343, + "learning_rate": 1.5896651641983258e-07, + "logits/chosen": -0.722607433795929, + "logits/rejected": -0.8047119379043579, + "logps/chosen": -421.70001220703125, + "logps/rejected": -512.5999755859375, + "loss": 0.0135, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.9169921875, + "rewards/margins": 10.713281631469727, + "rewards/rejected": -14.635937690734863, + "step": 10450 + }, + { + "epoch": 3.366621067031464, + "grad_norm": 0.3271238534258152, + "learning_rate": 1.581616226658081e-07, + "logits/chosen": -0.7420593500137329, + "logits/rejected": -0.890637218952179, + "logps/chosen": -434.7250061035156, + "logps/rejected": -505.1499938964844, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.582226514816284, + "rewards/margins": 10.581250190734863, + "rewards/rejected": -14.1640625, + "step": 10460 + }, + { + "epoch": 3.3698398648104932, + "grad_norm": 1.833628752679259, + "learning_rate": 1.5735672891178363e-07, + "logits/chosen": -0.624041736125946, + "logits/rejected": -0.8067626953125, + "logps/chosen": -444.17498779296875, + "logps/rejected": -502.25, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.036425590515137, + "rewards/margins": 10.323437690734863, + "rewards/rejected": -14.3671875, + "step": 10470 + }, + { + "epoch": 3.3730586625895227, + "grad_norm": 0.7943086324176684, + "learning_rate": 1.5655183515775917e-07, + "logits/chosen": -0.758984386920929, + "logits/rejected": -0.96826171875, + "logps/chosen": -476.04998779296875, + "logps/rejected": -508.0, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6509766578674316, + "rewards/margins": 10.446874618530273, + "rewards/rejected": -14.1015625, + "step": 10480 + }, + { + "epoch": 3.376277460368552, + "grad_norm": 0.6622665956865464, + "learning_rate": 1.557469414037347e-07, + "logits/chosen": -0.714874267578125, + "logits/rejected": -0.906787097454071, + "logps/chosen": -446.70001220703125, + "logps/rejected": -511.9750061035156, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7803711891174316, + "rewards/margins": 10.810937881469727, + "rewards/rejected": -14.596875190734863, + "step": 10490 + }, + { + "epoch": 3.379496258147582, + "grad_norm": 1.9540314388939348, + "learning_rate": 1.5494204764971021e-07, + "logits/chosen": -0.767254650592804, + "logits/rejected": -0.941986083984375, + "logps/chosen": -466.79998779296875, + "logps/rejected": -514.5750122070312, + "loss": 0.0096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.978808641433716, + "rewards/margins": 11.056249618530273, + "rewards/rejected": -15.037500381469727, + "step": 10500 + }, + { + "epoch": 3.3827150559266115, + "grad_norm": 0.41904990860902386, + "learning_rate": 1.5413715389568578e-07, + "logits/chosen": -0.827929675579071, + "logits/rejected": -0.9854736328125, + "logps/chosen": -441.82501220703125, + "logps/rejected": -494.6499938964844, + "loss": 0.0124, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.421679735183716, + "rewards/margins": 10.301562309265137, + "rewards/rejected": -13.720312118530273, + "step": 10510 + }, + { + "epoch": 3.385933853705641, + "grad_norm": 1.0015489037185834, + "learning_rate": 1.533322601416613e-07, + "logits/chosen": -0.6612548828125, + "logits/rejected": -0.857983410358429, + "logps/chosen": -404.3500061035156, + "logps/rejected": -499.5249938964844, + "loss": 0.0089, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.9253907203674316, + "rewards/margins": 10.585156440734863, + "rewards/rejected": -14.512499809265137, + "step": 10520 + }, + { + "epoch": 3.3891526514846704, + "grad_norm": 0.47116560368526866, + "learning_rate": 1.5252736638763682e-07, + "logits/chosen": -0.6803344488143921, + "logits/rejected": -0.861560046672821, + "logps/chosen": -456.625, + "logps/rejected": -504.20001220703125, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.461718797683716, + "rewards/margins": 10.62109375, + "rewards/rejected": -14.079687118530273, + "step": 10530 + }, + { + "epoch": 3.3923714492637, + "grad_norm": 2.58065010766101, + "learning_rate": 1.5172247263361233e-07, + "logits/chosen": -0.685961902141571, + "logits/rejected": -1.0260498523712158, + "logps/chosen": -443.70001220703125, + "logps/rejected": -519.4000244140625, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.763964891433716, + "rewards/margins": 10.33984375, + "rewards/rejected": -14.109375, + "step": 10540 + }, + { + "epoch": 3.3955902470427297, + "grad_norm": 0.15672985183910548, + "learning_rate": 1.509175788795879e-07, + "logits/chosen": -0.7437744140625, + "logits/rejected": -0.886138916015625, + "logps/chosen": -451.4624938964844, + "logps/rejected": -528.7750244140625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.21173095703125, + "rewards/margins": 10.913281440734863, + "rewards/rejected": -14.129687309265137, + "step": 10550 + }, + { + "epoch": 3.398809044821759, + "grad_norm": 0.5250205505848603, + "learning_rate": 1.501126851255634e-07, + "logits/chosen": -0.7090209722518921, + "logits/rejected": -0.9725097417831421, + "logps/chosen": -426.29998779296875, + "logps/rejected": -481.6499938964844, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9012694358825684, + "rewards/margins": 10.307031631469727, + "rewards/rejected": -14.209375381469727, + "step": 10560 + }, + { + "epoch": 3.4020278426007886, + "grad_norm": 0.39634361165314996, + "learning_rate": 1.4930779137153894e-07, + "logits/chosen": -0.599682629108429, + "logits/rejected": -0.9242919683456421, + "logps/chosen": -475.6000061035156, + "logps/rejected": -514.7000122070312, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.147363185882568, + "rewards/margins": 10.502344131469727, + "rewards/rejected": -14.646875381469727, + "step": 10570 + }, + { + "epoch": 3.405246640379818, + "grad_norm": 0.10236741554989712, + "learning_rate": 1.4850289761751448e-07, + "logits/chosen": -0.6856933832168579, + "logits/rejected": -0.877148449420929, + "logps/chosen": -472.5, + "logps/rejected": -503.3500061035156, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.969531297683716, + "rewards/margins": 10.513280868530273, + "rewards/rejected": -14.482812881469727, + "step": 10580 + }, + { + "epoch": 3.4084654381588475, + "grad_norm": 0.8261518345742542, + "learning_rate": 1.4769800386349002e-07, + "logits/chosen": -0.734790027141571, + "logits/rejected": -0.8880859613418579, + "logps/chosen": -443.1000061035156, + "logps/rejected": -499.6000061035156, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8687500953674316, + "rewards/margins": 10.21875, + "rewards/rejected": -14.100000381469727, + "step": 10590 + }, + { + "epoch": 3.4116842359378774, + "grad_norm": 0.7922928283030919, + "learning_rate": 1.4689311010946553e-07, + "logits/chosen": -0.7701050043106079, + "logits/rejected": -1.001977562904358, + "logps/chosen": -406.2749938964844, + "logps/rejected": -464.54998779296875, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7569336891174316, + "rewards/margins": 10.5546875, + "rewards/rejected": -14.3125, + "step": 10600 + }, + { + "epoch": 3.414903033716907, + "grad_norm": 42.30864369946298, + "learning_rate": 1.460882163554411e-07, + "logits/chosen": -0.6590820550918579, + "logits/rejected": -0.8947998285293579, + "logps/chosen": -451.9750061035156, + "logps/rejected": -507.07501220703125, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.565502882003784, + "rewards/margins": 10.421875, + "rewards/rejected": -13.984375, + "step": 10610 + }, + { + "epoch": 3.4181218314959363, + "grad_norm": 0.11970173880321525, + "learning_rate": 1.452833226014166e-07, + "logits/chosen": -0.791369616985321, + "logits/rejected": -1.0011475086212158, + "logps/chosen": -434.3500061035156, + "logps/rejected": -518.7000122070312, + "loss": 0.0052, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.724316358566284, + "rewards/margins": 10.767187118530273, + "rewards/rejected": -14.493749618530273, + "step": 10620 + }, + { + "epoch": 3.4213406292749657, + "grad_norm": 0.8121184083199166, + "learning_rate": 1.4447842884739214e-07, + "logits/chosen": -0.7388824224472046, + "logits/rejected": -0.92657470703125, + "logps/chosen": -456.7749938964844, + "logps/rejected": -537.7999877929688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.272558689117432, + "rewards/margins": 10.860937118530273, + "rewards/rejected": -15.1328125, + "step": 10630 + }, + { + "epoch": 3.424559427053995, + "grad_norm": 1.1509391024933457, + "learning_rate": 1.4367353509336765e-07, + "logits/chosen": -0.673443615436554, + "logits/rejected": -0.8910461664199829, + "logps/chosen": -418.3999938964844, + "logps/rejected": -475.5, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8487305641174316, + "rewards/margins": 10.41796875, + "rewards/rejected": -14.265625, + "step": 10640 + }, + { + "epoch": 3.427778224833025, + "grad_norm": 0.9213148387588964, + "learning_rate": 1.428686413393432e-07, + "logits/chosen": -0.573504626750946, + "logits/rejected": -0.76031494140625, + "logps/chosen": -416.11248779296875, + "logps/rejected": -462.25, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.8594727516174316, + "rewards/margins": 10.365625381469727, + "rewards/rejected": -14.22265625, + "step": 10650 + }, + { + "epoch": 3.4309970226120545, + "grad_norm": 1.2814320236994485, + "learning_rate": 1.4206374758531872e-07, + "logits/chosen": -0.69879150390625, + "logits/rejected": -0.9196411371231079, + "logps/chosen": -445.42498779296875, + "logps/rejected": -511.5, + "loss": 0.0056, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.709277391433716, + "rewards/margins": 10.582812309265137, + "rewards/rejected": -14.301562309265137, + "step": 10660 + }, + { + "epoch": 3.434215820391084, + "grad_norm": 0.1997536232087831, + "learning_rate": 1.4125885383129426e-07, + "logits/chosen": -0.53887939453125, + "logits/rejected": -0.773876965045929, + "logps/chosen": -446.8500061035156, + "logps/rejected": -511.3500061035156, + "loss": 0.0194, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.391894578933716, + "rewards/margins": 10.177343368530273, + "rewards/rejected": -13.560937881469727, + "step": 10670 + }, + { + "epoch": 3.4374346181701134, + "grad_norm": 0.9107032371072225, + "learning_rate": 1.404539600772698e-07, + "logits/chosen": -0.82965087890625, + "logits/rejected": -1.000244140625, + "logps/chosen": -430.20001220703125, + "logps/rejected": -520.8499755859375, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.9945311546325684, + "rewards/margins": 10.314844131469727, + "rewards/rejected": -14.309374809265137, + "step": 10680 + }, + { + "epoch": 3.440653415949143, + "grad_norm": 0.5353507819938572, + "learning_rate": 1.3964906632324533e-07, + "logits/chosen": -0.654254138469696, + "logits/rejected": -0.9794677495956421, + "logps/chosen": -403.04998779296875, + "logps/rejected": -477.0, + "loss": 0.0178, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9012694358825684, + "rewards/margins": 10.881250381469727, + "rewards/rejected": -14.774999618530273, + "step": 10690 + }, + { + "epoch": 3.4438722137281728, + "grad_norm": 0.16047644885782153, + "learning_rate": 1.3884417256922084e-07, + "logits/chosen": -0.7689880132675171, + "logits/rejected": -0.992419421672821, + "logps/chosen": -432.7124938964844, + "logps/rejected": -466.8500061035156, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.534350633621216, + "rewards/margins": 10.598437309265137, + "rewards/rejected": -14.126562118530273, + "step": 10700 + }, + { + "epoch": 3.447091011507202, + "grad_norm": 0.4275292479485586, + "learning_rate": 1.3803927881519638e-07, + "logits/chosen": -0.591351330280304, + "logits/rejected": -0.8939453363418579, + "logps/chosen": -480.04998779296875, + "logps/rejected": -482.875, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.024853467941284, + "rewards/margins": 10.637499809265137, + "rewards/rejected": -13.659375190734863, + "step": 10710 + }, + { + "epoch": 3.4503098092862317, + "grad_norm": 3.451444826487016, + "learning_rate": 1.3723438506117192e-07, + "logits/chosen": -0.7418212890625, + "logits/rejected": -0.7838134765625, + "logps/chosen": -411.3374938964844, + "logps/rejected": -485.92498779296875, + "loss": 0.0059, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.092871189117432, + "rewards/margins": 10.322656631469727, + "rewards/rejected": -14.415624618530273, + "step": 10720 + }, + { + "epoch": 3.453528607065261, + "grad_norm": 0.24987699084721546, + "learning_rate": 1.3642949130714745e-07, + "logits/chosen": -0.573803722858429, + "logits/rejected": -0.850634753704071, + "logps/chosen": -464.92498779296875, + "logps/rejected": -530.9000244140625, + "loss": 0.0136, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.713671922683716, + "rewards/margins": 10.618749618530273, + "rewards/rejected": -14.326562881469727, + "step": 10730 + }, + { + "epoch": 3.4567474048442905, + "grad_norm": 0.1383069929821788, + "learning_rate": 1.3562459755312296e-07, + "logits/chosen": -0.608642578125, + "logits/rejected": -0.8260742425918579, + "logps/chosen": -482.57501220703125, + "logps/rejected": -551.5, + "loss": 0.0099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.047656059265137, + "rewards/margins": 11.017187118530273, + "rewards/rejected": -15.050000190734863, + "step": 10740 + }, + { + "epoch": 3.45996620262332, + "grad_norm": 0.41414605834029183, + "learning_rate": 1.3481970379909853e-07, + "logits/chosen": -0.63934326171875, + "logits/rejected": -0.828417956829071, + "logps/chosen": -447.1499938964844, + "logps/rejected": -502.1000061035156, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.533679246902466, + "rewards/margins": 10.184374809265137, + "rewards/rejected": -13.714062690734863, + "step": 10750 + }, + { + "epoch": 3.46318500040235, + "grad_norm": 0.25363120172361575, + "learning_rate": 1.3401481004507404e-07, + "logits/chosen": -0.601330578327179, + "logits/rejected": -0.8365234136581421, + "logps/chosen": -459.70001220703125, + "logps/rejected": -483.6000061035156, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.094531297683716, + "rewards/margins": 10.40625, + "rewards/rejected": -13.498437881469727, + "step": 10760 + }, + { + "epoch": 3.4664037981813793, + "grad_norm": 0.7358174017020472, + "learning_rate": 1.3320991629104957e-07, + "logits/chosen": -0.7226928472518921, + "logits/rejected": -0.998046875, + "logps/chosen": -435.1000061035156, + "logps/rejected": -501.0, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.6124510765075684, + "rewards/margins": 10.560937881469727, + "rewards/rejected": -14.168749809265137, + "step": 10770 + }, + { + "epoch": 3.469622595960409, + "grad_norm": 0.2983162282214547, + "learning_rate": 1.3240502253702508e-07, + "logits/chosen": -0.78326416015625, + "logits/rejected": -0.945117175579071, + "logps/chosen": -495.8999938964844, + "logps/rejected": -536.3499755859375, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.133544921875, + "rewards/margins": 10.5625, + "rewards/rejected": -13.692187309265137, + "step": 10780 + }, + { + "epoch": 3.4728413937394382, + "grad_norm": 1.5208905331313687, + "learning_rate": 1.3160012878300065e-07, + "logits/chosen": -0.571673572063446, + "logits/rejected": -0.742907702922821, + "logps/chosen": -439.5249938964844, + "logps/rejected": -484.70001220703125, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.365771532058716, + "rewards/margins": 10.282031059265137, + "rewards/rejected": -13.646875381469727, + "step": 10790 + }, + { + "epoch": 3.4760601915184677, + "grad_norm": 0.1997560701597425, + "learning_rate": 1.3079523502897616e-07, + "logits/chosen": -0.6420532464981079, + "logits/rejected": -0.9231201410293579, + "logps/chosen": -381.25, + "logps/rejected": -445.3999938964844, + "loss": 0.0052, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.5298829078674316, + "rewards/margins": 9.924219131469727, + "rewards/rejected": -13.4609375, + "step": 10800 + }, + { + "epoch": 3.4792789892974976, + "grad_norm": 0.33092288376321194, + "learning_rate": 1.299903412749517e-07, + "logits/chosen": -0.809399425983429, + "logits/rejected": -0.987872302532196, + "logps/chosen": -415.8999938964844, + "logps/rejected": -521.5999755859375, + "loss": 0.0059, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3563475608825684, + "rewards/margins": 11.4453125, + "rewards/rejected": -14.800000190734863, + "step": 10810 + }, + { + "epoch": 3.482497787076527, + "grad_norm": 0.3205372487567149, + "learning_rate": 1.2918544752092723e-07, + "logits/chosen": -0.668896496295929, + "logits/rejected": -0.867504894733429, + "logps/chosen": -458.7250061035156, + "logps/rejected": -526.0499877929688, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.9751954078674316, + "rewards/margins": 10.506250381469727, + "rewards/rejected": -14.4765625, + "step": 10820 + }, + { + "epoch": 3.4857165848555565, + "grad_norm": 0.27841522954367576, + "learning_rate": 1.2838055376690277e-07, + "logits/chosen": -0.7272415161132812, + "logits/rejected": -0.9171142578125, + "logps/chosen": -441.42498779296875, + "logps/rejected": -490.8500061035156, + "loss": 0.019, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.7835450172424316, + "rewards/margins": 9.954687118530273, + "rewards/rejected": -13.745312690734863, + "step": 10830 + }, + { + "epoch": 3.488935382634586, + "grad_norm": 0.21833924632294016, + "learning_rate": 1.2757566001287828e-07, + "logits/chosen": -0.722198486328125, + "logits/rejected": -0.9811035394668579, + "logps/chosen": -435.20001220703125, + "logps/rejected": -508.29998779296875, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.956225633621216, + "rewards/margins": 10.764843940734863, + "rewards/rejected": -14.714062690734863, + "step": 10840 + }, + { + "epoch": 3.4921541804136154, + "grad_norm": 1.368905085166863, + "learning_rate": 1.2677076625885384e-07, + "logits/chosen": -0.621142566204071, + "logits/rejected": -0.964184582233429, + "logps/chosen": -482.17498779296875, + "logps/rejected": -523.6500244140625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.880615234375, + "rewards/margins": 10.948437690734863, + "rewards/rejected": -14.8203125, + "step": 10850 + }, + { + "epoch": 3.495372978192645, + "grad_norm": 0.2889026437044651, + "learning_rate": 1.2596587250482935e-07, + "logits/chosen": -0.62017822265625, + "logits/rejected": -0.9713500738143921, + "logps/chosen": -508.125, + "logps/rejected": -538.5, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.761035203933716, + "rewards/margins": 10.685937881469727, + "rewards/rejected": -14.454687118530273, + "step": 10860 + }, + { + "epoch": 3.4985917759716747, + "grad_norm": 2.349883881164722, + "learning_rate": 1.251609787508049e-07, + "logits/chosen": -0.84002685546875, + "logits/rejected": -0.957385241985321, + "logps/chosen": -446.8999938964844, + "logps/rejected": -518.2000122070312, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.9129881858825684, + "rewards/margins": 10.899999618530273, + "rewards/rejected": -14.815625190734863, + "step": 10870 + }, + { + "epoch": 3.501810573750704, + "grad_norm": 0.6051587151420811, + "learning_rate": 1.2435608499678042e-07, + "logits/chosen": -0.8535400629043579, + "logits/rejected": -1.012170433998108, + "logps/chosen": -462.54998779296875, + "logps/rejected": -503.20001220703125, + "loss": 0.0074, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.073437690734863, + "rewards/margins": 10.814062118530273, + "rewards/rejected": -14.890625, + "step": 10880 + }, + { + "epoch": 3.5050293715297336, + "grad_norm": 7.15634383290402, + "learning_rate": 1.2355119124275596e-07, + "logits/chosen": -0.687835693359375, + "logits/rejected": -0.9671386480331421, + "logps/chosen": -466.79998779296875, + "logps/rejected": -518.8499755859375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5447998046875, + "rewards/margins": 11.3046875, + "rewards/rejected": -14.848437309265137, + "step": 10890 + }, + { + "epoch": 3.508248169308763, + "grad_norm": 0.21621501062097506, + "learning_rate": 1.227462974887315e-07, + "logits/chosen": -0.6602843999862671, + "logits/rejected": -0.8871399164199829, + "logps/chosen": -465.57501220703125, + "logps/rejected": -531.7999877929688, + "loss": 0.0089, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7992186546325684, + "rewards/margins": 10.7578125, + "rewards/rejected": -14.557812690734863, + "step": 10900 + }, + { + "epoch": 3.5114669670877925, + "grad_norm": 0.08046630375302363, + "learning_rate": 1.21941403734707e-07, + "logits/chosen": -0.744049072265625, + "logits/rejected": -0.977673351764679, + "logps/chosen": -434.625, + "logps/rejected": -499.79998779296875, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.619433641433716, + "rewards/margins": 10.800000190734863, + "rewards/rejected": -14.426562309265137, + "step": 10910 + }, + { + "epoch": 3.5146857648668224, + "grad_norm": 0.2855889125895367, + "learning_rate": 1.2113650998068254e-07, + "logits/chosen": -0.8382202386856079, + "logits/rejected": -1.0453262329101562, + "logps/chosen": -423.79998779296875, + "logps/rejected": -494.04998779296875, + "loss": 0.0063, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.125878810882568, + "rewards/margins": 10.796093940734863, + "rewards/rejected": -14.9140625, + "step": 10920 + }, + { + "epoch": 3.517904562645852, + "grad_norm": 1.17561228587212, + "learning_rate": 1.2033161622665808e-07, + "logits/chosen": -0.546490490436554, + "logits/rejected": -0.798388659954071, + "logps/chosen": -472.6499938964844, + "logps/rejected": -506.75, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.162451267242432, + "rewards/margins": 10.04296875, + "rewards/rejected": -14.207812309265137, + "step": 10930 + }, + { + "epoch": 3.5211233604248813, + "grad_norm": 0.2250635378869385, + "learning_rate": 1.1952672247263362e-07, + "logits/chosen": -0.6241455078125, + "logits/rejected": -0.900677502155304, + "logps/chosen": -470.1499938964844, + "logps/rejected": -525.4500122070312, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.092187404632568, + "rewards/margins": 10.568750381469727, + "rewards/rejected": -14.665624618530273, + "step": 10940 + }, + { + "epoch": 3.5243421582039107, + "grad_norm": 5.642306189479214, + "learning_rate": 1.1872182871860914e-07, + "logits/chosen": -0.653820812702179, + "logits/rejected": -0.9058593511581421, + "logps/chosen": -409.82501220703125, + "logps/rejected": -499.5, + "loss": 0.015, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.8011717796325684, + "rewards/margins": 10.450780868530273, + "rewards/rejected": -14.260156631469727, + "step": 10950 + }, + { + "epoch": 3.52756095598294, + "grad_norm": 0.5591102304594336, + "learning_rate": 1.1791693496458468e-07, + "logits/chosen": -0.6981598138809204, + "logits/rejected": -0.9189208745956421, + "logps/chosen": -463.25, + "logps/rejected": -529.5999755859375, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.576709032058716, + "rewards/margins": 10.598437309265137, + "rewards/rejected": -14.172656059265137, + "step": 10960 + }, + { + "epoch": 3.53077975376197, + "grad_norm": 0.9304308149663253, + "learning_rate": 1.171120412105602e-07, + "logits/chosen": -0.533856213092804, + "logits/rejected": -0.833740234375, + "logps/chosen": -467.75, + "logps/rejected": -501.375, + "loss": 0.0179, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.5003905296325684, + "rewards/margins": 10.69140625, + "rewards/rejected": -14.201562881469727, + "step": 10970 + }, + { + "epoch": 3.5339985515409995, + "grad_norm": 0.3689991190905707, + "learning_rate": 1.1630714745653574e-07, + "logits/chosen": -0.7017120122909546, + "logits/rejected": -1.04974365234375, + "logps/chosen": -457.3500061035156, + "logps/rejected": -499.8999938964844, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.996777296066284, + "rewards/margins": 10.628125190734863, + "rewards/rejected": -14.628125190734863, + "step": 10980 + }, + { + "epoch": 3.537217349320029, + "grad_norm": 0.2343056560749886, + "learning_rate": 1.1550225370251126e-07, + "logits/chosen": -0.80340576171875, + "logits/rejected": -0.9451904296875, + "logps/chosen": -429.4750061035156, + "logps/rejected": -502.6000061035156, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3236327171325684, + "rewards/margins": 10.576562881469727, + "rewards/rejected": -13.896875381469727, + "step": 10990 + }, + { + "epoch": 3.5404361470990584, + "grad_norm": 2.4828918068529497, + "learning_rate": 1.146973599484868e-07, + "logits/chosen": -0.8696044683456421, + "logits/rejected": -1.027587890625, + "logps/chosen": -439.6499938964844, + "logps/rejected": -506.29998779296875, + "loss": 0.0158, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.941650390625, + "rewards/margins": 10.6953125, + "rewards/rejected": -14.635937690734863, + "step": 11000 + }, + { + "epoch": 3.543654944878088, + "grad_norm": 0.7502152721685494, + "learning_rate": 1.1389246619446232e-07, + "logits/chosen": -0.5247253179550171, + "logits/rejected": -0.842742919921875, + "logps/chosen": -474.0, + "logps/rejected": -472.4750061035156, + "loss": 0.0136, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.8875975608825684, + "rewards/margins": 10.482812881469727, + "rewards/rejected": -13.368749618530273, + "step": 11010 + }, + { + "epoch": 3.5468737426571177, + "grad_norm": 0.24274151706415079, + "learning_rate": 1.1308757244043786e-07, + "logits/chosen": -0.8032470941543579, + "logits/rejected": -1.0098755359649658, + "logps/chosen": -450.92498779296875, + "logps/rejected": -521.7999877929688, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.7997069358825684, + "rewards/margins": 10.346094131469727, + "rewards/rejected": -14.143750190734863, + "step": 11020 + }, + { + "epoch": 3.550092540436147, + "grad_norm": 3.840495705982623, + "learning_rate": 1.122826786864134e-07, + "logits/chosen": -0.7690063714981079, + "logits/rejected": -1.035302758216858, + "logps/chosen": -459.875, + "logps/rejected": -505.0, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.6526856422424316, + "rewards/margins": 10.817968368530273, + "rewards/rejected": -14.470312118530273, + "step": 11030 + }, + { + "epoch": 3.5533113382151766, + "grad_norm": 3.926213090426727, + "learning_rate": 1.1147778493238892e-07, + "logits/chosen": -0.6629272699356079, + "logits/rejected": -0.91796875, + "logps/chosen": -443.5, + "logps/rejected": -498.1499938964844, + "loss": 0.0058, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.243945121765137, + "rewards/margins": 10.600781440734863, + "rewards/rejected": -14.850000381469727, + "step": 11040 + }, + { + "epoch": 3.556530135994206, + "grad_norm": 0.22787208213716809, + "learning_rate": 1.1067289117836446e-07, + "logits/chosen": -0.677929699420929, + "logits/rejected": -1.01611328125, + "logps/chosen": -488.8500061035156, + "logps/rejected": -519.9500122070312, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.800976514816284, + "rewards/margins": 10.69921875, + "rewards/rejected": -14.4921875, + "step": 11050 + }, + { + "epoch": 3.5597489337732355, + "grad_norm": 0.6559582306819219, + "learning_rate": 1.0986799742433998e-07, + "logits/chosen": -0.741955578327179, + "logits/rejected": -0.9928833246231079, + "logps/chosen": -452.6000061035156, + "logps/rejected": -505.75, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.416748046875, + "rewards/margins": 10.751562118530273, + "rewards/rejected": -15.168749809265137, + "step": 11060 + }, + { + "epoch": 3.5629677315522654, + "grad_norm": 0.6152873410229347, + "learning_rate": 1.0906310367031552e-07, + "logits/chosen": -0.610913097858429, + "logits/rejected": -0.8073486089706421, + "logps/chosen": -410.875, + "logps/rejected": -493.5249938964844, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.690136671066284, + "rewards/margins": 10.7890625, + "rewards/rejected": -14.481249809265137, + "step": 11070 + }, + { + "epoch": 3.566186529331295, + "grad_norm": 0.22787367113332022, + "learning_rate": 1.0825820991629105e-07, + "logits/chosen": -0.6688903570175171, + "logits/rejected": -0.8340514898300171, + "logps/chosen": -393.4624938964844, + "logps/rejected": -485.04998779296875, + "loss": 0.0182, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.7482666969299316, + "rewards/margins": 9.853124618530273, + "rewards/rejected": -13.604687690734863, + "step": 11080 + }, + { + "epoch": 3.5694053271103243, + "grad_norm": 0.9289759755825173, + "learning_rate": 1.0745331616226658e-07, + "logits/chosen": -0.731860339641571, + "logits/rejected": -1.014917016029358, + "logps/chosen": -444.3999938964844, + "logps/rejected": -493.54998779296875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.068359375, + "rewards/margins": 11.221875190734863, + "rewards/rejected": -15.300000190734863, + "step": 11090 + }, + { + "epoch": 3.5726241248893538, + "grad_norm": 0.40277644190191453, + "learning_rate": 1.0664842240824211e-07, + "logits/chosen": -0.689746081829071, + "logits/rejected": -0.852185070514679, + "logps/chosen": -434.3999938964844, + "logps/rejected": -532.2000122070312, + "loss": 0.0056, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.982226610183716, + "rewards/margins": 10.974218368530273, + "rewards/rejected": -14.951562881469727, + "step": 11100 + }, + { + "epoch": 3.575842922668383, + "grad_norm": 6.953108230277364, + "learning_rate": 1.0584352865421764e-07, + "logits/chosen": -0.651959240436554, + "logits/rejected": -0.9585815668106079, + "logps/chosen": -449.875, + "logps/rejected": -511.0, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.04290771484375, + "rewards/margins": 10.43359375, + "rewards/rejected": -14.46875, + "step": 11110 + }, + { + "epoch": 3.579061720447413, + "grad_norm": 0.6147308568649027, + "learning_rate": 1.0503863490019317e-07, + "logits/chosen": -0.7005615234375, + "logits/rejected": -1.010644555091858, + "logps/chosen": -460.6000061035156, + "logps/rejected": -535.4000244140625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.445019483566284, + "rewards/margins": 11.3125, + "rewards/rejected": -14.7578125, + "step": 11120 + }, + { + "epoch": 3.5822805182264426, + "grad_norm": 0.9172559195572786, + "learning_rate": 1.042337411461687e-07, + "logits/chosen": -0.4068054258823395, + "logits/rejected": -0.6947876214981079, + "logps/chosen": -465.3999938964844, + "logps/rejected": -502.17498779296875, + "loss": 0.0179, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.566162109375, + "rewards/margins": 10.399218559265137, + "rewards/rejected": -13.959375381469727, + "step": 11130 + }, + { + "epoch": 3.585499316005472, + "grad_norm": 0.3023928712263126, + "learning_rate": 1.0342884739214423e-07, + "logits/chosen": -0.7299407720565796, + "logits/rejected": -0.918536365032196, + "logps/chosen": -475.04998779296875, + "logps/rejected": -525.1749877929688, + "loss": 0.0134, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.671679735183716, + "rewards/margins": 10.59375, + "rewards/rejected": -14.259374618530273, + "step": 11140 + }, + { + "epoch": 3.5887181137845015, + "grad_norm": 0.6405431619722088, + "learning_rate": 1.0262395363811977e-07, + "logits/chosen": -0.890820324420929, + "logits/rejected": -1.010986328125, + "logps/chosen": -438.42498779296875, + "logps/rejected": -517.6500244140625, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.170996189117432, + "rewards/margins": 10.533594131469727, + "rewards/rejected": -14.699999809265137, + "step": 11150 + }, + { + "epoch": 3.591936911563531, + "grad_norm": 0.06638082482306856, + "learning_rate": 1.0181905988409529e-07, + "logits/chosen": -0.654327392578125, + "logits/rejected": -0.8588012456893921, + "logps/chosen": -466.625, + "logps/rejected": -508.2749938964844, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.765551805496216, + "rewards/margins": 11.087499618530273, + "rewards/rejected": -14.86328125, + "step": 11160 + }, + { + "epoch": 3.595155709342561, + "grad_norm": 0.20370852491076066, + "learning_rate": 1.0101416613007083e-07, + "logits/chosen": -0.6958373785018921, + "logits/rejected": -0.896105945110321, + "logps/chosen": -437.2749938964844, + "logps/rejected": -517.3499755859375, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.051562309265137, + "rewards/margins": 10.770312309265137, + "rewards/rejected": -14.815625190734863, + "step": 11170 + }, + { + "epoch": 3.5983745071215902, + "grad_norm": 0.5860003613115581, + "learning_rate": 1.0020927237604635e-07, + "logits/chosen": -0.7583984136581421, + "logits/rejected": -0.9295898675918579, + "logps/chosen": -460.79998779296875, + "logps/rejected": -512.7249755859375, + "loss": 0.0178, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.006909370422363, + "rewards/margins": 10.9453125, + "rewards/rejected": -14.954687118530273, + "step": 11180 + }, + { + "epoch": 3.6015933049006197, + "grad_norm": 1.9777480848081723, + "learning_rate": 9.940437862202189e-08, + "logits/chosen": -0.7362701296806335, + "logits/rejected": -0.9903564453125, + "logps/chosen": -437.125, + "logps/rejected": -497.3999938964844, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.488183498382568, + "rewards/margins": 11.02734375, + "rewards/rejected": -15.509374618530273, + "step": 11190 + }, + { + "epoch": 3.604812102679649, + "grad_norm": 1.670310826783305, + "learning_rate": 9.859948486799743e-08, + "logits/chosen": -0.6460937261581421, + "logits/rejected": -0.9097045660018921, + "logps/chosen": -500.1000061035156, + "logps/rejected": -512.9749755859375, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.472754001617432, + "rewards/margins": 10.670312881469727, + "rewards/rejected": -15.1484375, + "step": 11200 + }, + { + "epoch": 3.6080309004586786, + "grad_norm": 0.35808716158746046, + "learning_rate": 9.779459111397295e-08, + "logits/chosen": -0.7669922113418579, + "logits/rejected": -0.942822277545929, + "logps/chosen": -451.75, + "logps/rejected": -521.1749877929688, + "loss": 0.0136, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.38818359375, + "rewards/margins": 10.325780868530273, + "rewards/rejected": -14.712499618530273, + "step": 11210 + }, + { + "epoch": 3.6112496982377085, + "grad_norm": 0.14101521414342913, + "learning_rate": 9.698969735994849e-08, + "logits/chosen": -0.6224365234375, + "logits/rejected": -0.863726794719696, + "logps/chosen": -478.7124938964844, + "logps/rejected": -545.7999877929688, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.020117282867432, + "rewards/margins": 10.708593368530273, + "rewards/rejected": -14.7265625, + "step": 11220 + }, + { + "epoch": 3.6144684960167375, + "grad_norm": 0.38296662523874403, + "learning_rate": 9.618480360592401e-08, + "logits/chosen": -0.5841308832168579, + "logits/rejected": -0.915759265422821, + "logps/chosen": -492.75, + "logps/rejected": -533.0, + "loss": 0.0057, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.096923828125, + "rewards/margins": 10.879687309265137, + "rewards/rejected": -14.973437309265137, + "step": 11230 + }, + { + "epoch": 3.6176872937957674, + "grad_norm": 0.6530527108372771, + "learning_rate": 9.537990985189955e-08, + "logits/chosen": -0.728747546672821, + "logits/rejected": -1.08050537109375, + "logps/chosen": -453.6625061035156, + "logps/rejected": -478.3999938964844, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.965625047683716, + "rewards/margins": 10.746874809265137, + "rewards/rejected": -14.717187881469727, + "step": 11240 + }, + { + "epoch": 3.620906091574797, + "grad_norm": 0.4687971154482075, + "learning_rate": 9.457501609787507e-08, + "logits/chosen": -0.678662121295929, + "logits/rejected": -0.9889892339706421, + "logps/chosen": -451.57501220703125, + "logps/rejected": -485.6499938964844, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3255372047424316, + "rewards/margins": 10.116406440734863, + "rewards/rejected": -13.442187309265137, + "step": 11250 + }, + { + "epoch": 3.6241248893538263, + "grad_norm": 0.6903567957466038, + "learning_rate": 9.377012234385061e-08, + "logits/chosen": -0.7318115234375, + "logits/rejected": -0.9175049066543579, + "logps/chosen": -453.4750061035156, + "logps/rejected": -527.4000244140625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.070116996765137, + "rewards/margins": 11.01171875, + "rewards/rejected": -15.089062690734863, + "step": 11260 + }, + { + "epoch": 3.627343687132856, + "grad_norm": 0.27564178151980856, + "learning_rate": 9.296522858982614e-08, + "logits/chosen": -0.7823241949081421, + "logits/rejected": -1.007714867591858, + "logps/chosen": -481.95001220703125, + "logps/rejected": -515.4500122070312, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.873828172683716, + "rewards/margins": 10.525781631469727, + "rewards/rejected": -14.401562690734863, + "step": 11270 + }, + { + "epoch": 3.630562484911885, + "grad_norm": 0.1926739922820376, + "learning_rate": 9.216033483580167e-08, + "logits/chosen": -0.628710925579071, + "logits/rejected": -0.896484375, + "logps/chosen": -467.4375, + "logps/rejected": -535.7999877929688, + "loss": 0.0135, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.397070407867432, + "rewards/margins": 11.079687118530273, + "rewards/rejected": -15.471875190734863, + "step": 11280 + }, + { + "epoch": 3.633781282690915, + "grad_norm": 0.4034571851398344, + "learning_rate": 9.13554410817772e-08, + "logits/chosen": -0.924212634563446, + "logits/rejected": -1.008886694908142, + "logps/chosen": -415.20001220703125, + "logps/rejected": -511.6499938964844, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.277734279632568, + "rewards/margins": 10.655468940734863, + "rewards/rejected": -14.9296875, + "step": 11290 + }, + { + "epoch": 3.6370000804699445, + "grad_norm": 0.25485555578458696, + "learning_rate": 9.055054732775273e-08, + "logits/chosen": -0.6569579839706421, + "logits/rejected": -0.773608386516571, + "logps/chosen": -449.2250061035156, + "logps/rejected": -525.9000244140625, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.916796922683716, + "rewards/margins": 10.762499809265137, + "rewards/rejected": -14.678125381469727, + "step": 11300 + }, + { + "epoch": 3.640218878248974, + "grad_norm": 3.5714057921189974, + "learning_rate": 8.974565357372826e-08, + "logits/chosen": -0.619335949420929, + "logits/rejected": -0.9156128168106079, + "logps/chosen": -478.25, + "logps/rejected": -516.7999877929688, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.105761528015137, + "rewards/margins": 11.063281059265137, + "rewards/rejected": -15.167187690734863, + "step": 11310 + }, + { + "epoch": 3.6434376760280034, + "grad_norm": 0.1610227385043046, + "learning_rate": 8.89407598197038e-08, + "logits/chosen": -0.6241455078125, + "logits/rejected": -0.987011730670929, + "logps/chosen": -498.4750061035156, + "logps/rejected": -537.5, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.183984279632568, + "rewards/margins": 11.225000381469727, + "rewards/rejected": -15.410937309265137, + "step": 11320 + }, + { + "epoch": 3.646656473807033, + "grad_norm": 1.6109294790338178, + "learning_rate": 8.813586606567932e-08, + "logits/chosen": -0.5247405767440796, + "logits/rejected": -0.8053222894668579, + "logps/chosen": -458.1625061035156, + "logps/rejected": -520.125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.332714796066284, + "rewards/margins": 10.758593559265137, + "rewards/rejected": -14.090624809265137, + "step": 11330 + }, + { + "epoch": 3.6498752715860627, + "grad_norm": 0.19581313875801504, + "learning_rate": 8.733097231165486e-08, + "logits/chosen": -0.7665039300918579, + "logits/rejected": -0.9317382574081421, + "logps/chosen": -430.75, + "logps/rejected": -517.9000244140625, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.120923042297363, + "rewards/margins": 10.341405868530273, + "rewards/rejected": -14.456250190734863, + "step": 11340 + }, + { + "epoch": 3.653094069365092, + "grad_norm": 0.0962783449419986, + "learning_rate": 8.652607855763038e-08, + "logits/chosen": -0.8125, + "logits/rejected": -0.9974365234375, + "logps/chosen": -439.8999938964844, + "logps/rejected": -510.0, + "loss": 0.0178, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.986132860183716, + "rewards/margins": 10.690625190734863, + "rewards/rejected": -14.675000190734863, + "step": 11350 + }, + { + "epoch": 3.6563128671441216, + "grad_norm": 0.4222554973243759, + "learning_rate": 8.572118480360592e-08, + "logits/chosen": -0.757537841796875, + "logits/rejected": -0.9686523675918579, + "logps/chosen": -470.54998779296875, + "logps/rejected": -533.5, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.921875, + "rewards/margins": 10.692187309265137, + "rewards/rejected": -14.615625381469727, + "step": 11360 + }, + { + "epoch": 3.659531664923151, + "grad_norm": 0.25036946633427215, + "learning_rate": 8.491629104958145e-08, + "logits/chosen": -0.6458740234375, + "logits/rejected": -0.8212890625, + "logps/chosen": -462.4750061035156, + "logps/rejected": -514.5750122070312, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.397656440734863, + "rewards/margins": 11.0859375, + "rewards/rejected": -15.481249809265137, + "step": 11370 + }, + { + "epoch": 3.6627504627021805, + "grad_norm": 0.38604692579646793, + "learning_rate": 8.411139729555698e-08, + "logits/chosen": -0.6679931879043579, + "logits/rejected": -0.877673327922821, + "logps/chosen": -454.3999938964844, + "logps/rejected": -522.25, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.005932807922363, + "rewards/margins": 11.0703125, + "rewards/rejected": -15.081250190734863, + "step": 11380 + }, + { + "epoch": 3.6659692604812104, + "grad_norm": 1.8580579602018494, + "learning_rate": 8.330650354153252e-08, + "logits/chosen": -0.5926452875137329, + "logits/rejected": -0.7843993902206421, + "logps/chosen": -432.13751220703125, + "logps/rejected": -500.2749938964844, + "loss": 0.01, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9976563453674316, + "rewards/margins": 10.450780868530273, + "rewards/rejected": -14.454687118530273, + "step": 11390 + }, + { + "epoch": 3.66918805826024, + "grad_norm": 0.09647937103845883, + "learning_rate": 8.250160978750804e-08, + "logits/chosen": -0.7320343255996704, + "logits/rejected": -0.909741222858429, + "logps/chosen": -467.17498779296875, + "logps/rejected": -546.6500244140625, + "loss": 0.0046, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.464160203933716, + "rewards/margins": 10.830469131469727, + "rewards/rejected": -14.296875, + "step": 11400 + }, + { + "epoch": 3.6724068560392693, + "grad_norm": 0.8152877212536223, + "learning_rate": 8.169671603348358e-08, + "logits/chosen": -0.6595093011856079, + "logits/rejected": -0.8312622308731079, + "logps/chosen": -467.4750061035156, + "logps/rejected": -536.0, + "loss": 0.0062, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.2884764671325684, + "rewards/margins": 10.982030868530273, + "rewards/rejected": -14.265625, + "step": 11410 + }, + { + "epoch": 3.6756256538182988, + "grad_norm": 0.22783688304927902, + "learning_rate": 8.08918222794591e-08, + "logits/chosen": -0.733471691608429, + "logits/rejected": -0.8945068120956421, + "logps/chosen": -437.6000061035156, + "logps/rejected": -503.0, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.997802734375, + "rewards/margins": 10.624218940734863, + "rewards/rejected": -14.625, + "step": 11420 + }, + { + "epoch": 3.678844451597328, + "grad_norm": 0.4454077080448253, + "learning_rate": 8.008692852543464e-08, + "logits/chosen": -0.895825207233429, + "logits/rejected": -1.007665991783142, + "logps/chosen": -429.0249938964844, + "logps/rejected": -504.8999938964844, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.047070503234863, + "rewards/margins": 10.939062118530273, + "rewards/rejected": -14.982812881469727, + "step": 11430 + }, + { + "epoch": 3.682063249376358, + "grad_norm": 0.16409267123570848, + "learning_rate": 7.928203477141018e-08, + "logits/chosen": -0.6405792236328125, + "logits/rejected": -0.7509490847587585, + "logps/chosen": -471.17498779296875, + "logps/rejected": -557.1500244140625, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2870116233825684, + "rewards/margins": 10.435937881469727, + "rewards/rejected": -13.725000381469727, + "step": 11440 + }, + { + "epoch": 3.6852820471553875, + "grad_norm": 0.3468326932783348, + "learning_rate": 7.84771410173857e-08, + "logits/chosen": -0.751422107219696, + "logits/rejected": -0.997607409954071, + "logps/chosen": -442.125, + "logps/rejected": -512.25, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.150586128234863, + "rewards/margins": 11.121874809265137, + "rewards/rejected": -15.276562690734863, + "step": 11450 + }, + { + "epoch": 3.688500844934417, + "grad_norm": 0.1255522924837672, + "learning_rate": 7.767224726336124e-08, + "logits/chosen": -0.7307494878768921, + "logits/rejected": -0.93353271484375, + "logps/chosen": -510.0874938964844, + "logps/rejected": -558.6500244140625, + "loss": 0.0182, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.782470703125, + "rewards/margins": 11.146875381469727, + "rewards/rejected": -15.9296875, + "step": 11460 + }, + { + "epoch": 3.6917196427134464, + "grad_norm": 0.23691450369490838, + "learning_rate": 7.686735350933676e-08, + "logits/chosen": -0.8301147222518921, + "logits/rejected": -0.9764159917831421, + "logps/chosen": -444.8999938964844, + "logps/rejected": -516.0999755859375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.562744140625, + "rewards/margins": 10.654687881469727, + "rewards/rejected": -15.215624809265137, + "step": 11470 + }, + { + "epoch": 3.694938440492476, + "grad_norm": 0.9078870607497522, + "learning_rate": 7.60624597553123e-08, + "logits/chosen": -0.6773315668106079, + "logits/rejected": -0.7084594964981079, + "logps/chosen": -422.20001220703125, + "logps/rejected": -510.25, + "loss": 0.014, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.773144483566284, + "rewards/margins": 10.314844131469727, + "rewards/rejected": -14.090624809265137, + "step": 11480 + }, + { + "epoch": 3.6981572382715058, + "grad_norm": 1.3575304537311237, + "learning_rate": 7.525756600128782e-08, + "logits/chosen": -0.7392333745956421, + "logits/rejected": -0.938647449016571, + "logps/chosen": -446.92498779296875, + "logps/rejected": -497.25, + "loss": 0.0184, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.397363185882568, + "rewards/margins": 10.58203125, + "rewards/rejected": -14.982030868530273, + "step": 11490 + }, + { + "epoch": 3.7013760360505352, + "grad_norm": 0.08873843658760057, + "learning_rate": 7.445267224726336e-08, + "logits/chosen": -0.7223175168037415, + "logits/rejected": -0.8141235113143921, + "logps/chosen": -463.875, + "logps/rejected": -553.9000244140625, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.24560546875, + "rewards/margins": 11.1328125, + "rewards/rejected": -15.371874809265137, + "step": 11500 + }, + { + "epoch": 3.7045948338295647, + "grad_norm": 0.9992273781669826, + "learning_rate": 7.364777849323889e-08, + "logits/chosen": -0.842211902141571, + "logits/rejected": -1.049414038658142, + "logps/chosen": -450.5249938964844, + "logps/rejected": -519.1500244140625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.525488376617432, + "rewards/margins": 10.837499618530273, + "rewards/rejected": -15.362500190734863, + "step": 11510 + }, + { + "epoch": 3.707813631608594, + "grad_norm": 0.1544731530430188, + "learning_rate": 7.284288473921442e-08, + "logits/chosen": -0.724682629108429, + "logits/rejected": -0.8817474246025085, + "logps/chosen": -465.17498779296875, + "logps/rejected": -516.8250122070312, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.216210842132568, + "rewards/margins": 10.296875, + "rewards/rejected": -14.506250381469727, + "step": 11520 + }, + { + "epoch": 3.7110324293876236, + "grad_norm": 0.5654570853858621, + "learning_rate": 7.203799098518995e-08, + "logits/chosen": -0.8440917730331421, + "logits/rejected": -0.89434814453125, + "logps/chosen": -416.0249938964844, + "logps/rejected": -499.95001220703125, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.9306640625, + "rewards/margins": 10.623437881469727, + "rewards/rejected": -15.556249618530273, + "step": 11530 + }, + { + "epoch": 3.7142512271666535, + "grad_norm": 2.88237662124032, + "learning_rate": 7.123309723116548e-08, + "logits/chosen": -0.849926769733429, + "logits/rejected": -1.010522484779358, + "logps/chosen": -430.7875061035156, + "logps/rejected": -516.4749755859375, + "loss": 0.0077, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.351855516433716, + "rewards/margins": 10.262499809265137, + "rewards/rejected": -13.610937118530273, + "step": 11540 + }, + { + "epoch": 3.717470024945683, + "grad_norm": 0.24407488858758494, + "learning_rate": 7.042820347714101e-08, + "logits/chosen": -0.650530993938446, + "logits/rejected": -0.817675769329071, + "logps/chosen": -481.95001220703125, + "logps/rejected": -512.2249755859375, + "loss": 0.006, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3517823219299316, + "rewards/margins": 10.293749809265137, + "rewards/rejected": -13.6484375, + "step": 11550 + }, + { + "epoch": 3.7206888227247124, + "grad_norm": 0.5786335070848263, + "learning_rate": 6.962330972311655e-08, + "logits/chosen": -0.6134033203125, + "logits/rejected": -0.8653503656387329, + "logps/chosen": -437.1000061035156, + "logps/rejected": -502.67498779296875, + "loss": 0.0046, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.551708936691284, + "rewards/margins": 10.317187309265137, + "rewards/rejected": -13.862500190734863, + "step": 11560 + }, + { + "epoch": 3.723907620503742, + "grad_norm": 0.381188702420216, + "learning_rate": 6.881841596909207e-08, + "logits/chosen": -0.7383178472518921, + "logits/rejected": -0.968212902545929, + "logps/chosen": -444.32501220703125, + "logps/rejected": -504.125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8060545921325684, + "rewards/margins": 10.410937309265137, + "rewards/rejected": -14.21875, + "step": 11570 + }, + { + "epoch": 3.7271264182827712, + "grad_norm": 0.7473372122558345, + "learning_rate": 6.801352221506761e-08, + "logits/chosen": -0.7582153081893921, + "logits/rejected": -0.9502929449081421, + "logps/chosen": -473.4750061035156, + "logps/rejected": -517.375, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8779296875, + "rewards/margins": 10.459375381469727, + "rewards/rejected": -14.342187881469727, + "step": 11580 + }, + { + "epoch": 3.730345216061801, + "grad_norm": 2.9779076254063996, + "learning_rate": 6.720862846104313e-08, + "logits/chosen": -0.6862426996231079, + "logits/rejected": -0.9185546636581421, + "logps/chosen": -455.5249938964844, + "logps/rejected": -524.6500244140625, + "loss": 0.0101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9478516578674316, + "rewards/margins": 10.913281440734863, + "rewards/rejected": -14.878125190734863, + "step": 11590 + }, + { + "epoch": 3.7335640138408306, + "grad_norm": 1.7736963401373307, + "learning_rate": 6.640373470701867e-08, + "logits/chosen": -0.616790771484375, + "logits/rejected": -0.8672851324081421, + "logps/chosen": -399.6499938964844, + "logps/rejected": -458.04998779296875, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.193749904632568, + "rewards/margins": 10.3984375, + "rewards/rejected": -14.59375, + "step": 11600 + }, + { + "epoch": 3.73678281161986, + "grad_norm": 1.3797748805181682, + "learning_rate": 6.55988409529942e-08, + "logits/chosen": -0.7790390253067017, + "logits/rejected": -0.8735992312431335, + "logps/chosen": -431.9750061035156, + "logps/rejected": -499.1499938964844, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.944628953933716, + "rewards/margins": 10.575780868530273, + "rewards/rejected": -14.509374618530273, + "step": 11610 + }, + { + "epoch": 3.7400016093988895, + "grad_norm": 0.20548201797764368, + "learning_rate": 6.479394719896973e-08, + "logits/chosen": -0.672985851764679, + "logits/rejected": -1.043432593345642, + "logps/chosen": -450.4624938964844, + "logps/rejected": -489.8500061035156, + "loss": 0.0133, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.947070360183716, + "rewards/margins": 10.512499809265137, + "rewards/rejected": -14.462499618530273, + "step": 11620 + }, + { + "epoch": 3.743220407177919, + "grad_norm": 0.2751834417745249, + "learning_rate": 6.398905344494527e-08, + "logits/chosen": -0.63262939453125, + "logits/rejected": -0.9006592035293579, + "logps/chosen": -468.75, + "logps/rejected": -566.75, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.207739353179932, + "rewards/margins": 11.797656059265137, + "rewards/rejected": -15.995312690734863, + "step": 11630 + }, + { + "epoch": 3.746439204956949, + "grad_norm": 0.9961923789945111, + "learning_rate": 6.318415969092079e-08, + "logits/chosen": -0.7405853271484375, + "logits/rejected": -0.9697021245956421, + "logps/chosen": -423.25, + "logps/rejected": -470.0249938964844, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8921141624450684, + "rewards/margins": 10.565625190734863, + "rewards/rejected": -14.459375381469727, + "step": 11640 + }, + { + "epoch": 3.7496580027359783, + "grad_norm": 0.24152149942799, + "learning_rate": 6.237926593689633e-08, + "logits/chosen": -0.747485339641571, + "logits/rejected": -0.9646942019462585, + "logps/chosen": -412.7875061035156, + "logps/rejected": -489.54998779296875, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.230029106140137, + "rewards/margins": 10.2734375, + "rewards/rejected": -14.498437881469727, + "step": 11650 + }, + { + "epoch": 3.7528768005150077, + "grad_norm": 1.8136598528269507, + "learning_rate": 6.157437218287185e-08, + "logits/chosen": -0.645263671875, + "logits/rejected": -0.8403564691543579, + "logps/chosen": -409.45001220703125, + "logps/rejected": -482.8500061035156, + "loss": 0.0138, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.880603075027466, + "rewards/margins": 10.983593940734863, + "rewards/rejected": -14.8671875, + "step": 11660 + }, + { + "epoch": 3.756095598294037, + "grad_norm": 0.2138402240726892, + "learning_rate": 6.076947842884739e-08, + "logits/chosen": -0.730151355266571, + "logits/rejected": -0.9354919195175171, + "logps/chosen": -402.25, + "logps/rejected": -481.8999938964844, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.035937309265137, + "rewards/margins": 10.678906440734863, + "rewards/rejected": -14.721875190734863, + "step": 11670 + }, + { + "epoch": 3.7593143960730666, + "grad_norm": 0.7517417557820429, + "learning_rate": 5.996458467482292e-08, + "logits/chosen": -0.7029784917831421, + "logits/rejected": -0.906811535358429, + "logps/chosen": -438.5874938964844, + "logps/rejected": -522.5750122070312, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.880908250808716, + "rewards/margins": 10.998437881469727, + "rewards/rejected": -14.875, + "step": 11680 + }, + { + "epoch": 3.7625331938520965, + "grad_norm": 0.11010541367510819, + "learning_rate": 5.915969092079845e-08, + "logits/chosen": -0.782788097858429, + "logits/rejected": -0.9260498285293579, + "logps/chosen": -425.1000061035156, + "logps/rejected": -486.79998779296875, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6399903297424316, + "rewards/margins": 10.404687881469727, + "rewards/rejected": -14.042187690734863, + "step": 11690 + }, + { + "epoch": 3.7657519916311255, + "grad_norm": 0.5921528914613254, + "learning_rate": 5.835479716677398e-08, + "logits/chosen": -0.7998412847518921, + "logits/rejected": -1.0155518054962158, + "logps/chosen": -454.9750061035156, + "logps/rejected": -503.0, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9867186546325684, + "rewards/margins": 10.612500190734863, + "rewards/rejected": -14.600000381469727, + "step": 11700 + }, + { + "epoch": 3.7689707894101554, + "grad_norm": 0.2373372732925005, + "learning_rate": 5.7549903412749515e-08, + "logits/chosen": -0.7930663824081421, + "logits/rejected": -0.985595703125, + "logps/chosen": -456.125, + "logps/rejected": -492.875, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.284399509429932, + "rewards/margins": 10.584375381469727, + "rewards/rejected": -14.875781059265137, + "step": 11710 + }, + { + "epoch": 3.772189587189185, + "grad_norm": 0.21167627992788088, + "learning_rate": 5.6745009658725045e-08, + "logits/chosen": -0.6441284418106079, + "logits/rejected": -0.966992199420929, + "logps/chosen": -464.8500061035156, + "logps/rejected": -484.25, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.239843845367432, + "rewards/margins": 10.567187309265137, + "rewards/rejected": -14.807812690734863, + "step": 11720 + }, + { + "epoch": 3.7754083849682143, + "grad_norm": 0.4449667181920484, + "learning_rate": 5.5940115904700575e-08, + "logits/chosen": -0.749591052532196, + "logits/rejected": -0.943798840045929, + "logps/chosen": -430.125, + "logps/rejected": -496.25, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.053735256195068, + "rewards/margins": 10.686718940734863, + "rewards/rejected": -14.735937118530273, + "step": 11730 + }, + { + "epoch": 3.7786271827472437, + "grad_norm": 0.38760274313801096, + "learning_rate": 5.5135222150676105e-08, + "logits/chosen": -0.704968273639679, + "logits/rejected": -0.910815417766571, + "logps/chosen": -471.42498779296875, + "logps/rejected": -504.8999938964844, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.324804782867432, + "rewards/margins": 10.469531059265137, + "rewards/rejected": -14.795312881469727, + "step": 11740 + }, + { + "epoch": 3.781845980526273, + "grad_norm": 0.2694130651540435, + "learning_rate": 5.4330328396651635e-08, + "logits/chosen": -0.728955090045929, + "logits/rejected": -0.81463623046875, + "logps/chosen": -456.9750061035156, + "logps/rejected": -534.7000122070312, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.934277296066284, + "rewards/margins": 10.62109375, + "rewards/rejected": -14.551562309265137, + "step": 11750 + }, + { + "epoch": 3.785064778305303, + "grad_norm": 0.9120809632728631, + "learning_rate": 5.3525434642627165e-08, + "logits/chosen": -0.6722087860107422, + "logits/rejected": -0.8260132074356079, + "logps/chosen": -454.25, + "logps/rejected": -520.9000244140625, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.737011671066284, + "rewards/margins": 10.303906440734863, + "rewards/rejected": -14.035937309265137, + "step": 11760 + }, + { + "epoch": 3.7882835760843325, + "grad_norm": 0.7795093966254877, + "learning_rate": 5.27205408886027e-08, + "logits/chosen": -0.6629394292831421, + "logits/rejected": -0.763214111328125, + "logps/chosen": -427.07501220703125, + "logps/rejected": -515.875, + "loss": 0.0137, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.217382907867432, + "rewards/margins": 10.446874618530273, + "rewards/rejected": -14.670312881469727, + "step": 11770 + }, + { + "epoch": 3.791502373863362, + "grad_norm": 0.6112072753622798, + "learning_rate": 5.191564713457823e-08, + "logits/chosen": -0.675646960735321, + "logits/rejected": -1.0326659679412842, + "logps/chosen": -459.1000061035156, + "logps/rejected": -480.57501220703125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.04443359375, + "rewards/margins": 10.690625190734863, + "rewards/rejected": -14.731249809265137, + "step": 11780 + }, + { + "epoch": 3.7947211716423914, + "grad_norm": 1.0021495743193611, + "learning_rate": 5.111075338055376e-08, + "logits/chosen": -0.6770232915878296, + "logits/rejected": -0.9114013910293579, + "logps/chosen": -457.8999938964844, + "logps/rejected": -527.7000122070312, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.7158203125, + "rewards/margins": 10.946093559265137, + "rewards/rejected": -14.663281440734863, + "step": 11790 + }, + { + "epoch": 3.797939969421421, + "grad_norm": 0.09719542488307724, + "learning_rate": 5.030585962652929e-08, + "logits/chosen": -0.8030761480331421, + "logits/rejected": -0.92919921875, + "logps/chosen": -403.20001220703125, + "logps/rejected": -498.0, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.998046875, + "rewards/margins": 10.58203125, + "rewards/rejected": -14.581250190734863, + "step": 11800 + }, + { + "epoch": 3.8011587672004508, + "grad_norm": 0.20193064318528825, + "learning_rate": 4.950096587250482e-08, + "logits/chosen": -0.7008163332939148, + "logits/rejected": -1.020721435546875, + "logps/chosen": -479.2749938964844, + "logps/rejected": -479.8500061035156, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.903210401535034, + "rewards/margins": 11.060937881469727, + "rewards/rejected": -14.96875, + "step": 11810 + }, + { + "epoch": 3.80437756497948, + "grad_norm": 0.34413899764846917, + "learning_rate": 4.869607211848035e-08, + "logits/chosen": -0.608044445514679, + "logits/rejected": -0.8050903081893921, + "logps/chosen": -512.8250122070312, + "logps/rejected": -541.2000122070312, + "loss": 0.0136, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.9408202171325684, + "rewards/margins": 10.573437690734863, + "rewards/rejected": -14.520312309265137, + "step": 11820 + }, + { + "epoch": 3.8075963627585097, + "grad_norm": 0.28871637592995564, + "learning_rate": 4.789117836445589e-08, + "logits/chosen": -0.825268566608429, + "logits/rejected": -1.063207983970642, + "logps/chosen": -430.1499938964844, + "logps/rejected": -470.8999938964844, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9468140602111816, + "rewards/margins": 10.390625, + "rewards/rejected": -14.340624809265137, + "step": 11830 + }, + { + "epoch": 3.810815160537539, + "grad_norm": 0.6048401733035009, + "learning_rate": 4.708628461043142e-08, + "logits/chosen": -0.8797851800918579, + "logits/rejected": -0.9517272710800171, + "logps/chosen": -407.2250061035156, + "logps/rejected": -507.79998779296875, + "loss": 0.0096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9375977516174316, + "rewards/margins": 10.914843559265137, + "rewards/rejected": -14.840624809265137, + "step": 11840 + }, + { + "epoch": 3.8140339583165686, + "grad_norm": 0.7947630491034547, + "learning_rate": 4.628139085640695e-08, + "logits/chosen": -0.759814441204071, + "logits/rejected": -0.9169555902481079, + "logps/chosen": -455.45001220703125, + "logps/rejected": -524.2000122070312, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.8612303733825684, + "rewards/margins": 10.73046875, + "rewards/rejected": -14.596875190734863, + "step": 11850 + }, + { + "epoch": 3.8172527560955984, + "grad_norm": 0.6478512784765477, + "learning_rate": 4.547649710238248e-08, + "logits/chosen": -0.7718261480331421, + "logits/rejected": -0.9376586675643921, + "logps/chosen": -457.92498779296875, + "logps/rejected": -509.125, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.888378858566284, + "rewards/margins": 10.415624618530273, + "rewards/rejected": -14.295312881469727, + "step": 11860 + }, + { + "epoch": 3.820471553874628, + "grad_norm": 1.6967801989450426, + "learning_rate": 4.467160334835801e-08, + "logits/chosen": -0.758648693561554, + "logits/rejected": -0.9137328863143921, + "logps/chosen": -457.2749938964844, + "logps/rejected": -544.1749877929688, + "loss": 0.0134, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.627636671066284, + "rewards/margins": 10.845312118530273, + "rewards/rejected": -14.4765625, + "step": 11870 + }, + { + "epoch": 3.8236903516536573, + "grad_norm": 0.4976164006375748, + "learning_rate": 4.3866709594333547e-08, + "logits/chosen": -0.48345947265625, + "logits/rejected": -0.804003894329071, + "logps/chosen": -415.20001220703125, + "logps/rejected": -472.1000061035156, + "loss": 0.0067, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.580078125, + "rewards/margins": 10.258593559265137, + "rewards/rejected": -13.842187881469727, + "step": 11880 + }, + { + "epoch": 3.826909149432687, + "grad_norm": 0.11002456012807393, + "learning_rate": 4.306181584030908e-08, + "logits/chosen": -0.6572479009628296, + "logits/rejected": -0.8954223394393921, + "logps/chosen": -420.29998779296875, + "logps/rejected": -499.1499938964844, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.611328125, + "rewards/margins": 10.779687881469727, + "rewards/rejected": -14.3828125, + "step": 11890 + }, + { + "epoch": 3.8301279472117162, + "grad_norm": 0.3291106413472417, + "learning_rate": 4.2256922086284613e-08, + "logits/chosen": -0.6888427734375, + "logits/rejected": -0.8753417730331421, + "logps/chosen": -410.8999938964844, + "logps/rejected": -477.70001220703125, + "loss": 0.0135, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.562695264816284, + "rewards/margins": 10.780468940734863, + "rewards/rejected": -14.342187881469727, + "step": 11900 + }, + { + "epoch": 3.833346744990746, + "grad_norm": 0.2800236371220961, + "learning_rate": 4.1452028332260143e-08, + "logits/chosen": -0.8084777593612671, + "logits/rejected": -0.9374023675918579, + "logps/chosen": -403.3500061035156, + "logps/rejected": -488.29998779296875, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.858593702316284, + "rewards/margins": 10.7109375, + "rewards/rejected": -14.565625190734863, + "step": 11910 + }, + { + "epoch": 3.8365655427697756, + "grad_norm": 0.8772449809560939, + "learning_rate": 4.0647134578235674e-08, + "logits/chosen": -0.5408874750137329, + "logits/rejected": -0.7589111328125, + "logps/chosen": -419.29998779296875, + "logps/rejected": -490.5, + "loss": 0.0168, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.019629001617432, + "rewards/margins": 10.161718368530273, + "rewards/rejected": -14.1796875, + "step": 11920 + }, + { + "epoch": 3.839784340548805, + "grad_norm": 0.34950685392288455, + "learning_rate": 3.9842240824211204e-08, + "logits/chosen": -0.713818371295929, + "logits/rejected": -0.93896484375, + "logps/chosen": -450.4750061035156, + "logps/rejected": -526.7000122070312, + "loss": 0.0137, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.515722751617432, + "rewards/margins": 10.966405868530273, + "rewards/rejected": -15.490625381469727, + "step": 11930 + }, + { + "epoch": 3.8430031383278345, + "grad_norm": 0.649105292456249, + "learning_rate": 3.9037347070186734e-08, + "logits/chosen": -0.65667724609375, + "logits/rejected": -0.851025402545929, + "logps/chosen": -445.32501220703125, + "logps/rejected": -523.7000122070312, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.993212938308716, + "rewards/margins": 10.596875190734863, + "rewards/rejected": -14.582812309265137, + "step": 11940 + }, + { + "epoch": 3.846221936106864, + "grad_norm": 0.2661500253685774, + "learning_rate": 3.823245331616227e-08, + "logits/chosen": -0.8007446527481079, + "logits/rejected": -1.1084473133087158, + "logps/chosen": -424.45001220703125, + "logps/rejected": -491.1000061035156, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9124512672424316, + "rewards/margins": 10.366406440734863, + "rewards/rejected": -14.290624618530273, + "step": 11950 + }, + { + "epoch": 3.849440733885894, + "grad_norm": 1.9952998448129016, + "learning_rate": 3.74275595621378e-08, + "logits/chosen": -0.754229724407196, + "logits/rejected": -0.90057373046875, + "logps/chosen": -416.7250061035156, + "logps/rejected": -491.25, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.94677734375, + "rewards/margins": 10.302343368530273, + "rewards/rejected": -14.254687309265137, + "step": 11960 + }, + { + "epoch": 3.8526595316649233, + "grad_norm": 12.896976438988238, + "learning_rate": 3.662266580811333e-08, + "logits/chosen": -0.8008788824081421, + "logits/rejected": -0.9272063970565796, + "logps/chosen": -399.375, + "logps/rejected": -481.32501220703125, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9749999046325684, + "rewards/margins": 10.725781440734863, + "rewards/rejected": -14.699999809265137, + "step": 11970 + }, + { + "epoch": 3.8558783294439527, + "grad_norm": 0.409995084558386, + "learning_rate": 3.581777205408886e-08, + "logits/chosen": -0.67816162109375, + "logits/rejected": -0.9380859136581421, + "logps/chosen": -478.29998779296875, + "logps/rejected": -526.6500244140625, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.016650199890137, + "rewards/margins": 11.073437690734863, + "rewards/rejected": -15.095312118530273, + "step": 11980 + }, + { + "epoch": 3.859097127222982, + "grad_norm": 1.0759404593527118, + "learning_rate": 3.501287830006439e-08, + "logits/chosen": -0.845288097858429, + "logits/rejected": -1.0594971179962158, + "logps/chosen": -441.375, + "logps/rejected": -517.1500244140625, + "loss": 0.007, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.328711032867432, + "rewards/margins": 10.5859375, + "rewards/rejected": -14.910937309265137, + "step": 11990 + }, + { + "epoch": 3.8623159250020116, + "grad_norm": 0.19812526894693516, + "learning_rate": 3.420798454603992e-08, + "logits/chosen": -0.563977062702179, + "logits/rejected": -0.8571411371231079, + "logps/chosen": -456.3999938964844, + "logps/rejected": -484.45001220703125, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.759765625, + "rewards/margins": 10.936718940734863, + "rewards/rejected": -14.704687118530273, + "step": 12000 + }, + { + "epoch": 3.8655347227810415, + "grad_norm": 0.2019575119756879, + "learning_rate": 3.340309079201546e-08, + "logits/chosen": -0.7392944097518921, + "logits/rejected": -0.8964203000068665, + "logps/chosen": -448.45001220703125, + "logps/rejected": -527.9500122070312, + "loss": 0.0145, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.625293016433716, + "rewards/margins": 10.904687881469727, + "rewards/rejected": -14.532812118530273, + "step": 12010 + }, + { + "epoch": 3.868753520560071, + "grad_norm": 0.7044079075696652, + "learning_rate": 3.259819703799099e-08, + "logits/chosen": -0.8149963617324829, + "logits/rejected": -0.8679443597793579, + "logps/chosen": -397.36248779296875, + "logps/rejected": -501.375, + "loss": 0.0052, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.203027248382568, + "rewards/margins": 10.83203125, + "rewards/rejected": -15.03125, + "step": 12020 + }, + { + "epoch": 3.8719723183391004, + "grad_norm": 0.23609984711611895, + "learning_rate": 3.179330328396652e-08, + "logits/chosen": -0.7355407476425171, + "logits/rejected": -1.018530249595642, + "logps/chosen": -441.0, + "logps/rejected": -510.20001220703125, + "loss": 0.0046, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.185156345367432, + "rewards/margins": 11.367968559265137, + "rewards/rejected": -15.553125381469727, + "step": 12030 + }, + { + "epoch": 3.87519111611813, + "grad_norm": 0.324653574836781, + "learning_rate": 3.098840952994205e-08, + "logits/chosen": -0.7647460699081421, + "logits/rejected": -1.0397460460662842, + "logps/chosen": -469.0, + "logps/rejected": -493.7749938964844, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.837207078933716, + "rewards/margins": 10.600781440734863, + "rewards/rejected": -14.439062118530273, + "step": 12040 + }, + { + "epoch": 3.8784099138971593, + "grad_norm": 0.08804943131896459, + "learning_rate": 3.018351577591758e-08, + "logits/chosen": -0.5939376950263977, + "logits/rejected": -0.8805176019668579, + "logps/chosen": -469.9750061035156, + "logps/rejected": -509.25, + "loss": 0.0161, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.234375, + "rewards/margins": 10.828125, + "rewards/rejected": -15.0546875, + "step": 12050 + }, + { + "epoch": 3.881628711676189, + "grad_norm": 0.2853161275695444, + "learning_rate": 2.9378622021893108e-08, + "logits/chosen": -0.678295910358429, + "logits/rejected": -0.9691192507743835, + "logps/chosen": -440.4750061035156, + "logps/rejected": -489.6000061035156, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.180639743804932, + "rewards/margins": 10.317968368530273, + "rewards/rejected": -14.501562118530273, + "step": 12060 + }, + { + "epoch": 3.8848475094552186, + "grad_norm": 3.40338036826059, + "learning_rate": 2.8573728267868638e-08, + "logits/chosen": -0.8391357660293579, + "logits/rejected": -1.026635766029358, + "logps/chosen": -461.125, + "logps/rejected": -512.75, + "loss": 0.0055, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.706860303878784, + "rewards/margins": 10.614843368530273, + "rewards/rejected": -14.314844131469727, + "step": 12070 + }, + { + "epoch": 3.888066307234248, + "grad_norm": 0.2604911007298673, + "learning_rate": 2.776883451384417e-08, + "logits/chosen": -0.742785632610321, + "logits/rejected": -0.8861846923828125, + "logps/chosen": -465.0, + "logps/rejected": -568.7999877929688, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9853515625, + "rewards/margins": 11.265625, + "rewards/rejected": -15.246874809265137, + "step": 12080 + }, + { + "epoch": 3.8912851050132775, + "grad_norm": 0.5248867208883365, + "learning_rate": 2.6963940759819702e-08, + "logits/chosen": -0.627520740032196, + "logits/rejected": -0.9205688238143921, + "logps/chosen": -422.8125, + "logps/rejected": -489.70001220703125, + "loss": 0.0138, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.493603467941284, + "rewards/margins": 10.454687118530273, + "rewards/rejected": -13.9453125, + "step": 12090 + }, + { + "epoch": 3.894503902792307, + "grad_norm": 0.383904302288239, + "learning_rate": 2.6159047005795232e-08, + "logits/chosen": -0.6684631109237671, + "logits/rejected": -0.960888683795929, + "logps/chosen": -447.82501220703125, + "logps/rejected": -497.8999938964844, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.640820264816284, + "rewards/margins": 10.584375381469727, + "rewards/rejected": -14.2265625, + "step": 12100 + }, + { + "epoch": 3.897722700571337, + "grad_norm": 0.5882854110800377, + "learning_rate": 2.5354153251770762e-08, + "logits/chosen": -0.7506958246231079, + "logits/rejected": -0.885668933391571, + "logps/chosen": -452.04998779296875, + "logps/rejected": -531.4500122070312, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.1607666015625, + "rewards/margins": 10.540624618530273, + "rewards/rejected": -14.703125, + "step": 12110 + }, + { + "epoch": 3.9009414983503663, + "grad_norm": 0.7199135040697383, + "learning_rate": 2.4549259497746295e-08, + "logits/chosen": -0.7565063238143921, + "logits/rejected": -0.9015258550643921, + "logps/chosen": -424.32501220703125, + "logps/rejected": -522.6500244140625, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.93408203125, + "rewards/margins": 10.557812690734863, + "rewards/rejected": -14.489062309265137, + "step": 12120 + }, + { + "epoch": 3.9041602961293957, + "grad_norm": 0.06091230133451961, + "learning_rate": 2.3744365743721826e-08, + "logits/chosen": -0.6497436761856079, + "logits/rejected": -0.9020019769668579, + "logps/chosen": -487.6499938964844, + "logps/rejected": -528.5999755859375, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.904101610183716, + "rewards/margins": 10.853906631469727, + "rewards/rejected": -14.751562118530273, + "step": 12130 + }, + { + "epoch": 3.907379093908425, + "grad_norm": 0.2148462228014061, + "learning_rate": 2.2939471989697356e-08, + "logits/chosen": -0.6794067621231079, + "logits/rejected": -0.91143798828125, + "logps/chosen": -441.1000061035156, + "logps/rejected": -517.625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6201171875, + "rewards/margins": 10.672656059265137, + "rewards/rejected": -14.295312881469727, + "step": 12140 + }, + { + "epoch": 3.9105978916874546, + "grad_norm": 0.06043875368673618, + "learning_rate": 2.2134578235672892e-08, + "logits/chosen": -0.5641326904296875, + "logits/rejected": -0.8553832769393921, + "logps/chosen": -418.5249938964844, + "logps/rejected": -487.70001220703125, + "loss": 0.0229, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.681884765625, + "rewards/margins": 10.839062690734863, + "rewards/rejected": -14.526562690734863, + "step": 12150 + }, + { + "epoch": 3.9138166894664845, + "grad_norm": 0.3653476372978921, + "learning_rate": 2.1329684481648422e-08, + "logits/chosen": -0.6447082757949829, + "logits/rejected": -0.8944336175918579, + "logps/chosen": -469.625, + "logps/rejected": -520.7750244140625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.580273389816284, + "rewards/margins": 10.643750190734863, + "rewards/rejected": -14.231249809265137, + "step": 12160 + }, + { + "epoch": 3.9170354872455135, + "grad_norm": 0.4850497438349994, + "learning_rate": 2.0524790727623956e-08, + "logits/chosen": -0.77313232421875, + "logits/rejected": -1.0208008289337158, + "logps/chosen": -449.95001220703125, + "logps/rejected": -479.92498779296875, + "loss": 0.0067, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.544189453125, + "rewards/margins": 10.713281631469727, + "rewards/rejected": -14.260937690734863, + "step": 12170 + }, + { + "epoch": 3.9202542850245434, + "grad_norm": 0.20137967738604307, + "learning_rate": 1.9719896973599486e-08, + "logits/chosen": -0.7747802734375, + "logits/rejected": -0.9181884527206421, + "logps/chosen": -471.6499938964844, + "logps/rejected": -542.2000122070312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.149804592132568, + "rewards/margins": 10.957812309265137, + "rewards/rejected": -15.106249809265137, + "step": 12180 + }, + { + "epoch": 3.923473082803573, + "grad_norm": 0.4652705626499597, + "learning_rate": 1.8915003219575016e-08, + "logits/chosen": -0.7914794683456421, + "logits/rejected": -0.949389636516571, + "logps/chosen": -467.42498779296875, + "logps/rejected": -527.0999755859375, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.363867282867432, + "rewards/margins": 10.60546875, + "rewards/rejected": -14.9609375, + "step": 12190 + }, + { + "epoch": 3.9266918805826023, + "grad_norm": 0.21163771300229609, + "learning_rate": 1.811010946555055e-08, + "logits/chosen": -0.7585083246231079, + "logits/rejected": -0.968945324420929, + "logps/chosen": -470.0625, + "logps/rejected": -541.4500122070312, + "loss": 0.0047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.849316358566284, + "rewards/margins": 11.307812690734863, + "rewards/rejected": -15.154687881469727, + "step": 12200 + }, + { + "epoch": 3.9299106783616318, + "grad_norm": 2.8669967990848693, + "learning_rate": 1.730521571152608e-08, + "logits/chosen": -0.859912097454071, + "logits/rejected": -1.0270507335662842, + "logps/chosen": -480.0, + "logps/rejected": -508.6499938964844, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.145703315734863, + "rewards/margins": 10.720312118530273, + "rewards/rejected": -14.868749618530273, + "step": 12210 + }, + { + "epoch": 3.933129476140661, + "grad_norm": 0.6643025808181512, + "learning_rate": 1.650032195750161e-08, + "logits/chosen": -0.7503875494003296, + "logits/rejected": -0.9901153445243835, + "logps/chosen": -445.32501220703125, + "logps/rejected": -482.6499938964844, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.868408203125, + "rewards/margins": 10.474218368530273, + "rewards/rejected": -14.342187881469727, + "step": 12220 + }, + { + "epoch": 3.936348273919691, + "grad_norm": 0.5235136340187281, + "learning_rate": 1.5695428203477143e-08, + "logits/chosen": -0.6179565191268921, + "logits/rejected": -0.8008667230606079, + "logps/chosen": -468.07501220703125, + "logps/rejected": -549.875, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.388671875, + "rewards/margins": 10.733593940734863, + "rewards/rejected": -14.114062309265137, + "step": 12230 + }, + { + "epoch": 3.9395670716987206, + "grad_norm": 0.2988177322544238, + "learning_rate": 1.4890534449452672e-08, + "logits/chosen": -0.6802734136581421, + "logits/rejected": -0.9537597894668579, + "logps/chosen": -459.25, + "logps/rejected": -516.7999877929688, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.3453125953674316, + "rewards/margins": 11.559374809265137, + "rewards/rejected": -14.899999618530273, + "step": 12240 + }, + { + "epoch": 3.94278586947775, + "grad_norm": 0.700105123964485, + "learning_rate": 1.4085640695428202e-08, + "logits/chosen": -0.764605700969696, + "logits/rejected": -0.88031005859375, + "logps/chosen": -405.54998779296875, + "logps/rejected": -489.6000061035156, + "loss": 0.0052, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.6160645484924316, + "rewards/margins": 9.961718559265137, + "rewards/rejected": -13.571874618530273, + "step": 12250 + }, + { + "epoch": 3.9460046672567795, + "grad_norm": 0.33874430223673163, + "learning_rate": 1.3280746941403733e-08, + "logits/chosen": -0.8258911371231079, + "logits/rejected": -0.962451159954071, + "logps/chosen": -424.76251220703125, + "logps/rejected": -507.54998779296875, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.117383003234863, + "rewards/margins": 9.963281631469727, + "rewards/rejected": -14.081250190734863, + "step": 12260 + }, + { + "epoch": 3.949223465035809, + "grad_norm": 0.044402039808682736, + "learning_rate": 1.2475853187379265e-08, + "logits/chosen": -0.798657238483429, + "logits/rejected": -0.9153686761856079, + "logps/chosen": -450.2749938964844, + "logps/rejected": -521.4000244140625, + "loss": 0.0099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.303124904632568, + "rewards/margins": 10.75, + "rewards/rejected": -15.0625, + "step": 12270 + }, + { + "epoch": 3.952442262814839, + "grad_norm": 1.6190597046899349, + "learning_rate": 1.1670959433354795e-08, + "logits/chosen": -0.8603760004043579, + "logits/rejected": -0.99871826171875, + "logps/chosen": -458.3999938964844, + "logps/rejected": -528.7000122070312, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.70849609375, + "rewards/margins": 10.248437881469727, + "rewards/rejected": -14.956250190734863, + "step": 12280 + }, + { + "epoch": 3.9556610605938682, + "grad_norm": 0.238964024118089, + "learning_rate": 1.0866065679330329e-08, + "logits/chosen": -0.8295043706893921, + "logits/rejected": -1.031225562095642, + "logps/chosen": -420.0249938964844, + "logps/rejected": -491.3500061035156, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.049560546875, + "rewards/margins": 10.606249809265137, + "rewards/rejected": -14.659375190734863, + "step": 12290 + }, + { + "epoch": 3.9588798583728977, + "grad_norm": 0.8038065156963223, + "learning_rate": 1.006117192530586e-08, + "logits/chosen": -0.7170654535293579, + "logits/rejected": -0.965624988079071, + "logps/chosen": -455.4375, + "logps/rejected": -502.3500061035156, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.000097751617432, + "rewards/margins": 10.903124809265137, + "rewards/rejected": -14.903124809265137, + "step": 12300 + }, + { + "epoch": 3.962098656151927, + "grad_norm": 0.09621658997338625, + "learning_rate": 9.25627817128139e-09, + "logits/chosen": -0.659912109375, + "logits/rejected": -0.9289306402206421, + "logps/chosen": -474.32501220703125, + "logps/rejected": -521.3499755859375, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.8331055641174316, + "rewards/margins": 10.407031059265137, + "rewards/rejected": -14.246874809265137, + "step": 12310 + }, + { + "epoch": 3.9653174539309566, + "grad_norm": 1.0267827260389686, + "learning_rate": 8.451384417256922e-09, + "logits/chosen": -0.639874279499054, + "logits/rejected": -0.9310577511787415, + "logps/chosen": -492.5249938964844, + "logps/rejected": -533.5, + "loss": 0.0058, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.8958983421325684, + "rewards/margins": 10.837499618530273, + "rewards/rejected": -14.732812881469727, + "step": 12320 + }, + { + "epoch": 3.9685362517099865, + "grad_norm": 1.479328256143731, + "learning_rate": 7.646490663232452e-09, + "logits/chosen": -0.6932128667831421, + "logits/rejected": -0.9504150152206421, + "logps/chosen": -462.45001220703125, + "logps/rejected": -551.5, + "loss": 0.0051, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.206250190734863, + "rewards/margins": 10.685155868530273, + "rewards/rejected": -14.876562118530273, + "step": 12330 + }, + { + "epoch": 3.971755049489016, + "grad_norm": 1.8073706936419467, + "learning_rate": 6.841596909207984e-09, + "logits/chosen": -0.781909167766571, + "logits/rejected": -0.8913818597793579, + "logps/chosen": -453.3999938964844, + "logps/rejected": -501.2250061035156, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.959277391433716, + "rewards/margins": 10.481249809265137, + "rewards/rejected": -14.442187309265137, + "step": 12340 + }, + { + "epoch": 3.9749738472680454, + "grad_norm": 0.22141330134455114, + "learning_rate": 6.036703155183515e-09, + "logits/chosen": -0.8670684695243835, + "logits/rejected": -1.0908203125, + "logps/chosen": -442.45001220703125, + "logps/rejected": -512.9000244140625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.492382764816284, + "rewards/margins": 10.767969131469727, + "rewards/rejected": -14.262499809265137, + "step": 12350 + }, + { + "epoch": 3.978192645047075, + "grad_norm": 0.5926228009653782, + "learning_rate": 5.231809401159047e-09, + "logits/chosen": -0.664642333984375, + "logits/rejected": -0.8946777582168579, + "logps/chosen": -489.625, + "logps/rejected": -526.5999755859375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.284960746765137, + "rewards/margins": 11.353124618530273, + "rewards/rejected": -15.628125190734863, + "step": 12360 + }, + { + "epoch": 3.9814114428261043, + "grad_norm": 0.6692721421790224, + "learning_rate": 4.426915647134579e-09, + "logits/chosen": -0.7732909917831421, + "logits/rejected": -0.956408679485321, + "logps/chosen": -419.20001220703125, + "logps/rejected": -485.0249938964844, + "loss": 0.0058, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.996630907058716, + "rewards/margins": 10.134374618530273, + "rewards/rejected": -14.126562118530273, + "step": 12370 + }, + { + "epoch": 3.984630240605134, + "grad_norm": 0.19719763158846174, + "learning_rate": 3.622021893110109e-09, + "logits/chosen": -0.6405273675918579, + "logits/rejected": -0.9609619379043579, + "logps/chosen": -481.3500061035156, + "logps/rejected": -513.25, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.794726610183716, + "rewards/margins": 10.788281440734863, + "rewards/rejected": -14.581250190734863, + "step": 12380 + }, + { + "epoch": 3.9878490383841636, + "grad_norm": 1.3881642173325834, + "learning_rate": 2.8171281390856405e-09, + "logits/chosen": -0.8176635503768921, + "logits/rejected": -1.048437476158142, + "logps/chosen": -453.8999938964844, + "logps/rejected": -511.25, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.058886528015137, + "rewards/margins": 10.747655868530273, + "rewards/rejected": -14.803125381469727, + "step": 12390 + }, + { + "epoch": 3.991067836163193, + "grad_norm": 0.890510541904273, + "learning_rate": 2.012234385061172e-09, + "logits/chosen": -0.7579101324081421, + "logits/rejected": -0.9278564453125, + "logps/chosen": -418.4750061035156, + "logps/rejected": -503.3999938964844, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.050585746765137, + "rewards/margins": 10.310155868530273, + "rewards/rejected": -14.359375, + "step": 12400 + }, + { + "epoch": 3.9942866339422225, + "grad_norm": 1.1376647402311773, + "learning_rate": 1.2073406310367032e-09, + "logits/chosen": -0.7470977902412415, + "logits/rejected": -0.9676269292831421, + "logps/chosen": -426.5249938964844, + "logps/rejected": -515.2999877929688, + "loss": 0.0049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.9419922828674316, + "rewards/margins": 10.850000381469727, + "rewards/rejected": -14.78515625, + "step": 12410 + }, + { + "epoch": 3.997505431721252, + "grad_norm": 0.1545311575204133, + "learning_rate": 4.0244687701223433e-10, + "logits/chosen": -0.68475341796875, + "logits/rejected": -0.96044921875, + "logps/chosen": -469.6499938964844, + "logps/rejected": -502.0, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5442872047424316, + "rewards/margins": 10.409375190734863, + "rewards/rejected": -13.957812309265137, + "step": 12420 + } + ], + "logging_steps": 10, + "max_steps": 12424, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}