PEFT
Safetensors
MTIPA-7B-PositionTask / trainer_state.json
LLMMINE's picture
Upload 14 files
5dca9e7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.999656605198997,
"eval_steps": 1,
"global_step": 218400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004578597346702838,
"grad_norm": 1.834977626800537,
"learning_rate": 4.578754578754579e-07,
"loss": 0.7024,
"step": 100
},
{
"epoch": 0.009157194693405675,
"grad_norm": 1.148632287979126,
"learning_rate": 9.157509157509158e-07,
"loss": 0.6769,
"step": 200
},
{
"epoch": 0.013735792040108512,
"grad_norm": 0.9439795613288879,
"learning_rate": 1.3736263736263736e-06,
"loss": 0.6151,
"step": 300
},
{
"epoch": 0.01831438938681135,
"grad_norm": 0.7975717186927795,
"learning_rate": 1.8315018315018316e-06,
"loss": 0.5009,
"step": 400
},
{
"epoch": 0.022892986733514187,
"grad_norm": 0.6828369498252869,
"learning_rate": 2.2893772893772894e-06,
"loss": 0.4383,
"step": 500
},
{
"epoch": 0.027471584080217024,
"grad_norm": 0.7334387302398682,
"learning_rate": 2.747252747252747e-06,
"loss": 0.3948,
"step": 600
},
{
"epoch": 0.032050181426919865,
"grad_norm": 0.5922483205795288,
"learning_rate": 3.205128205128205e-06,
"loss": 0.3682,
"step": 700
},
{
"epoch": 0.0366287787736227,
"grad_norm": 0.42683616280555725,
"learning_rate": 3.663003663003663e-06,
"loss": 0.3471,
"step": 800
},
{
"epoch": 0.04120737612032554,
"grad_norm": 0.798675537109375,
"learning_rate": 4.120879120879121e-06,
"loss": 0.3391,
"step": 900
},
{
"epoch": 0.045785973467028375,
"grad_norm": 0.6987248659133911,
"learning_rate": 4.578754578754579e-06,
"loss": 0.3213,
"step": 1000
},
{
"epoch": 0.05036457081373121,
"grad_norm": 0.697691798210144,
"learning_rate": 5.036630036630037e-06,
"loss": 0.3021,
"step": 1100
},
{
"epoch": 0.05494316816043405,
"grad_norm": 1.0107570886611938,
"learning_rate": 5.494505494505494e-06,
"loss": 0.2854,
"step": 1200
},
{
"epoch": 0.059521765507136885,
"grad_norm": 0.9664294719696045,
"learning_rate": 5.9523809523809525e-06,
"loss": 0.2712,
"step": 1300
},
{
"epoch": 0.06410036285383973,
"grad_norm": 0.8629726767539978,
"learning_rate": 6.41025641025641e-06,
"loss": 0.2591,
"step": 1400
},
{
"epoch": 0.06867896020054257,
"grad_norm": 1.2182508707046509,
"learning_rate": 6.868131868131869e-06,
"loss": 0.2523,
"step": 1500
},
{
"epoch": 0.0732575575472454,
"grad_norm": 0.8369842171669006,
"learning_rate": 7.326007326007326e-06,
"loss": 0.2498,
"step": 1600
},
{
"epoch": 0.07783615489394824,
"grad_norm": 1.195522427558899,
"learning_rate": 7.783882783882785e-06,
"loss": 0.2345,
"step": 1700
},
{
"epoch": 0.08241475224065108,
"grad_norm": 1.7609663009643555,
"learning_rate": 8.241758241758243e-06,
"loss": 0.226,
"step": 1800
},
{
"epoch": 0.08699334958735391,
"grad_norm": 1.2075083255767822,
"learning_rate": 8.6996336996337e-06,
"loss": 0.2187,
"step": 1900
},
{
"epoch": 0.09157194693405675,
"grad_norm": 1.0836577415466309,
"learning_rate": 9.157509157509158e-06,
"loss": 0.2121,
"step": 2000
},
{
"epoch": 0.09615054428075959,
"grad_norm": 0.962322473526001,
"learning_rate": 9.615384615384616e-06,
"loss": 0.1984,
"step": 2100
},
{
"epoch": 0.10072914162746242,
"grad_norm": 0.6929520964622498,
"learning_rate": 1.0073260073260074e-05,
"loss": 0.1949,
"step": 2200
},
{
"epoch": 0.10530773897416526,
"grad_norm": 1.0407646894454956,
"learning_rate": 1.0531135531135532e-05,
"loss": 0.1968,
"step": 2300
},
{
"epoch": 0.1098863363208681,
"grad_norm": 1.3924702405929565,
"learning_rate": 1.0989010989010989e-05,
"loss": 0.1932,
"step": 2400
},
{
"epoch": 0.11446493366757093,
"grad_norm": 1.2128703594207764,
"learning_rate": 1.1446886446886447e-05,
"loss": 0.1855,
"step": 2500
},
{
"epoch": 0.11904353101427377,
"grad_norm": 1.8191227912902832,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.1866,
"step": 2600
},
{
"epoch": 0.12362212836097662,
"grad_norm": 1.528939127922058,
"learning_rate": 1.2362637362637363e-05,
"loss": 0.1798,
"step": 2700
},
{
"epoch": 0.12820072570767946,
"grad_norm": 1.4381574392318726,
"learning_rate": 1.282051282051282e-05,
"loss": 0.1788,
"step": 2800
},
{
"epoch": 0.13277932305438228,
"grad_norm": 1.9535735845565796,
"learning_rate": 1.327838827838828e-05,
"loss": 0.1751,
"step": 2900
},
{
"epoch": 0.13735792040108513,
"grad_norm": 1.095574140548706,
"learning_rate": 1.3736263736263738e-05,
"loss": 0.1745,
"step": 3000
},
{
"epoch": 0.14193651774778795,
"grad_norm": 1.4498175382614136,
"learning_rate": 1.4194139194139194e-05,
"loss": 0.1654,
"step": 3100
},
{
"epoch": 0.1465151150944908,
"grad_norm": 1.274528980255127,
"learning_rate": 1.4652014652014653e-05,
"loss": 0.1759,
"step": 3200
},
{
"epoch": 0.15109371244119363,
"grad_norm": 2.9102790355682373,
"learning_rate": 1.510989010989011e-05,
"loss": 0.1663,
"step": 3300
},
{
"epoch": 0.15567230978789648,
"grad_norm": 0.8863092660903931,
"learning_rate": 1.556776556776557e-05,
"loss": 0.1613,
"step": 3400
},
{
"epoch": 0.1602509071345993,
"grad_norm": 1.825391411781311,
"learning_rate": 1.602564102564103e-05,
"loss": 0.1655,
"step": 3500
},
{
"epoch": 0.16482950448130215,
"grad_norm": 1.2341893911361694,
"learning_rate": 1.6483516483516486e-05,
"loss": 0.1532,
"step": 3600
},
{
"epoch": 0.169408101828005,
"grad_norm": 1.6574805974960327,
"learning_rate": 1.6941391941391942e-05,
"loss": 0.1485,
"step": 3700
},
{
"epoch": 0.17398669917470783,
"grad_norm": 1.424926996231079,
"learning_rate": 1.73992673992674e-05,
"loss": 0.1514,
"step": 3800
},
{
"epoch": 0.17856529652141068,
"grad_norm": 1.5658457279205322,
"learning_rate": 1.785714285714286e-05,
"loss": 0.1532,
"step": 3900
},
{
"epoch": 0.1831438938681135,
"grad_norm": 2.2447550296783447,
"learning_rate": 1.8315018315018315e-05,
"loss": 0.1536,
"step": 4000
},
{
"epoch": 0.18772249121481635,
"grad_norm": 1.8312195539474487,
"learning_rate": 1.8772893772893775e-05,
"loss": 0.152,
"step": 4100
},
{
"epoch": 0.19230108856151917,
"grad_norm": 4.884443283081055,
"learning_rate": 1.923076923076923e-05,
"loss": 0.1423,
"step": 4200
},
{
"epoch": 0.19687968590822202,
"grad_norm": 2.421905994415283,
"learning_rate": 1.9688644688644688e-05,
"loss": 0.1493,
"step": 4300
},
{
"epoch": 0.20145828325492485,
"grad_norm": 1.5298246145248413,
"learning_rate": 2.0146520146520148e-05,
"loss": 0.1399,
"step": 4400
},
{
"epoch": 0.2060368806016277,
"grad_norm": 1.8627902269363403,
"learning_rate": 2.0604395604395604e-05,
"loss": 0.1418,
"step": 4500
},
{
"epoch": 0.21061547794833052,
"grad_norm": 1.0591548681259155,
"learning_rate": 2.1062271062271064e-05,
"loss": 0.146,
"step": 4600
},
{
"epoch": 0.21519407529503337,
"grad_norm": 2.3305251598358154,
"learning_rate": 2.152014652014652e-05,
"loss": 0.1394,
"step": 4700
},
{
"epoch": 0.2197726726417362,
"grad_norm": 2.5741324424743652,
"learning_rate": 2.1978021978021977e-05,
"loss": 0.1357,
"step": 4800
},
{
"epoch": 0.22435126998843904,
"grad_norm": 1.3497207164764404,
"learning_rate": 2.2435897435897437e-05,
"loss": 0.1293,
"step": 4900
},
{
"epoch": 0.22892986733514187,
"grad_norm": 1.5644819736480713,
"learning_rate": 2.2893772893772894e-05,
"loss": 0.132,
"step": 5000
},
{
"epoch": 0.23350846468184472,
"grad_norm": 1.2510719299316406,
"learning_rate": 2.3351648351648354e-05,
"loss": 0.1266,
"step": 5100
},
{
"epoch": 0.23808706202854754,
"grad_norm": 2.4705810546875,
"learning_rate": 2.380952380952381e-05,
"loss": 0.1334,
"step": 5200
},
{
"epoch": 0.2426656593752504,
"grad_norm": 2.0139317512512207,
"learning_rate": 2.4267399267399267e-05,
"loss": 0.1336,
"step": 5300
},
{
"epoch": 0.24724425672195324,
"grad_norm": 1.2344926595687866,
"learning_rate": 2.4725274725274727e-05,
"loss": 0.1309,
"step": 5400
},
{
"epoch": 0.25182285406865607,
"grad_norm": 1.8490768671035767,
"learning_rate": 2.5183150183150183e-05,
"loss": 0.1337,
"step": 5500
},
{
"epoch": 0.2564014514153589,
"grad_norm": 1.2988219261169434,
"learning_rate": 2.564102564102564e-05,
"loss": 0.1373,
"step": 5600
},
{
"epoch": 0.26098004876206177,
"grad_norm": 1.7260547876358032,
"learning_rate": 2.6098901098901103e-05,
"loss": 0.1199,
"step": 5700
},
{
"epoch": 0.26555864610876456,
"grad_norm": 2.653820037841797,
"learning_rate": 2.655677655677656e-05,
"loss": 0.1175,
"step": 5800
},
{
"epoch": 0.2701372434554674,
"grad_norm": 2.190546989440918,
"learning_rate": 2.7014652014652016e-05,
"loss": 0.1216,
"step": 5900
},
{
"epoch": 0.27471584080217026,
"grad_norm": 1.3163684606552124,
"learning_rate": 2.7472527472527476e-05,
"loss": 0.127,
"step": 6000
},
{
"epoch": 0.2792944381488731,
"grad_norm": 2.4772284030914307,
"learning_rate": 2.7930402930402932e-05,
"loss": 0.1218,
"step": 6100
},
{
"epoch": 0.2838730354955759,
"grad_norm": 2.5586929321289062,
"learning_rate": 2.838827838827839e-05,
"loss": 0.1258,
"step": 6200
},
{
"epoch": 0.28845163284227876,
"grad_norm": 1.8947139978408813,
"learning_rate": 2.8846153846153845e-05,
"loss": 0.1242,
"step": 6300
},
{
"epoch": 0.2930302301889816,
"grad_norm": 2.729238271713257,
"learning_rate": 2.9304029304029305e-05,
"loss": 0.1183,
"step": 6400
},
{
"epoch": 0.29760882753568446,
"grad_norm": 1.338982343673706,
"learning_rate": 2.9761904761904762e-05,
"loss": 0.1203,
"step": 6500
},
{
"epoch": 0.30218742488238726,
"grad_norm": 1.6393356323242188,
"learning_rate": 3.021978021978022e-05,
"loss": 0.1173,
"step": 6600
},
{
"epoch": 0.3067660222290901,
"grad_norm": 2.4386088848114014,
"learning_rate": 3.067765567765568e-05,
"loss": 0.1262,
"step": 6700
},
{
"epoch": 0.31134461957579296,
"grad_norm": 1.6236072778701782,
"learning_rate": 3.113553113553114e-05,
"loss": 0.1163,
"step": 6800
},
{
"epoch": 0.3159232169224958,
"grad_norm": 1.6855531930923462,
"learning_rate": 3.1593406593406595e-05,
"loss": 0.1116,
"step": 6900
},
{
"epoch": 0.3205018142691986,
"grad_norm": 0.9769238233566284,
"learning_rate": 3.205128205128206e-05,
"loss": 0.1152,
"step": 7000
},
{
"epoch": 0.32508041161590145,
"grad_norm": 2.372692823410034,
"learning_rate": 3.2509157509157515e-05,
"loss": 0.1148,
"step": 7100
},
{
"epoch": 0.3296590089626043,
"grad_norm": 1.6294013261795044,
"learning_rate": 3.296703296703297e-05,
"loss": 0.1149,
"step": 7200
},
{
"epoch": 0.33423760630930716,
"grad_norm": 1.4730180501937866,
"learning_rate": 3.342490842490843e-05,
"loss": 0.113,
"step": 7300
},
{
"epoch": 0.33881620365601,
"grad_norm": 1.435680866241455,
"learning_rate": 3.3882783882783884e-05,
"loss": 0.1084,
"step": 7400
},
{
"epoch": 0.3433948010027128,
"grad_norm": 1.2709417343139648,
"learning_rate": 3.434065934065934e-05,
"loss": 0.1076,
"step": 7500
},
{
"epoch": 0.34797339834941565,
"grad_norm": 2.1665501594543457,
"learning_rate": 3.47985347985348e-05,
"loss": 0.1074,
"step": 7600
},
{
"epoch": 0.3525519956961185,
"grad_norm": 1.0768879652023315,
"learning_rate": 3.525641025641026e-05,
"loss": 0.1124,
"step": 7700
},
{
"epoch": 0.35713059304282135,
"grad_norm": 2.1648874282836914,
"learning_rate": 3.571428571428572e-05,
"loss": 0.1139,
"step": 7800
},
{
"epoch": 0.36170919038952415,
"grad_norm": 1.0463404655456543,
"learning_rate": 3.6172161172161173e-05,
"loss": 0.1084,
"step": 7900
},
{
"epoch": 0.366287787736227,
"grad_norm": 2.0209906101226807,
"learning_rate": 3.663003663003663e-05,
"loss": 0.1143,
"step": 8000
},
{
"epoch": 0.37086638508292985,
"grad_norm": 1.6264885663986206,
"learning_rate": 3.708791208791209e-05,
"loss": 0.1117,
"step": 8100
},
{
"epoch": 0.3754449824296327,
"grad_norm": 1.8169121742248535,
"learning_rate": 3.754578754578755e-05,
"loss": 0.1055,
"step": 8200
},
{
"epoch": 0.3800235797763355,
"grad_norm": 1.3127943277359009,
"learning_rate": 3.8003663003663006e-05,
"loss": 0.1135,
"step": 8300
},
{
"epoch": 0.38460217712303835,
"grad_norm": 1.2721083164215088,
"learning_rate": 3.846153846153846e-05,
"loss": 0.1144,
"step": 8400
},
{
"epoch": 0.3891807744697412,
"grad_norm": 1.393925666809082,
"learning_rate": 3.891941391941392e-05,
"loss": 0.106,
"step": 8500
},
{
"epoch": 0.39375937181644405,
"grad_norm": 1.0821542739868164,
"learning_rate": 3.9377289377289376e-05,
"loss": 0.105,
"step": 8600
},
{
"epoch": 0.39833796916314684,
"grad_norm": 1.5736069679260254,
"learning_rate": 3.983516483516483e-05,
"loss": 0.111,
"step": 8700
},
{
"epoch": 0.4029165665098497,
"grad_norm": 1.8037768602371216,
"learning_rate": 4.0293040293040296e-05,
"loss": 0.1094,
"step": 8800
},
{
"epoch": 0.40749516385655254,
"grad_norm": 1.1317250728607178,
"learning_rate": 4.075091575091575e-05,
"loss": 0.1028,
"step": 8900
},
{
"epoch": 0.4120737612032554,
"grad_norm": 1.362167477607727,
"learning_rate": 4.120879120879121e-05,
"loss": 0.1087,
"step": 9000
},
{
"epoch": 0.41665235854995825,
"grad_norm": 1.9178133010864258,
"learning_rate": 4.166666666666667e-05,
"loss": 0.1036,
"step": 9100
},
{
"epoch": 0.42123095589666104,
"grad_norm": 1.3326084613800049,
"learning_rate": 4.212454212454213e-05,
"loss": 0.1034,
"step": 9200
},
{
"epoch": 0.4258095532433639,
"grad_norm": 2.299654245376587,
"learning_rate": 4.2582417582417585e-05,
"loss": 0.0938,
"step": 9300
},
{
"epoch": 0.43038815059006674,
"grad_norm": 1.5850861072540283,
"learning_rate": 4.304029304029304e-05,
"loss": 0.0991,
"step": 9400
},
{
"epoch": 0.4349667479367696,
"grad_norm": 1.0600929260253906,
"learning_rate": 4.34981684981685e-05,
"loss": 0.1001,
"step": 9500
},
{
"epoch": 0.4395453452834724,
"grad_norm": 0.8734288811683655,
"learning_rate": 4.3956043956043955e-05,
"loss": 0.0991,
"step": 9600
},
{
"epoch": 0.44412394263017524,
"grad_norm": 1.5875756740570068,
"learning_rate": 4.441391941391941e-05,
"loss": 0.0982,
"step": 9700
},
{
"epoch": 0.4487025399768781,
"grad_norm": 1.2083957195281982,
"learning_rate": 4.4871794871794874e-05,
"loss": 0.1027,
"step": 9800
},
{
"epoch": 0.45328113732358094,
"grad_norm": 1.5730398893356323,
"learning_rate": 4.532967032967033e-05,
"loss": 0.1035,
"step": 9900
},
{
"epoch": 0.45785973467028374,
"grad_norm": 1.0928138494491577,
"learning_rate": 4.578754578754579e-05,
"loss": 0.1017,
"step": 10000
},
{
"epoch": 0.4624383320169866,
"grad_norm": 1.858508586883545,
"learning_rate": 4.624542124542125e-05,
"loss": 0.1056,
"step": 10100
},
{
"epoch": 0.46701692936368944,
"grad_norm": 0.7009546756744385,
"learning_rate": 4.670329670329671e-05,
"loss": 0.1014,
"step": 10200
},
{
"epoch": 0.4715955267103923,
"grad_norm": 1.1056081056594849,
"learning_rate": 4.7161172161172164e-05,
"loss": 0.0997,
"step": 10300
},
{
"epoch": 0.4761741240570951,
"grad_norm": 1.5328575372695923,
"learning_rate": 4.761904761904762e-05,
"loss": 0.0971,
"step": 10400
},
{
"epoch": 0.48075272140379793,
"grad_norm": 2.9480137825012207,
"learning_rate": 4.8076923076923084e-05,
"loss": 0.1004,
"step": 10500
},
{
"epoch": 0.4853313187505008,
"grad_norm": 0.9198638796806335,
"learning_rate": 4.8534798534798533e-05,
"loss": 0.093,
"step": 10600
},
{
"epoch": 0.48990991609720363,
"grad_norm": 1.3510689735412598,
"learning_rate": 4.899267399267399e-05,
"loss": 0.0966,
"step": 10700
},
{
"epoch": 0.4944885134439065,
"grad_norm": 1.1206891536712646,
"learning_rate": 4.945054945054945e-05,
"loss": 0.1007,
"step": 10800
},
{
"epoch": 0.4990671107906093,
"grad_norm": 1.6948041915893555,
"learning_rate": 4.990842490842491e-05,
"loss": 0.0904,
"step": 10900
},
{
"epoch": 0.5036457081373121,
"grad_norm": 0.7195038199424744,
"learning_rate": 5.0366300366300366e-05,
"loss": 0.0894,
"step": 11000
},
{
"epoch": 0.508224305484015,
"grad_norm": 0.9326936602592468,
"learning_rate": 5.082417582417582e-05,
"loss": 0.0935,
"step": 11100
},
{
"epoch": 0.5128029028307178,
"grad_norm": 1.224360704421997,
"learning_rate": 5.128205128205128e-05,
"loss": 0.1011,
"step": 11200
},
{
"epoch": 0.5173815001774207,
"grad_norm": 0.7471579313278198,
"learning_rate": 5.173992673992675e-05,
"loss": 0.0936,
"step": 11300
},
{
"epoch": 0.5219600975241235,
"grad_norm": 0.8234615921974182,
"learning_rate": 5.2197802197802206e-05,
"loss": 0.092,
"step": 11400
},
{
"epoch": 0.5265386948708263,
"grad_norm": 1.204841136932373,
"learning_rate": 5.265567765567766e-05,
"loss": 0.1006,
"step": 11500
},
{
"epoch": 0.5311172922175291,
"grad_norm": 0.980890691280365,
"learning_rate": 5.311355311355312e-05,
"loss": 0.0909,
"step": 11600
},
{
"epoch": 0.535695889564232,
"grad_norm": 0.8736656308174133,
"learning_rate": 5.3571428571428575e-05,
"loss": 0.0921,
"step": 11700
},
{
"epoch": 0.5402744869109348,
"grad_norm": 1.8916438817977905,
"learning_rate": 5.402930402930403e-05,
"loss": 0.092,
"step": 11800
},
{
"epoch": 0.5448530842576377,
"grad_norm": 0.49095866084098816,
"learning_rate": 5.448717948717948e-05,
"loss": 0.0922,
"step": 11900
},
{
"epoch": 0.5494316816043405,
"grad_norm": 1.418338656425476,
"learning_rate": 5.494505494505495e-05,
"loss": 0.088,
"step": 12000
},
{
"epoch": 0.5540102789510434,
"grad_norm": 0.6211123466491699,
"learning_rate": 5.540293040293041e-05,
"loss": 0.0931,
"step": 12100
},
{
"epoch": 0.5585888762977462,
"grad_norm": 1.9046452045440674,
"learning_rate": 5.5860805860805865e-05,
"loss": 0.0934,
"step": 12200
},
{
"epoch": 0.563167473644449,
"grad_norm": 0.9247643351554871,
"learning_rate": 5.631868131868132e-05,
"loss": 0.0889,
"step": 12300
},
{
"epoch": 0.5677460709911518,
"grad_norm": 1.4018969535827637,
"learning_rate": 5.677655677655678e-05,
"loss": 0.0904,
"step": 12400
},
{
"epoch": 0.5723246683378547,
"grad_norm": 0.510405421257019,
"learning_rate": 5.7234432234432234e-05,
"loss": 0.0856,
"step": 12500
},
{
"epoch": 0.5769032656845575,
"grad_norm": 0.7951760292053223,
"learning_rate": 5.769230769230769e-05,
"loss": 0.0881,
"step": 12600
},
{
"epoch": 0.5814818630312604,
"grad_norm": 1.340402364730835,
"learning_rate": 5.8150183150183154e-05,
"loss": 0.0869,
"step": 12700
},
{
"epoch": 0.5860604603779632,
"grad_norm": 1.1029311418533325,
"learning_rate": 5.860805860805861e-05,
"loss": 0.0922,
"step": 12800
},
{
"epoch": 0.5906390577246661,
"grad_norm": 0.9942110180854797,
"learning_rate": 5.906593406593407e-05,
"loss": 0.094,
"step": 12900
},
{
"epoch": 0.5952176550713689,
"grad_norm": 0.9533814787864685,
"learning_rate": 5.9523809523809524e-05,
"loss": 0.0846,
"step": 13000
},
{
"epoch": 0.5997962524180718,
"grad_norm": 0.9805833101272583,
"learning_rate": 5.998168498168498e-05,
"loss": 0.0889,
"step": 13100
},
{
"epoch": 0.6043748497647745,
"grad_norm": 0.6185852885246277,
"learning_rate": 6.043956043956044e-05,
"loss": 0.0832,
"step": 13200
},
{
"epoch": 0.6089534471114774,
"grad_norm": 0.508185088634491,
"learning_rate": 6.089743589743589e-05,
"loss": 0.0828,
"step": 13300
},
{
"epoch": 0.6135320444581802,
"grad_norm": 0.8816813826560974,
"learning_rate": 6.135531135531136e-05,
"loss": 0.0907,
"step": 13400
},
{
"epoch": 0.6181106418048831,
"grad_norm": 0.7851380109786987,
"learning_rate": 6.181318681318681e-05,
"loss": 0.0838,
"step": 13500
},
{
"epoch": 0.6226892391515859,
"grad_norm": 1.2309856414794922,
"learning_rate": 6.227106227106228e-05,
"loss": 0.0892,
"step": 13600
},
{
"epoch": 0.6272678364982888,
"grad_norm": 0.9368516802787781,
"learning_rate": 6.272893772893773e-05,
"loss": 0.0826,
"step": 13700
},
{
"epoch": 0.6318464338449916,
"grad_norm": 2.122927188873291,
"learning_rate": 6.318681318681319e-05,
"loss": 0.0891,
"step": 13800
},
{
"epoch": 0.6364250311916945,
"grad_norm": 1.421099305152893,
"learning_rate": 6.364468864468864e-05,
"loss": 0.0856,
"step": 13900
},
{
"epoch": 0.6410036285383972,
"grad_norm": 1.240886926651001,
"learning_rate": 6.410256410256412e-05,
"loss": 0.084,
"step": 14000
},
{
"epoch": 0.6455822258851001,
"grad_norm": 1.5990924835205078,
"learning_rate": 6.456043956043957e-05,
"loss": 0.08,
"step": 14100
},
{
"epoch": 0.6501608232318029,
"grad_norm": 1.1593393087387085,
"learning_rate": 6.501831501831503e-05,
"loss": 0.0872,
"step": 14200
},
{
"epoch": 0.6547394205785058,
"grad_norm": 1.4352833032608032,
"learning_rate": 6.547619047619048e-05,
"loss": 0.0855,
"step": 14300
},
{
"epoch": 0.6593180179252086,
"grad_norm": 1.0805554389953613,
"learning_rate": 6.593406593406594e-05,
"loss": 0.076,
"step": 14400
},
{
"epoch": 0.6638966152719115,
"grad_norm": 1.4789384603500366,
"learning_rate": 6.639194139194139e-05,
"loss": 0.0787,
"step": 14500
},
{
"epoch": 0.6684752126186143,
"grad_norm": 0.5183298587799072,
"learning_rate": 6.684981684981686e-05,
"loss": 0.0818,
"step": 14600
},
{
"epoch": 0.6730538099653172,
"grad_norm": 0.540027916431427,
"learning_rate": 6.730769230769232e-05,
"loss": 0.0821,
"step": 14700
},
{
"epoch": 0.67763240731202,
"grad_norm": 0.4286615550518036,
"learning_rate": 6.776556776556777e-05,
"loss": 0.0835,
"step": 14800
},
{
"epoch": 0.6822110046587228,
"grad_norm": 0.6089257597923279,
"learning_rate": 6.822344322344323e-05,
"loss": 0.0774,
"step": 14900
},
{
"epoch": 0.6867896020054256,
"grad_norm": 1.8646626472473145,
"learning_rate": 6.868131868131868e-05,
"loss": 0.0768,
"step": 15000
},
{
"epoch": 0.6913681993521285,
"grad_norm": 1.5041414499282837,
"learning_rate": 6.913919413919414e-05,
"loss": 0.0811,
"step": 15100
},
{
"epoch": 0.6959467966988313,
"grad_norm": 1.2774971723556519,
"learning_rate": 6.95970695970696e-05,
"loss": 0.0824,
"step": 15200
},
{
"epoch": 0.7005253940455342,
"grad_norm": 0.7839298248291016,
"learning_rate": 7.005494505494506e-05,
"loss": 0.0825,
"step": 15300
},
{
"epoch": 0.705103991392237,
"grad_norm": 0.8007500767707825,
"learning_rate": 7.051282051282052e-05,
"loss": 0.0822,
"step": 15400
},
{
"epoch": 0.7096825887389399,
"grad_norm": 0.9601584672927856,
"learning_rate": 7.097069597069597e-05,
"loss": 0.0735,
"step": 15500
},
{
"epoch": 0.7142611860856427,
"grad_norm": 0.6283702254295349,
"learning_rate": 7.142857142857143e-05,
"loss": 0.0834,
"step": 15600
},
{
"epoch": 0.7188397834323454,
"grad_norm": 0.9371336102485657,
"learning_rate": 7.188644688644688e-05,
"loss": 0.0805,
"step": 15700
},
{
"epoch": 0.7234183807790483,
"grad_norm": 0.703433096408844,
"learning_rate": 7.234432234432235e-05,
"loss": 0.0797,
"step": 15800
},
{
"epoch": 0.7279969781257511,
"grad_norm": 1.1103012561798096,
"learning_rate": 7.28021978021978e-05,
"loss": 0.0702,
"step": 15900
},
{
"epoch": 0.732575575472454,
"grad_norm": 1.1333719491958618,
"learning_rate": 7.326007326007326e-05,
"loss": 0.0785,
"step": 16000
},
{
"epoch": 0.7371541728191568,
"grad_norm": 1.4945460557937622,
"learning_rate": 7.371794871794872e-05,
"loss": 0.0759,
"step": 16100
},
{
"epoch": 0.7417327701658597,
"grad_norm": 1.2516579627990723,
"learning_rate": 7.417582417582419e-05,
"loss": 0.0773,
"step": 16200
},
{
"epoch": 0.7463113675125626,
"grad_norm": 0.6910843253135681,
"learning_rate": 7.463369963369964e-05,
"loss": 0.0791,
"step": 16300
},
{
"epoch": 0.7508899648592654,
"grad_norm": 1.752681851387024,
"learning_rate": 7.50915750915751e-05,
"loss": 0.0729,
"step": 16400
},
{
"epoch": 0.7554685622059683,
"grad_norm": 0.46465998888015747,
"learning_rate": 7.554945054945055e-05,
"loss": 0.0691,
"step": 16500
},
{
"epoch": 0.760047159552671,
"grad_norm": 0.6676632165908813,
"learning_rate": 7.600732600732601e-05,
"loss": 0.0773,
"step": 16600
},
{
"epoch": 0.7646257568993738,
"grad_norm": 0.5497579574584961,
"learning_rate": 7.646520146520146e-05,
"loss": 0.0734,
"step": 16700
},
{
"epoch": 0.7692043542460767,
"grad_norm": 1.6269124746322632,
"learning_rate": 7.692307692307693e-05,
"loss": 0.075,
"step": 16800
},
{
"epoch": 0.7737829515927795,
"grad_norm": 1.7342535257339478,
"learning_rate": 7.738095238095239e-05,
"loss": 0.0738,
"step": 16900
},
{
"epoch": 0.7783615489394824,
"grad_norm": 0.5286089181900024,
"learning_rate": 7.783882783882784e-05,
"loss": 0.0791,
"step": 17000
},
{
"epoch": 0.7829401462861852,
"grad_norm": 1.0948727130889893,
"learning_rate": 7.82967032967033e-05,
"loss": 0.074,
"step": 17100
},
{
"epoch": 0.7875187436328881,
"grad_norm": 0.7580143809318542,
"learning_rate": 7.875457875457875e-05,
"loss": 0.0776,
"step": 17200
},
{
"epoch": 0.792097340979591,
"grad_norm": 0.6144015789031982,
"learning_rate": 7.921245421245422e-05,
"loss": 0.0698,
"step": 17300
},
{
"epoch": 0.7966759383262937,
"grad_norm": 1.054747462272644,
"learning_rate": 7.967032967032966e-05,
"loss": 0.0773,
"step": 17400
},
{
"epoch": 0.8012545356729965,
"grad_norm": 0.7159505486488342,
"learning_rate": 8.012820512820514e-05,
"loss": 0.0751,
"step": 17500
},
{
"epoch": 0.8058331330196994,
"grad_norm": 0.7566177248954773,
"learning_rate": 8.058608058608059e-05,
"loss": 0.0734,
"step": 17600
},
{
"epoch": 0.8104117303664022,
"grad_norm": 0.6282426714897156,
"learning_rate": 8.104395604395605e-05,
"loss": 0.0778,
"step": 17700
},
{
"epoch": 0.8149903277131051,
"grad_norm": 1.3555270433425903,
"learning_rate": 8.15018315018315e-05,
"loss": 0.0702,
"step": 17800
},
{
"epoch": 0.8195689250598079,
"grad_norm": 0.43876418471336365,
"learning_rate": 8.195970695970697e-05,
"loss": 0.0736,
"step": 17900
},
{
"epoch": 0.8241475224065108,
"grad_norm": 0.8096747994422913,
"learning_rate": 8.241758241758242e-05,
"loss": 0.0743,
"step": 18000
},
{
"epoch": 0.8287261197532136,
"grad_norm": 0.5688252449035645,
"learning_rate": 8.287545787545788e-05,
"loss": 0.0701,
"step": 18100
},
{
"epoch": 0.8333047170999165,
"grad_norm": 0.711829662322998,
"learning_rate": 8.333333333333334e-05,
"loss": 0.0795,
"step": 18200
},
{
"epoch": 0.8378833144466192,
"grad_norm": 0.9951382875442505,
"learning_rate": 8.37912087912088e-05,
"loss": 0.0752,
"step": 18300
},
{
"epoch": 0.8424619117933221,
"grad_norm": 1.2362946271896362,
"learning_rate": 8.424908424908426e-05,
"loss": 0.0726,
"step": 18400
},
{
"epoch": 0.8470405091400249,
"grad_norm": 0.6342608332633972,
"learning_rate": 8.470695970695971e-05,
"loss": 0.0784,
"step": 18500
},
{
"epoch": 0.8516191064867278,
"grad_norm": 0.4258309006690979,
"learning_rate": 8.516483516483517e-05,
"loss": 0.0725,
"step": 18600
},
{
"epoch": 0.8561977038334306,
"grad_norm": 0.6683163642883301,
"learning_rate": 8.562271062271062e-05,
"loss": 0.0711,
"step": 18700
},
{
"epoch": 0.8607763011801335,
"grad_norm": 0.7911510467529297,
"learning_rate": 8.608058608058608e-05,
"loss": 0.0683,
"step": 18800
},
{
"epoch": 0.8653548985268363,
"grad_norm": 0.5352203845977783,
"learning_rate": 8.653846153846155e-05,
"loss": 0.0702,
"step": 18900
},
{
"epoch": 0.8699334958735392,
"grad_norm": 0.850853443145752,
"learning_rate": 8.6996336996337e-05,
"loss": 0.0702,
"step": 19000
},
{
"epoch": 0.8745120932202419,
"grad_norm": 0.5566896796226501,
"learning_rate": 8.745421245421246e-05,
"loss": 0.0764,
"step": 19100
},
{
"epoch": 0.8790906905669448,
"grad_norm": 0.28583312034606934,
"learning_rate": 8.791208791208791e-05,
"loss": 0.0701,
"step": 19200
},
{
"epoch": 0.8836692879136476,
"grad_norm": 0.4633546471595764,
"learning_rate": 8.836996336996337e-05,
"loss": 0.0748,
"step": 19300
},
{
"epoch": 0.8882478852603505,
"grad_norm": 0.6778764724731445,
"learning_rate": 8.882783882783882e-05,
"loss": 0.0719,
"step": 19400
},
{
"epoch": 0.8928264826070533,
"grad_norm": 0.9359253644943237,
"learning_rate": 8.92857142857143e-05,
"loss": 0.0729,
"step": 19500
},
{
"epoch": 0.8974050799537562,
"grad_norm": 4.642319679260254,
"learning_rate": 8.974358974358975e-05,
"loss": 0.0704,
"step": 19600
},
{
"epoch": 0.901983677300459,
"grad_norm": 1.6843513250350952,
"learning_rate": 9.020146520146521e-05,
"loss": 0.0703,
"step": 19700
},
{
"epoch": 0.9065622746471619,
"grad_norm": 0.6702886819839478,
"learning_rate": 9.065934065934066e-05,
"loss": 0.072,
"step": 19800
},
{
"epoch": 0.9111408719938647,
"grad_norm": 0.7958008646965027,
"learning_rate": 9.111721611721613e-05,
"loss": 0.0717,
"step": 19900
},
{
"epoch": 0.9157194693405675,
"grad_norm": 0.41371116042137146,
"learning_rate": 9.157509157509158e-05,
"loss": 0.0701,
"step": 20000
},
{
"epoch": 0.9202980666872703,
"grad_norm": 0.446638286113739,
"learning_rate": 9.203296703296704e-05,
"loss": 0.0643,
"step": 20100
},
{
"epoch": 0.9248766640339732,
"grad_norm": 0.5474185347557068,
"learning_rate": 9.24908424908425e-05,
"loss": 0.0672,
"step": 20200
},
{
"epoch": 0.929455261380676,
"grad_norm": 1.0076775550842285,
"learning_rate": 9.294871794871795e-05,
"loss": 0.0699,
"step": 20300
},
{
"epoch": 0.9340338587273789,
"grad_norm": 0.45534393191337585,
"learning_rate": 9.340659340659341e-05,
"loss": 0.0712,
"step": 20400
},
{
"epoch": 0.9386124560740817,
"grad_norm": 0.6854729652404785,
"learning_rate": 9.386446886446886e-05,
"loss": 0.0695,
"step": 20500
},
{
"epoch": 0.9431910534207846,
"grad_norm": 1.4581429958343506,
"learning_rate": 9.432234432234433e-05,
"loss": 0.0676,
"step": 20600
},
{
"epoch": 0.9477696507674874,
"grad_norm": 1.6819262504577637,
"learning_rate": 9.478021978021978e-05,
"loss": 0.0683,
"step": 20700
},
{
"epoch": 0.9523482481141902,
"grad_norm": 0.8808913826942444,
"learning_rate": 9.523809523809524e-05,
"loss": 0.0668,
"step": 20800
},
{
"epoch": 0.956926845460893,
"grad_norm": 0.3119984269142151,
"learning_rate": 9.56959706959707e-05,
"loss": 0.074,
"step": 20900
},
{
"epoch": 0.9615054428075959,
"grad_norm": 0.6743124723434448,
"learning_rate": 9.615384615384617e-05,
"loss": 0.0728,
"step": 21000
},
{
"epoch": 0.9660840401542987,
"grad_norm": 0.6196538209915161,
"learning_rate": 9.661172161172162e-05,
"loss": 0.0677,
"step": 21100
},
{
"epoch": 0.9706626375010016,
"grad_norm": 0.7010948657989502,
"learning_rate": 9.706959706959707e-05,
"loss": 0.0716,
"step": 21200
},
{
"epoch": 0.9752412348477044,
"grad_norm": 0.7601842880249023,
"learning_rate": 9.752747252747253e-05,
"loss": 0.0675,
"step": 21300
},
{
"epoch": 0.9798198321944073,
"grad_norm": 0.5342845320701599,
"learning_rate": 9.798534798534798e-05,
"loss": 0.0732,
"step": 21400
},
{
"epoch": 0.9843984295411101,
"grad_norm": 0.7880052328109741,
"learning_rate": 9.844322344322346e-05,
"loss": 0.0671,
"step": 21500
},
{
"epoch": 0.988977026887813,
"grad_norm": 0.690728485584259,
"learning_rate": 9.89010989010989e-05,
"loss": 0.069,
"step": 21600
},
{
"epoch": 0.9935556242345157,
"grad_norm": 0.6646633148193359,
"learning_rate": 9.935897435897437e-05,
"loss": 0.0667,
"step": 21700
},
{
"epoch": 0.9981342215812186,
"grad_norm": 1.2037309408187866,
"learning_rate": 9.981684981684982e-05,
"loss": 0.0683,
"step": 21800
},
{
"epoch": 0.9999656605198998,
"eval_loss": 0.14296908676624298,
"eval_runtime": 256.7574,
"eval_samples_per_second": 21.421,
"eval_steps_per_second": 21.421,
"step": 21840
},
{
"epoch": 1.0027128189279215,
"grad_norm": 0.4327790439128876,
"learning_rate": 9.999997700931376e-05,
"loss": 0.0692,
"step": 21900
},
{
"epoch": 1.0072914162746243,
"grad_norm": 0.8181611895561218,
"learning_rate": 9.999983651075218e-05,
"loss": 0.0542,
"step": 22000
},
{
"epoch": 1.011870013621327,
"grad_norm": 0.6566409468650818,
"learning_rate": 9.999956828659095e-05,
"loss": 0.0571,
"step": 22100
},
{
"epoch": 1.01644861096803,
"grad_norm": 0.7238597273826599,
"learning_rate": 9.999917233751526e-05,
"loss": 0.0611,
"step": 22200
},
{
"epoch": 1.0210272083147327,
"grad_norm": 0.3051077127456665,
"learning_rate": 9.999864866453658e-05,
"loss": 0.059,
"step": 22300
},
{
"epoch": 1.0256058056614357,
"grad_norm": 1.5912861824035645,
"learning_rate": 9.999799726899262e-05,
"loss": 0.0615,
"step": 22400
},
{
"epoch": 1.0301844030081384,
"grad_norm": 0.6656569242477417,
"learning_rate": 9.999721815254742e-05,
"loss": 0.0618,
"step": 22500
},
{
"epoch": 1.0347630003548414,
"grad_norm": 1.1994621753692627,
"learning_rate": 9.999631131719119e-05,
"loss": 0.0614,
"step": 22600
},
{
"epoch": 1.039341597701544,
"grad_norm": 0.6420437097549438,
"learning_rate": 9.999527676524052e-05,
"loss": 0.0565,
"step": 22700
},
{
"epoch": 1.043920195048247,
"grad_norm": 0.6077245473861694,
"learning_rate": 9.999411449933816e-05,
"loss": 0.0609,
"step": 22800
},
{
"epoch": 1.0484987923949498,
"grad_norm": 0.6168214082717896,
"learning_rate": 9.999282452245315e-05,
"loss": 0.0579,
"step": 22900
},
{
"epoch": 1.0530773897416525,
"grad_norm": 0.4628690779209137,
"learning_rate": 9.999140683788078e-05,
"loss": 0.0576,
"step": 23000
},
{
"epoch": 1.0576559870883555,
"grad_norm": 0.43243736028671265,
"learning_rate": 9.998986144924251e-05,
"loss": 0.0615,
"step": 23100
},
{
"epoch": 1.0622345844350582,
"grad_norm": 0.7162685394287109,
"learning_rate": 9.998818836048611e-05,
"loss": 0.0598,
"step": 23200
},
{
"epoch": 1.0668131817817612,
"grad_norm": 0.7162106037139893,
"learning_rate": 9.99863875758855e-05,
"loss": 0.0574,
"step": 23300
},
{
"epoch": 1.071391779128464,
"grad_norm": 0.4392016530036926,
"learning_rate": 9.998445910004082e-05,
"loss": 0.0576,
"step": 23400
},
{
"epoch": 1.075970376475167,
"grad_norm": 0.8344998955726624,
"learning_rate": 9.998240293787841e-05,
"loss": 0.0639,
"step": 23500
},
{
"epoch": 1.0805489738218697,
"grad_norm": 0.9016310572624207,
"learning_rate": 9.998021909465076e-05,
"loss": 0.058,
"step": 23600
},
{
"epoch": 1.0851275711685724,
"grad_norm": 0.1677553951740265,
"learning_rate": 9.997790757593657e-05,
"loss": 0.0648,
"step": 23700
},
{
"epoch": 1.0897061685152754,
"grad_norm": 0.6796389222145081,
"learning_rate": 9.997546838764065e-05,
"loss": 0.0589,
"step": 23800
},
{
"epoch": 1.094284765861978,
"grad_norm": 0.667464554309845,
"learning_rate": 9.997290153599394e-05,
"loss": 0.0557,
"step": 23900
},
{
"epoch": 1.098863363208681,
"grad_norm": 0.9013321995735168,
"learning_rate": 9.997020702755353e-05,
"loss": 0.0555,
"step": 24000
},
{
"epoch": 1.1034419605553838,
"grad_norm": 0.3552779257297516,
"learning_rate": 9.996738486920259e-05,
"loss": 0.0568,
"step": 24100
},
{
"epoch": 1.1080205579020868,
"grad_norm": 0.6730219721794128,
"learning_rate": 9.996443506815039e-05,
"loss": 0.0556,
"step": 24200
},
{
"epoch": 1.1125991552487895,
"grad_norm": 0.29462745785713196,
"learning_rate": 9.996135763193225e-05,
"loss": 0.055,
"step": 24300
},
{
"epoch": 1.1171777525954925,
"grad_norm": 0.3105739653110504,
"learning_rate": 9.995815256840955e-05,
"loss": 0.0592,
"step": 24400
},
{
"epoch": 1.1217563499421952,
"grad_norm": 0.5383213758468628,
"learning_rate": 9.995481988576968e-05,
"loss": 0.0525,
"step": 24500
},
{
"epoch": 1.126334947288898,
"grad_norm": 0.6290645003318787,
"learning_rate": 9.995135959252605e-05,
"loss": 0.058,
"step": 24600
},
{
"epoch": 1.130913544635601,
"grad_norm": 0.4531712532043457,
"learning_rate": 9.994777169751806e-05,
"loss": 0.0515,
"step": 24700
},
{
"epoch": 1.1354921419823036,
"grad_norm": 0.5031425952911377,
"learning_rate": 9.994405620991102e-05,
"loss": 0.0591,
"step": 24800
},
{
"epoch": 1.1400707393290066,
"grad_norm": 0.8398526310920715,
"learning_rate": 9.994021313919628e-05,
"loss": 0.0608,
"step": 24900
},
{
"epoch": 1.1446493366757093,
"grad_norm": 0.3783178925514221,
"learning_rate": 9.9936242495191e-05,
"loss": 0.0589,
"step": 25000
},
{
"epoch": 1.1492279340224123,
"grad_norm": 0.3554207384586334,
"learning_rate": 9.99321442880383e-05,
"loss": 0.0561,
"step": 25100
},
{
"epoch": 1.153806531369115,
"grad_norm": 0.8848966956138611,
"learning_rate": 9.992791852820709e-05,
"loss": 0.0571,
"step": 25200
},
{
"epoch": 1.158385128715818,
"grad_norm": 0.4907087981700897,
"learning_rate": 9.99235652264922e-05,
"loss": 0.0593,
"step": 25300
},
{
"epoch": 1.1629637260625207,
"grad_norm": 0.6268092393875122,
"learning_rate": 9.991908439401421e-05,
"loss": 0.0526,
"step": 25400
},
{
"epoch": 1.1675423234092235,
"grad_norm": 0.5183268785476685,
"learning_rate": 9.991447604221951e-05,
"loss": 0.0536,
"step": 25500
},
{
"epoch": 1.1721209207559264,
"grad_norm": 0.4522722065448761,
"learning_rate": 9.990974018288022e-05,
"loss": 0.05,
"step": 25600
},
{
"epoch": 1.1766995181026292,
"grad_norm": 0.8773862719535828,
"learning_rate": 9.990487682809418e-05,
"loss": 0.0539,
"step": 25700
},
{
"epoch": 1.1812781154493321,
"grad_norm": 0.5325748920440674,
"learning_rate": 9.989988599028492e-05,
"loss": 0.0604,
"step": 25800
},
{
"epoch": 1.1858567127960349,
"grad_norm": 0.5544828772544861,
"learning_rate": 9.989476768220168e-05,
"loss": 0.0538,
"step": 25900
},
{
"epoch": 1.1904353101427378,
"grad_norm": 0.8816759586334229,
"learning_rate": 9.988952191691925e-05,
"loss": 0.0568,
"step": 26000
},
{
"epoch": 1.1950139074894406,
"grad_norm": 0.8002095222473145,
"learning_rate": 9.988414870783806e-05,
"loss": 0.0573,
"step": 26100
},
{
"epoch": 1.1995925048361435,
"grad_norm": 0.5534511208534241,
"learning_rate": 9.987864806868405e-05,
"loss": 0.0597,
"step": 26200
},
{
"epoch": 1.2041711021828463,
"grad_norm": 0.4148072898387909,
"learning_rate": 9.987302001350875e-05,
"loss": 0.049,
"step": 26300
},
{
"epoch": 1.208749699529549,
"grad_norm": 0.30762553215026855,
"learning_rate": 9.986726455668913e-05,
"loss": 0.0559,
"step": 26400
},
{
"epoch": 1.213328296876252,
"grad_norm": 0.7850671410560608,
"learning_rate": 9.986138171292762e-05,
"loss": 0.0515,
"step": 26500
},
{
"epoch": 1.2179068942229547,
"grad_norm": 0.45396122336387634,
"learning_rate": 9.985537149725205e-05,
"loss": 0.0529,
"step": 26600
},
{
"epoch": 1.2224854915696577,
"grad_norm": 0.4627123177051544,
"learning_rate": 9.984923392501567e-05,
"loss": 0.0555,
"step": 26700
},
{
"epoch": 1.2270640889163604,
"grad_norm": 0.8190097212791443,
"learning_rate": 9.984296901189702e-05,
"loss": 0.0507,
"step": 26800
},
{
"epoch": 1.2316426862630634,
"grad_norm": 0.6249597668647766,
"learning_rate": 9.983657677389992e-05,
"loss": 0.0538,
"step": 26900
},
{
"epoch": 1.2362212836097661,
"grad_norm": 0.8909338116645813,
"learning_rate": 9.983005722735351e-05,
"loss": 0.0458,
"step": 27000
},
{
"epoch": 1.240799880956469,
"grad_norm": 0.4777618944644928,
"learning_rate": 9.98234103889121e-05,
"loss": 0.0552,
"step": 27100
},
{
"epoch": 1.2453784783031718,
"grad_norm": 0.30679649114608765,
"learning_rate": 9.981663627555515e-05,
"loss": 0.0547,
"step": 27200
},
{
"epoch": 1.2499570756498746,
"grad_norm": 0.5480089783668518,
"learning_rate": 9.980973490458728e-05,
"loss": 0.0584,
"step": 27300
},
{
"epoch": 1.2545356729965775,
"grad_norm": 0.7595780491828918,
"learning_rate": 9.980270629363819e-05,
"loss": 0.056,
"step": 27400
},
{
"epoch": 1.2591142703432803,
"grad_norm": 0.34684839844703674,
"learning_rate": 9.979555046066261e-05,
"loss": 0.0545,
"step": 27500
},
{
"epoch": 1.2636928676899832,
"grad_norm": 0.4605325758457184,
"learning_rate": 9.978826742394027e-05,
"loss": 0.0588,
"step": 27600
},
{
"epoch": 1.268271465036686,
"grad_norm": 0.8060219287872314,
"learning_rate": 9.97808572020758e-05,
"loss": 0.0529,
"step": 27700
},
{
"epoch": 1.2728500623833887,
"grad_norm": 0.4551374614238739,
"learning_rate": 9.97733198139988e-05,
"loss": 0.0506,
"step": 27800
},
{
"epoch": 1.2774286597300917,
"grad_norm": 0.5313341617584229,
"learning_rate": 9.976565527896366e-05,
"loss": 0.0524,
"step": 27900
},
{
"epoch": 1.2820072570767946,
"grad_norm": 0.511184811592102,
"learning_rate": 9.97578636165496e-05,
"loss": 0.0522,
"step": 28000
},
{
"epoch": 1.2865858544234974,
"grad_norm": 0.8772425055503845,
"learning_rate": 9.974994484666058e-05,
"loss": 0.0546,
"step": 28100
},
{
"epoch": 1.2911644517702001,
"grad_norm": 0.4593620002269745,
"learning_rate": 9.974189898952524e-05,
"loss": 0.0527,
"step": 28200
},
{
"epoch": 1.295743049116903,
"grad_norm": 0.49878451228141785,
"learning_rate": 9.973372606569692e-05,
"loss": 0.0536,
"step": 28300
},
{
"epoch": 1.3003216464636058,
"grad_norm": 0.8320513367652893,
"learning_rate": 9.97254260960535e-05,
"loss": 0.0522,
"step": 28400
},
{
"epoch": 1.3049002438103088,
"grad_norm": 0.4917149245738983,
"learning_rate": 9.971699910179742e-05,
"loss": 0.0574,
"step": 28500
},
{
"epoch": 1.3094788411570115,
"grad_norm": 0.561815083026886,
"learning_rate": 9.97084451044556e-05,
"loss": 0.0557,
"step": 28600
},
{
"epoch": 1.3140574385037143,
"grad_norm": 0.43367573618888855,
"learning_rate": 9.969976412587944e-05,
"loss": 0.0522,
"step": 28700
},
{
"epoch": 1.3186360358504172,
"grad_norm": 1.0113517045974731,
"learning_rate": 9.969095618824462e-05,
"loss": 0.0491,
"step": 28800
},
{
"epoch": 1.3232146331971202,
"grad_norm": 0.548916220664978,
"learning_rate": 9.968202131405124e-05,
"loss": 0.0499,
"step": 28900
},
{
"epoch": 1.327793230543823,
"grad_norm": 0.5541431903839111,
"learning_rate": 9.967295952612361e-05,
"loss": 0.0464,
"step": 29000
},
{
"epoch": 1.3323718278905257,
"grad_norm": 0.47956761717796326,
"learning_rate": 9.966377084761023e-05,
"loss": 0.0548,
"step": 29100
},
{
"epoch": 1.3369504252372286,
"grad_norm": 0.9489524960517883,
"learning_rate": 9.965445530198378e-05,
"loss": 0.0576,
"step": 29200
},
{
"epoch": 1.3415290225839314,
"grad_norm": 0.7664705514907837,
"learning_rate": 9.964501291304101e-05,
"loss": 0.055,
"step": 29300
},
{
"epoch": 1.3461076199306343,
"grad_norm": 0.5601370930671692,
"learning_rate": 9.96354437049027e-05,
"loss": 0.0525,
"step": 29400
},
{
"epoch": 1.350686217277337,
"grad_norm": 0.3737477958202362,
"learning_rate": 9.962574770201358e-05,
"loss": 0.049,
"step": 29500
},
{
"epoch": 1.3552648146240398,
"grad_norm": 0.8171801567077637,
"learning_rate": 9.96159249291423e-05,
"loss": 0.0501,
"step": 29600
},
{
"epoch": 1.3598434119707428,
"grad_norm": 0.8035039305686951,
"learning_rate": 9.960597541138131e-05,
"loss": 0.0493,
"step": 29700
},
{
"epoch": 1.3644220093174457,
"grad_norm": 0.2262045294046402,
"learning_rate": 9.959589917414687e-05,
"loss": 0.0503,
"step": 29800
},
{
"epoch": 1.3690006066641485,
"grad_norm": 0.5973814725875854,
"learning_rate": 9.958569624317893e-05,
"loss": 0.0528,
"step": 29900
},
{
"epoch": 1.3735792040108512,
"grad_norm": 0.66443932056427,
"learning_rate": 9.957536664454108e-05,
"loss": 0.0509,
"step": 30000
},
{
"epoch": 1.3781578013575542,
"grad_norm": 1.04296875,
"learning_rate": 9.956491040462052e-05,
"loss": 0.0515,
"step": 30100
},
{
"epoch": 1.382736398704257,
"grad_norm": 1.5576283931732178,
"learning_rate": 9.955432755012788e-05,
"loss": 0.0533,
"step": 30200
},
{
"epoch": 1.3873149960509599,
"grad_norm": 0.3329857885837555,
"learning_rate": 9.954361810809732e-05,
"loss": 0.0523,
"step": 30300
},
{
"epoch": 1.3918935933976626,
"grad_norm": 1.7028000354766846,
"learning_rate": 9.953278210588628e-05,
"loss": 0.0516,
"step": 30400
},
{
"epoch": 1.3964721907443653,
"grad_norm": 0.49420544505119324,
"learning_rate": 9.952181957117559e-05,
"loss": 0.0505,
"step": 30500
},
{
"epoch": 1.4010507880910683,
"grad_norm": 0.15591812133789062,
"learning_rate": 9.951073053196926e-05,
"loss": 0.0512,
"step": 30600
},
{
"epoch": 1.4056293854377713,
"grad_norm": 0.4006904661655426,
"learning_rate": 9.949951501659445e-05,
"loss": 0.0522,
"step": 30700
},
{
"epoch": 1.410207982784474,
"grad_norm": 0.46110183000564575,
"learning_rate": 9.948817305370143e-05,
"loss": 0.049,
"step": 30800
},
{
"epoch": 1.4147865801311768,
"grad_norm": 0.24079594016075134,
"learning_rate": 9.947670467226349e-05,
"loss": 0.0521,
"step": 30900
},
{
"epoch": 1.4193651774778797,
"grad_norm": 0.6515139937400818,
"learning_rate": 9.946510990157682e-05,
"loss": 0.0495,
"step": 31000
},
{
"epoch": 1.4239437748245825,
"grad_norm": 0.5415006279945374,
"learning_rate": 9.945338877126052e-05,
"loss": 0.0526,
"step": 31100
},
{
"epoch": 1.4285223721712854,
"grad_norm": 0.8711938261985779,
"learning_rate": 9.944154131125642e-05,
"loss": 0.0484,
"step": 31200
},
{
"epoch": 1.4331009695179882,
"grad_norm": 0.5021001696586609,
"learning_rate": 9.942956755182916e-05,
"loss": 0.0567,
"step": 31300
},
{
"epoch": 1.437679566864691,
"grad_norm": 1.9676926136016846,
"learning_rate": 9.941746752356588e-05,
"loss": 0.0496,
"step": 31400
},
{
"epoch": 1.4422581642113939,
"grad_norm": 0.5120891332626343,
"learning_rate": 9.94052412573764e-05,
"loss": 0.0492,
"step": 31500
},
{
"epoch": 1.4468367615580966,
"grad_norm": 0.9182060956954956,
"learning_rate": 9.939288878449294e-05,
"loss": 0.0525,
"step": 31600
},
{
"epoch": 1.4514153589047996,
"grad_norm": 0.6737085580825806,
"learning_rate": 9.938041013647016e-05,
"loss": 0.0462,
"step": 31700
},
{
"epoch": 1.4559939562515023,
"grad_norm": 0.7034218311309814,
"learning_rate": 9.936780534518502e-05,
"loss": 0.0497,
"step": 31800
},
{
"epoch": 1.4605725535982053,
"grad_norm": 0.9228888750076294,
"learning_rate": 9.935507444283669e-05,
"loss": 0.0482,
"step": 31900
},
{
"epoch": 1.465151150944908,
"grad_norm": 0.3609278202056885,
"learning_rate": 9.934221746194655e-05,
"loss": 0.0594,
"step": 32000
},
{
"epoch": 1.469729748291611,
"grad_norm": 0.12724661827087402,
"learning_rate": 9.932923443535798e-05,
"loss": 0.0476,
"step": 32100
},
{
"epoch": 1.4743083456383137,
"grad_norm": 0.5686663389205933,
"learning_rate": 9.931612539623643e-05,
"loss": 0.0538,
"step": 32200
},
{
"epoch": 1.4788869429850164,
"grad_norm": 0.6813719868659973,
"learning_rate": 9.930289037806919e-05,
"loss": 0.0511,
"step": 32300
},
{
"epoch": 1.4834655403317194,
"grad_norm": 0.678242027759552,
"learning_rate": 9.928952941466538e-05,
"loss": 0.0492,
"step": 32400
},
{
"epoch": 1.4880441376784221,
"grad_norm": 0.7721807360649109,
"learning_rate": 9.927604254015585e-05,
"loss": 0.0529,
"step": 32500
},
{
"epoch": 1.492622735025125,
"grad_norm": 0.6314060688018799,
"learning_rate": 9.926242978899312e-05,
"loss": 0.0462,
"step": 32600
},
{
"epoch": 1.4972013323718278,
"grad_norm": 0.5451350212097168,
"learning_rate": 9.924869119595119e-05,
"loss": 0.0476,
"step": 32700
},
{
"epoch": 1.5017799297185306,
"grad_norm": 0.5342521071434021,
"learning_rate": 9.923482679612563e-05,
"loss": 0.0505,
"step": 32800
},
{
"epoch": 1.5063585270652335,
"grad_norm": 0.7561967968940735,
"learning_rate": 9.922083662493329e-05,
"loss": 0.0491,
"step": 32900
},
{
"epoch": 1.5109371244119365,
"grad_norm": 0.2349376529455185,
"learning_rate": 9.920672071811237e-05,
"loss": 0.0463,
"step": 33000
},
{
"epoch": 1.5155157217586392,
"grad_norm": 0.3987545073032379,
"learning_rate": 9.919247911172224e-05,
"loss": 0.0528,
"step": 33100
},
{
"epoch": 1.520094319105342,
"grad_norm": 0.4922156035900116,
"learning_rate": 9.917811184214337e-05,
"loss": 0.0479,
"step": 33200
},
{
"epoch": 1.524672916452045,
"grad_norm": 0.9758409261703491,
"learning_rate": 9.916361894607722e-05,
"loss": 0.0537,
"step": 33300
},
{
"epoch": 1.529251513798748,
"grad_norm": 0.5304883718490601,
"learning_rate": 9.914900046054623e-05,
"loss": 0.0504,
"step": 33400
},
{
"epoch": 1.5338301111454506,
"grad_norm": 0.4293117821216583,
"learning_rate": 9.913425642289358e-05,
"loss": 0.0481,
"step": 33500
},
{
"epoch": 1.5384087084921534,
"grad_norm": 0.354592889547348,
"learning_rate": 9.911938687078324e-05,
"loss": 0.0496,
"step": 33600
},
{
"epoch": 1.5429873058388561,
"grad_norm": 0.36046740412712097,
"learning_rate": 9.910439184219978e-05,
"loss": 0.0451,
"step": 33700
},
{
"epoch": 1.547565903185559,
"grad_norm": 0.4680946171283722,
"learning_rate": 9.90892713754483e-05,
"loss": 0.048,
"step": 33800
},
{
"epoch": 1.552144500532262,
"grad_norm": 0.4586212635040283,
"learning_rate": 9.907402550915433e-05,
"loss": 0.0462,
"step": 33900
},
{
"epoch": 1.5567230978789648,
"grad_norm": 0.2608386278152466,
"learning_rate": 9.905865428226376e-05,
"loss": 0.0472,
"step": 34000
},
{
"epoch": 1.5613016952256675,
"grad_norm": 0.5291585922241211,
"learning_rate": 9.90431577340427e-05,
"loss": 0.044,
"step": 34100
},
{
"epoch": 1.5658802925723705,
"grad_norm": 0.9200330376625061,
"learning_rate": 9.90275359040774e-05,
"loss": 0.0487,
"step": 34200
},
{
"epoch": 1.5704588899190735,
"grad_norm": 0.550689160823822,
"learning_rate": 9.901178883227414e-05,
"loss": 0.0515,
"step": 34300
},
{
"epoch": 1.5750374872657762,
"grad_norm": 0.7476568818092346,
"learning_rate": 9.899591655885912e-05,
"loss": 0.0457,
"step": 34400
},
{
"epoch": 1.579616084612479,
"grad_norm": 0.8736041188240051,
"learning_rate": 9.89799191243784e-05,
"loss": 0.0473,
"step": 34500
},
{
"epoch": 1.5841946819591817,
"grad_norm": 0.17842432856559753,
"learning_rate": 9.896379656969776e-05,
"loss": 0.0456,
"step": 34600
},
{
"epoch": 1.5887732793058846,
"grad_norm": 0.5870159864425659,
"learning_rate": 9.894754893600258e-05,
"loss": 0.052,
"step": 34700
},
{
"epoch": 1.5933518766525876,
"grad_norm": 0.33038216829299927,
"learning_rate": 9.893117626479777e-05,
"loss": 0.0498,
"step": 34800
},
{
"epoch": 1.5979304739992903,
"grad_norm": 0.7480065226554871,
"learning_rate": 9.891467859790767e-05,
"loss": 0.0484,
"step": 34900
},
{
"epoch": 1.602509071345993,
"grad_norm": 0.46852391958236694,
"learning_rate": 9.889805597747588e-05,
"loss": 0.0471,
"step": 35000
},
{
"epoch": 1.607087668692696,
"grad_norm": 0.33162882924079895,
"learning_rate": 9.888130844596524e-05,
"loss": 0.0477,
"step": 35100
},
{
"epoch": 1.6116662660393988,
"grad_norm": 1.0083402395248413,
"learning_rate": 9.886443604615764e-05,
"loss": 0.051,
"step": 35200
},
{
"epoch": 1.6162448633861017,
"grad_norm": 0.6158673763275146,
"learning_rate": 9.8847438821154e-05,
"loss": 0.0459,
"step": 35300
},
{
"epoch": 1.6208234607328045,
"grad_norm": 1.0110929012298584,
"learning_rate": 9.883031681437405e-05,
"loss": 0.0481,
"step": 35400
},
{
"epoch": 1.6254020580795072,
"grad_norm": 0.35791000723838806,
"learning_rate": 9.881307006955634e-05,
"loss": 0.0466,
"step": 35500
},
{
"epoch": 1.6299806554262102,
"grad_norm": 0.5888839364051819,
"learning_rate": 9.879569863075799e-05,
"loss": 0.048,
"step": 35600
},
{
"epoch": 1.6345592527729131,
"grad_norm": 0.7552986741065979,
"learning_rate": 9.877820254235471e-05,
"loss": 0.0482,
"step": 35700
},
{
"epoch": 1.6391378501196159,
"grad_norm": 0.5620241165161133,
"learning_rate": 9.87605818490406e-05,
"loss": 0.0461,
"step": 35800
},
{
"epoch": 1.6437164474663186,
"grad_norm": 0.40786847472190857,
"learning_rate": 9.87428365958281e-05,
"loss": 0.0501,
"step": 35900
},
{
"epoch": 1.6482950448130216,
"grad_norm": 0.3627175986766815,
"learning_rate": 9.872496682804781e-05,
"loss": 0.0495,
"step": 36000
},
{
"epoch": 1.6528736421597243,
"grad_norm": 0.3631226122379303,
"learning_rate": 9.870697259134844e-05,
"loss": 0.0415,
"step": 36100
},
{
"epoch": 1.6574522395064273,
"grad_norm": 0.41811424493789673,
"learning_rate": 9.86888539316966e-05,
"loss": 0.0444,
"step": 36200
},
{
"epoch": 1.66203083685313,
"grad_norm": 0.8351331949234009,
"learning_rate": 9.867061089537677e-05,
"loss": 0.0499,
"step": 36300
},
{
"epoch": 1.6666094341998328,
"grad_norm": 0.26144087314605713,
"learning_rate": 9.865224352899119e-05,
"loss": 0.0488,
"step": 36400
},
{
"epoch": 1.6711880315465357,
"grad_norm": 1.9417400360107422,
"learning_rate": 9.863375187945967e-05,
"loss": 0.0456,
"step": 36500
},
{
"epoch": 1.6757666288932387,
"grad_norm": 0.762496292591095,
"learning_rate": 9.861513599401948e-05,
"loss": 0.0446,
"step": 36600
},
{
"epoch": 1.6803452262399414,
"grad_norm": 0.6936019659042358,
"learning_rate": 9.859639592022528e-05,
"loss": 0.046,
"step": 36700
},
{
"epoch": 1.6849238235866442,
"grad_norm": 0.3661505877971649,
"learning_rate": 9.857753170594897e-05,
"loss": 0.0445,
"step": 36800
},
{
"epoch": 1.689502420933347,
"grad_norm": 0.6424843668937683,
"learning_rate": 9.85585433993796e-05,
"loss": 0.0473,
"step": 36900
},
{
"epoch": 1.6940810182800499,
"grad_norm": 0.520645022392273,
"learning_rate": 9.853943104902315e-05,
"loss": 0.0474,
"step": 37000
},
{
"epoch": 1.6986596156267528,
"grad_norm": 0.3194561302661896,
"learning_rate": 9.852019470370253e-05,
"loss": 0.0482,
"step": 37100
},
{
"epoch": 1.7032382129734556,
"grad_norm": 0.6570625305175781,
"learning_rate": 9.850083441255735e-05,
"loss": 0.0457,
"step": 37200
},
{
"epoch": 1.7078168103201583,
"grad_norm": 0.4810948371887207,
"learning_rate": 9.84813502250439e-05,
"loss": 0.0474,
"step": 37300
},
{
"epoch": 1.7123954076668613,
"grad_norm": 0.5983640551567078,
"learning_rate": 9.846174219093491e-05,
"loss": 0.0451,
"step": 37400
},
{
"epoch": 1.7169740050135642,
"grad_norm": 0.4565774202346802,
"learning_rate": 9.844201036031951e-05,
"loss": 0.0436,
"step": 37500
},
{
"epoch": 1.721552602360267,
"grad_norm": 0.4429413974285126,
"learning_rate": 9.842215478360306e-05,
"loss": 0.0415,
"step": 37600
},
{
"epoch": 1.7261311997069697,
"grad_norm": 0.461791068315506,
"learning_rate": 9.840217551150706e-05,
"loss": 0.0436,
"step": 37700
},
{
"epoch": 1.7307097970536724,
"grad_norm": 0.7613334059715271,
"learning_rate": 9.838207259506891e-05,
"loss": 0.0433,
"step": 37800
},
{
"epoch": 1.7352883944003754,
"grad_norm": 0.1547241359949112,
"learning_rate": 9.836184608564198e-05,
"loss": 0.044,
"step": 37900
},
{
"epoch": 1.7398669917470784,
"grad_norm": 0.45752155780792236,
"learning_rate": 9.834149603489526e-05,
"loss": 0.0436,
"step": 38000
},
{
"epoch": 1.7444455890937811,
"grad_norm": 1.1743345260620117,
"learning_rate": 9.832102249481338e-05,
"loss": 0.0443,
"step": 38100
},
{
"epoch": 1.7490241864404839,
"grad_norm": 0.9375355243682861,
"learning_rate": 9.830042551769641e-05,
"loss": 0.0437,
"step": 38200
},
{
"epoch": 1.7536027837871868,
"grad_norm": 0.3472870886325836,
"learning_rate": 9.827970515615977e-05,
"loss": 0.0445,
"step": 38300
},
{
"epoch": 1.7581813811338898,
"grad_norm": 1.0196037292480469,
"learning_rate": 9.825886146313402e-05,
"loss": 0.0452,
"step": 38400
},
{
"epoch": 1.7627599784805925,
"grad_norm": 0.774361789226532,
"learning_rate": 9.82378944918648e-05,
"loss": 0.0465,
"step": 38500
},
{
"epoch": 1.7673385758272953,
"grad_norm": 1.1351568698883057,
"learning_rate": 9.821680429591269e-05,
"loss": 0.0438,
"step": 38600
},
{
"epoch": 1.771917173173998,
"grad_norm": 0.3935364782810211,
"learning_rate": 9.819559092915299e-05,
"loss": 0.0477,
"step": 38700
},
{
"epoch": 1.776495770520701,
"grad_norm": 0.48644939064979553,
"learning_rate": 9.81742544457757e-05,
"loss": 0.0482,
"step": 38800
},
{
"epoch": 1.781074367867404,
"grad_norm": 0.7816250324249268,
"learning_rate": 9.815279490028529e-05,
"loss": 0.0418,
"step": 38900
},
{
"epoch": 1.7856529652141067,
"grad_norm": 0.9440283179283142,
"learning_rate": 9.81312123475006e-05,
"loss": 0.0445,
"step": 39000
},
{
"epoch": 1.7902315625608094,
"grad_norm": 0.5908809304237366,
"learning_rate": 9.810950684255473e-05,
"loss": 0.0467,
"step": 39100
},
{
"epoch": 1.7948101599075124,
"grad_norm": 0.8200555443763733,
"learning_rate": 9.80876784408948e-05,
"loss": 0.044,
"step": 39200
},
{
"epoch": 1.7993887572542153,
"grad_norm": 0.513478696346283,
"learning_rate": 9.806572719828193e-05,
"loss": 0.0437,
"step": 39300
},
{
"epoch": 1.803967354600918,
"grad_norm": 0.34043118357658386,
"learning_rate": 9.8043653170791e-05,
"loss": 0.0412,
"step": 39400
},
{
"epoch": 1.8085459519476208,
"grad_norm": 0.7160608172416687,
"learning_rate": 9.802145641481056e-05,
"loss": 0.0475,
"step": 39500
},
{
"epoch": 1.8131245492943235,
"grad_norm": 0.7825314998626709,
"learning_rate": 9.799913698704269e-05,
"loss": 0.0475,
"step": 39600
},
{
"epoch": 1.8177031466410265,
"grad_norm": 0.38378453254699707,
"learning_rate": 9.797669494450281e-05,
"loss": 0.0416,
"step": 39700
},
{
"epoch": 1.8222817439877295,
"grad_norm": 0.3362584710121155,
"learning_rate": 9.795413034451959e-05,
"loss": 0.0439,
"step": 39800
},
{
"epoch": 1.8268603413344322,
"grad_norm": 0.42681771516799927,
"learning_rate": 9.793144324473473e-05,
"loss": 0.0458,
"step": 39900
},
{
"epoch": 1.831438938681135,
"grad_norm": 0.7747517824172974,
"learning_rate": 9.790863370310293e-05,
"loss": 0.0442,
"step": 40000
},
{
"epoch": 1.836017536027838,
"grad_norm": 0.481751024723053,
"learning_rate": 9.788570177789158e-05,
"loss": 0.0479,
"step": 40100
},
{
"epoch": 1.8405961333745409,
"grad_norm": 0.4065397381782532,
"learning_rate": 9.78626475276808e-05,
"loss": 0.0433,
"step": 40200
},
{
"epoch": 1.8451747307212436,
"grad_norm": 0.47623851895332336,
"learning_rate": 9.78394710113631e-05,
"loss": 0.043,
"step": 40300
},
{
"epoch": 1.8497533280679463,
"grad_norm": 0.5850650668144226,
"learning_rate": 9.781617228814339e-05,
"loss": 0.0413,
"step": 40400
},
{
"epoch": 1.854331925414649,
"grad_norm": 0.5443374514579773,
"learning_rate": 9.77927514175387e-05,
"loss": 0.044,
"step": 40500
},
{
"epoch": 1.858910522761352,
"grad_norm": 0.5081647634506226,
"learning_rate": 9.776920845937816e-05,
"loss": 0.0417,
"step": 40600
},
{
"epoch": 1.863489120108055,
"grad_norm": 0.9533047080039978,
"learning_rate": 9.774554347380271e-05,
"loss": 0.0438,
"step": 40700
},
{
"epoch": 1.8680677174547577,
"grad_norm": 0.7583338618278503,
"learning_rate": 9.772175652126503e-05,
"loss": 0.0437,
"step": 40800
},
{
"epoch": 1.8726463148014605,
"grad_norm": 0.6980351209640503,
"learning_rate": 9.769784766252941e-05,
"loss": 0.0453,
"step": 40900
},
{
"epoch": 1.8772249121481634,
"grad_norm": 0.523002564907074,
"learning_rate": 9.767381695867149e-05,
"loss": 0.0436,
"step": 41000
},
{
"epoch": 1.8818035094948664,
"grad_norm": 0.20366418361663818,
"learning_rate": 9.764966447107819e-05,
"loss": 0.0436,
"step": 41100
},
{
"epoch": 1.8863821068415692,
"grad_norm": 0.5247259140014648,
"learning_rate": 9.762539026144755e-05,
"loss": 0.0436,
"step": 41200
},
{
"epoch": 1.890960704188272,
"grad_norm": 0.7223037481307983,
"learning_rate": 9.760099439178852e-05,
"loss": 0.0433,
"step": 41300
},
{
"epoch": 1.8955393015349746,
"grad_norm": 0.8085638880729675,
"learning_rate": 9.757647692442083e-05,
"loss": 0.0438,
"step": 41400
},
{
"epoch": 1.9001178988816776,
"grad_norm": 0.360061913728714,
"learning_rate": 9.755183792197486e-05,
"loss": 0.0437,
"step": 41500
},
{
"epoch": 1.9046964962283806,
"grad_norm": 0.33270975947380066,
"learning_rate": 9.752707744739145e-05,
"loss": 0.0417,
"step": 41600
},
{
"epoch": 1.9092750935750833,
"grad_norm": 0.40628722310066223,
"learning_rate": 9.750219556392175e-05,
"loss": 0.0442,
"step": 41700
},
{
"epoch": 1.913853690921786,
"grad_norm": 0.41940930485725403,
"learning_rate": 9.7477192335127e-05,
"loss": 0.0401,
"step": 41800
},
{
"epoch": 1.918432288268489,
"grad_norm": 0.41912841796875,
"learning_rate": 9.74520678248785e-05,
"loss": 0.0444,
"step": 41900
},
{
"epoch": 1.9230108856151917,
"grad_norm": 0.5386761426925659,
"learning_rate": 9.742682209735727e-05,
"loss": 0.0419,
"step": 42000
},
{
"epoch": 1.9275894829618947,
"grad_norm": 0.4845391809940338,
"learning_rate": 9.74014552170541e-05,
"loss": 0.0423,
"step": 42100
},
{
"epoch": 1.9321680803085974,
"grad_norm": 0.7683694958686829,
"learning_rate": 9.737596724876914e-05,
"loss": 0.0423,
"step": 42200
},
{
"epoch": 1.9367466776553002,
"grad_norm": 0.3862452805042267,
"learning_rate": 9.735035825761197e-05,
"loss": 0.0391,
"step": 42300
},
{
"epoch": 1.9413252750020031,
"grad_norm": 0.34973305463790894,
"learning_rate": 9.732462830900124e-05,
"loss": 0.0421,
"step": 42400
},
{
"epoch": 1.945903872348706,
"grad_norm": 0.6201797127723694,
"learning_rate": 9.729877746866465e-05,
"loss": 0.044,
"step": 42500
},
{
"epoch": 1.9504824696954088,
"grad_norm": 1.0637577772140503,
"learning_rate": 9.72728058026387e-05,
"loss": 0.0445,
"step": 42600
},
{
"epoch": 1.9550610670421116,
"grad_norm": 0.4952399432659149,
"learning_rate": 9.724671337726854e-05,
"loss": 0.0428,
"step": 42700
},
{
"epoch": 1.9596396643888145,
"grad_norm": 0.8040750026702881,
"learning_rate": 9.722050025920778e-05,
"loss": 0.0422,
"step": 42800
},
{
"epoch": 1.9642182617355173,
"grad_norm": 0.5028950572013855,
"learning_rate": 9.719416651541839e-05,
"loss": 0.0435,
"step": 42900
},
{
"epoch": 1.9687968590822202,
"grad_norm": 0.3617078959941864,
"learning_rate": 9.716771221317042e-05,
"loss": 0.0414,
"step": 43000
},
{
"epoch": 1.973375456428923,
"grad_norm": 0.6627247929573059,
"learning_rate": 9.714113742004198e-05,
"loss": 0.0442,
"step": 43100
},
{
"epoch": 1.9779540537756257,
"grad_norm": 0.5225775241851807,
"learning_rate": 9.711444220391886e-05,
"loss": 0.041,
"step": 43200
},
{
"epoch": 1.9825326511223287,
"grad_norm": 0.44373607635498047,
"learning_rate": 9.708762663299456e-05,
"loss": 0.0498,
"step": 43300
},
{
"epoch": 1.9871112484690316,
"grad_norm": 0.6220275163650513,
"learning_rate": 9.706069077577001e-05,
"loss": 0.0431,
"step": 43400
},
{
"epoch": 1.9916898458157344,
"grad_norm": 0.14181743562221527,
"learning_rate": 9.703363470105338e-05,
"loss": 0.0405,
"step": 43500
},
{
"epoch": 1.9962684431624371,
"grad_norm": 0.5560967922210693,
"learning_rate": 9.700645847796e-05,
"loss": 0.0393,
"step": 43600
},
{
"epoch": 1.9999771070132666,
"eval_loss": 0.16101938486099243,
"eval_runtime": 260.5165,
"eval_samples_per_second": 21.112,
"eval_steps_per_second": 21.112,
"step": 43681
},
{
"epoch": 2.00084704050914,
"grad_norm": 0.33940353989601135,
"learning_rate": 9.697916217591206e-05,
"loss": 0.0412,
"step": 43700
},
{
"epoch": 2.005425637855843,
"grad_norm": 0.5346599817276001,
"learning_rate": 9.695174586463848e-05,
"loss": 0.0324,
"step": 43800
},
{
"epoch": 2.010004235202546,
"grad_norm": 0.33371779322624207,
"learning_rate": 9.692420961417488e-05,
"loss": 0.0289,
"step": 43900
},
{
"epoch": 2.0145828325492485,
"grad_norm": 0.7601104974746704,
"learning_rate": 9.689655349486309e-05,
"loss": 0.0301,
"step": 44000
},
{
"epoch": 2.0191614298959513,
"grad_norm": 1.0337655544281006,
"learning_rate": 9.686877757735127e-05,
"loss": 0.0308,
"step": 44100
},
{
"epoch": 2.023740027242654,
"grad_norm": 0.39981570839881897,
"learning_rate": 9.684088193259355e-05,
"loss": 0.03,
"step": 44200
},
{
"epoch": 2.028318624589357,
"grad_norm": 0.6537315845489502,
"learning_rate": 9.681286663184994e-05,
"loss": 0.0319,
"step": 44300
},
{
"epoch": 2.03289722193606,
"grad_norm": 0.4220235347747803,
"learning_rate": 9.678473174668606e-05,
"loss": 0.03,
"step": 44400
},
{
"epoch": 2.0374758192827627,
"grad_norm": 0.20741093158721924,
"learning_rate": 9.675647734897309e-05,
"loss": 0.0315,
"step": 44500
},
{
"epoch": 2.0420544166294654,
"grad_norm": 0.46414992213249207,
"learning_rate": 9.672810351088743e-05,
"loss": 0.0304,
"step": 44600
},
{
"epoch": 2.0466330139761686,
"grad_norm": 0.4579373002052307,
"learning_rate": 9.669961030491064e-05,
"loss": 0.0312,
"step": 44700
},
{
"epoch": 2.0512116113228713,
"grad_norm": 0.7499117851257324,
"learning_rate": 9.66709978038292e-05,
"loss": 0.0306,
"step": 44800
},
{
"epoch": 2.055790208669574,
"grad_norm": 0.2862129807472229,
"learning_rate": 9.664226608073431e-05,
"loss": 0.031,
"step": 44900
},
{
"epoch": 2.060368806016277,
"grad_norm": 0.1198749765753746,
"learning_rate": 9.661341520902176e-05,
"loss": 0.0335,
"step": 45000
},
{
"epoch": 2.0649474033629795,
"grad_norm": 0.32276931405067444,
"learning_rate": 9.658444526239168e-05,
"loss": 0.0308,
"step": 45100
},
{
"epoch": 2.0695260007096827,
"grad_norm": 0.30404484272003174,
"learning_rate": 9.655535631484838e-05,
"loss": 0.0293,
"step": 45200
},
{
"epoch": 2.0741045980563855,
"grad_norm": 0.4213615357875824,
"learning_rate": 9.652614844070018e-05,
"loss": 0.0314,
"step": 45300
},
{
"epoch": 2.078683195403088,
"grad_norm": 0.26582449674606323,
"learning_rate": 9.64968217145592e-05,
"loss": 0.035,
"step": 45400
},
{
"epoch": 2.083261792749791,
"grad_norm": 0.462671160697937,
"learning_rate": 9.646737621134112e-05,
"loss": 0.0313,
"step": 45500
},
{
"epoch": 2.087840390096494,
"grad_norm": 0.7931702733039856,
"learning_rate": 9.643781200626511e-05,
"loss": 0.0341,
"step": 45600
},
{
"epoch": 2.092418987443197,
"grad_norm": 0.7540990710258484,
"learning_rate": 9.640812917485353e-05,
"loss": 0.03,
"step": 45700
},
{
"epoch": 2.0969975847898996,
"grad_norm": 0.23272991180419922,
"learning_rate": 9.637832779293177e-05,
"loss": 0.0282,
"step": 45800
},
{
"epoch": 2.1015761821366024,
"grad_norm": 0.4618046283721924,
"learning_rate": 9.634840793662807e-05,
"loss": 0.0345,
"step": 45900
},
{
"epoch": 2.106154779483305,
"grad_norm": 0.5170308947563171,
"learning_rate": 9.63183696823733e-05,
"loss": 0.0301,
"step": 46000
},
{
"epoch": 2.1107333768300083,
"grad_norm": 0.4604352116584778,
"learning_rate": 9.628821310690082e-05,
"loss": 0.0304,
"step": 46100
},
{
"epoch": 2.115311974176711,
"grad_norm": 0.5687543749809265,
"learning_rate": 9.625793828724618e-05,
"loss": 0.0326,
"step": 46200
},
{
"epoch": 2.1198905715234138,
"grad_norm": 0.6474491357803345,
"learning_rate": 9.622754530074705e-05,
"loss": 0.0323,
"step": 46300
},
{
"epoch": 2.1244691688701165,
"grad_norm": 0.3418339490890503,
"learning_rate": 9.619703422504291e-05,
"loss": 0.0311,
"step": 46400
},
{
"epoch": 2.1290477662168197,
"grad_norm": 0.542149543762207,
"learning_rate": 9.616640513807493e-05,
"loss": 0.0302,
"step": 46500
},
{
"epoch": 2.1336263635635224,
"grad_norm": 0.3595920205116272,
"learning_rate": 9.613565811808576e-05,
"loss": 0.033,
"step": 46600
},
{
"epoch": 2.138204960910225,
"grad_norm": 0.30227652192115784,
"learning_rate": 9.610479324361926e-05,
"loss": 0.0333,
"step": 46700
},
{
"epoch": 2.142783558256928,
"grad_norm": 0.3367413580417633,
"learning_rate": 9.607381059352038e-05,
"loss": 0.0291,
"step": 46800
},
{
"epoch": 2.1473621556036306,
"grad_norm": 0.33046597242355347,
"learning_rate": 9.604271024693495e-05,
"loss": 0.0324,
"step": 46900
},
{
"epoch": 2.151940752950334,
"grad_norm": 0.3185320794582367,
"learning_rate": 9.601149228330944e-05,
"loss": 0.03,
"step": 47000
},
{
"epoch": 2.1565193502970366,
"grad_norm": 0.4530138075351715,
"learning_rate": 9.598015678239074e-05,
"loss": 0.0301,
"step": 47100
},
{
"epoch": 2.1610979476437393,
"grad_norm": 1.8580175638198853,
"learning_rate": 9.594870382422604e-05,
"loss": 0.0279,
"step": 47200
},
{
"epoch": 2.165676544990442,
"grad_norm": 0.7226896286010742,
"learning_rate": 9.591713348916258e-05,
"loss": 0.0376,
"step": 47300
},
{
"epoch": 2.170255142337145,
"grad_norm": 0.5682775974273682,
"learning_rate": 9.588544585784741e-05,
"loss": 0.0319,
"step": 47400
},
{
"epoch": 2.174833739683848,
"grad_norm": 0.35570502281188965,
"learning_rate": 9.585364101122723e-05,
"loss": 0.0323,
"step": 47500
},
{
"epoch": 2.1794123370305507,
"grad_norm": 0.34631285071372986,
"learning_rate": 9.582171903054816e-05,
"loss": 0.0349,
"step": 47600
},
{
"epoch": 2.1839909343772534,
"grad_norm": 0.6178816556930542,
"learning_rate": 9.578967999735556e-05,
"loss": 0.0309,
"step": 47700
},
{
"epoch": 2.188569531723956,
"grad_norm": 0.37293490767478943,
"learning_rate": 9.575752399349378e-05,
"loss": 0.0288,
"step": 47800
},
{
"epoch": 2.1931481290706594,
"grad_norm": 0.6338769197463989,
"learning_rate": 9.572525110110601e-05,
"loss": 0.0321,
"step": 47900
},
{
"epoch": 2.197726726417362,
"grad_norm": 0.19380028545856476,
"learning_rate": 9.569286140263399e-05,
"loss": 0.0306,
"step": 48000
},
{
"epoch": 2.202305323764065,
"grad_norm": 0.3065268099308014,
"learning_rate": 9.566035498081784e-05,
"loss": 0.03,
"step": 48100
},
{
"epoch": 2.2068839211107676,
"grad_norm": 0.29010531306266785,
"learning_rate": 9.562773191869594e-05,
"loss": 0.0327,
"step": 48200
},
{
"epoch": 2.2114625184574708,
"grad_norm": 0.40536558628082275,
"learning_rate": 9.559499229960451e-05,
"loss": 0.028,
"step": 48300
},
{
"epoch": 2.2160411158041735,
"grad_norm": 0.30061614513397217,
"learning_rate": 9.55621362071776e-05,
"loss": 0.0306,
"step": 48400
},
{
"epoch": 2.2206197131508763,
"grad_norm": 0.5350512266159058,
"learning_rate": 9.552916372534674e-05,
"loss": 0.0301,
"step": 48500
},
{
"epoch": 2.225198310497579,
"grad_norm": 0.4163435101509094,
"learning_rate": 9.549607493834085e-05,
"loss": 0.0333,
"step": 48600
},
{
"epoch": 2.2297769078442817,
"grad_norm": 0.6648384928703308,
"learning_rate": 9.546286993068588e-05,
"loss": 0.0323,
"step": 48700
},
{
"epoch": 2.234355505190985,
"grad_norm": 0.3643403947353363,
"learning_rate": 9.54295487872047e-05,
"loss": 0.0287,
"step": 48800
},
{
"epoch": 2.2389341025376877,
"grad_norm": 0.8857894539833069,
"learning_rate": 9.539611159301684e-05,
"loss": 0.0299,
"step": 48900
},
{
"epoch": 2.2435126998843904,
"grad_norm": 0.4569896459579468,
"learning_rate": 9.536255843353832e-05,
"loss": 0.0317,
"step": 49000
},
{
"epoch": 2.248091297231093,
"grad_norm": 0.46430703997612,
"learning_rate": 9.532888939448134e-05,
"loss": 0.0342,
"step": 49100
},
{
"epoch": 2.252669894577796,
"grad_norm": 0.4034232795238495,
"learning_rate": 9.529510456185417e-05,
"loss": 0.0316,
"step": 49200
},
{
"epoch": 2.257248491924499,
"grad_norm": 0.5079818964004517,
"learning_rate": 9.526120402196083e-05,
"loss": 0.0302,
"step": 49300
},
{
"epoch": 2.261827089271202,
"grad_norm": 0.4281846880912781,
"learning_rate": 9.522718786140097e-05,
"loss": 0.0328,
"step": 49400
},
{
"epoch": 2.2664056866179045,
"grad_norm": 1.395179033279419,
"learning_rate": 9.519305616706953e-05,
"loss": 0.0321,
"step": 49500
},
{
"epoch": 2.2709842839646073,
"grad_norm": 0.22532618045806885,
"learning_rate": 9.515880902615661e-05,
"loss": 0.0323,
"step": 49600
},
{
"epoch": 2.2755628813113105,
"grad_norm": 0.2474541962146759,
"learning_rate": 9.512444652614728e-05,
"loss": 0.0318,
"step": 49700
},
{
"epoch": 2.280141478658013,
"grad_norm": 0.2567112445831299,
"learning_rate": 9.50899687548212e-05,
"loss": 0.0334,
"step": 49800
},
{
"epoch": 2.284720076004716,
"grad_norm": 0.446916401386261,
"learning_rate": 9.505537580025256e-05,
"loss": 0.0314,
"step": 49900
},
{
"epoch": 2.2892986733514187,
"grad_norm": 0.4602959454059601,
"learning_rate": 9.502066775080976e-05,
"loss": 0.0287,
"step": 50000
},
{
"epoch": 2.293877270698122,
"grad_norm": 0.16146403551101685,
"learning_rate": 9.49858446951552e-05,
"loss": 0.0304,
"step": 50100
},
{
"epoch": 2.2984558680448246,
"grad_norm": 0.5386151075363159,
"learning_rate": 9.495090672224511e-05,
"loss": 0.0312,
"step": 50200
},
{
"epoch": 2.3030344653915273,
"grad_norm": 0.4908753037452698,
"learning_rate": 9.491585392132924e-05,
"loss": 0.0266,
"step": 50300
},
{
"epoch": 2.30761306273823,
"grad_norm": 0.2822779715061188,
"learning_rate": 9.48806863819507e-05,
"loss": 0.031,
"step": 50400
},
{
"epoch": 2.312191660084933,
"grad_norm": 0.4930579364299774,
"learning_rate": 9.484540419394568e-05,
"loss": 0.0264,
"step": 50500
},
{
"epoch": 2.316770257431636,
"grad_norm": 0.3992035686969757,
"learning_rate": 9.481000744744321e-05,
"loss": 0.0331,
"step": 50600
},
{
"epoch": 2.3213488547783387,
"grad_norm": 0.8977420926094055,
"learning_rate": 9.477449623286505e-05,
"loss": 0.0294,
"step": 50700
},
{
"epoch": 2.3259274521250415,
"grad_norm": 0.26158419251441956,
"learning_rate": 9.473887064092531e-05,
"loss": 0.0332,
"step": 50800
},
{
"epoch": 2.3305060494717442,
"grad_norm": 0.4764149487018585,
"learning_rate": 9.470313076263025e-05,
"loss": 0.0335,
"step": 50900
},
{
"epoch": 2.335084646818447,
"grad_norm": 0.21730680763721466,
"learning_rate": 9.466727668927816e-05,
"loss": 0.0285,
"step": 51000
},
{
"epoch": 2.33966324416515,
"grad_norm": 0.7260767817497253,
"learning_rate": 9.463130851245898e-05,
"loss": 0.0336,
"step": 51100
},
{
"epoch": 2.344241841511853,
"grad_norm": 0.688016951084137,
"learning_rate": 9.459522632405415e-05,
"loss": 0.0291,
"step": 51200
},
{
"epoch": 2.3488204388585556,
"grad_norm": 0.172237828373909,
"learning_rate": 9.455903021623637e-05,
"loss": 0.0287,
"step": 51300
},
{
"epoch": 2.3533990362052584,
"grad_norm": 0.42502665519714355,
"learning_rate": 9.452272028146932e-05,
"loss": 0.0304,
"step": 51400
},
{
"epoch": 2.3579776335519615,
"grad_norm": 0.6109219193458557,
"learning_rate": 9.448629661250745e-05,
"loss": 0.0353,
"step": 51500
},
{
"epoch": 2.3625562308986643,
"grad_norm": 0.6103388071060181,
"learning_rate": 9.444975930239581e-05,
"loss": 0.0293,
"step": 51600
},
{
"epoch": 2.367134828245367,
"grad_norm": 0.6018409132957458,
"learning_rate": 9.441310844446965e-05,
"loss": 0.0349,
"step": 51700
},
{
"epoch": 2.3717134255920698,
"grad_norm": 0.3021933436393738,
"learning_rate": 9.437634413235436e-05,
"loss": 0.0294,
"step": 51800
},
{
"epoch": 2.376292022938773,
"grad_norm": 0.42470723390579224,
"learning_rate": 9.433946645996514e-05,
"loss": 0.0296,
"step": 51900
},
{
"epoch": 2.3808706202854757,
"grad_norm": 0.3379852771759033,
"learning_rate": 9.430247552150673e-05,
"loss": 0.0294,
"step": 52000
},
{
"epoch": 2.3854492176321784,
"grad_norm": 0.22888457775115967,
"learning_rate": 9.426537141147322e-05,
"loss": 0.0286,
"step": 52100
},
{
"epoch": 2.390027814978881,
"grad_norm": 0.5915461778640747,
"learning_rate": 9.422815422464786e-05,
"loss": 0.0353,
"step": 52200
},
{
"epoch": 2.394606412325584,
"grad_norm": 0.4212239980697632,
"learning_rate": 9.419082405610267e-05,
"loss": 0.0293,
"step": 52300
},
{
"epoch": 2.399185009672287,
"grad_norm": 0.21963568031787872,
"learning_rate": 9.415338100119833e-05,
"loss": 0.0291,
"step": 52400
},
{
"epoch": 2.40376360701899,
"grad_norm": 0.40482693910598755,
"learning_rate": 9.41158251555839e-05,
"loss": 0.0299,
"step": 52500
},
{
"epoch": 2.4083422043656926,
"grad_norm": 1.0071722269058228,
"learning_rate": 9.407815661519655e-05,
"loss": 0.0272,
"step": 52600
},
{
"epoch": 2.4129208017123953,
"grad_norm": 0.9266312718391418,
"learning_rate": 9.404037547626134e-05,
"loss": 0.0292,
"step": 52700
},
{
"epoch": 2.417499399059098,
"grad_norm": 0.9991750121116638,
"learning_rate": 9.400248183529093e-05,
"loss": 0.0341,
"step": 52800
},
{
"epoch": 2.4220779964058012,
"grad_norm": 0.4451786279678345,
"learning_rate": 9.396447578908543e-05,
"loss": 0.0308,
"step": 52900
},
{
"epoch": 2.426656593752504,
"grad_norm": 0.7537618279457092,
"learning_rate": 9.392635743473204e-05,
"loss": 0.0335,
"step": 53000
},
{
"epoch": 2.4312351910992067,
"grad_norm": 0.3898552358150482,
"learning_rate": 9.388812686960486e-05,
"loss": 0.0303,
"step": 53100
},
{
"epoch": 2.4358137884459095,
"grad_norm": 0.1613057404756546,
"learning_rate": 9.384978419136468e-05,
"loss": 0.0319,
"step": 53200
},
{
"epoch": 2.440392385792612,
"grad_norm": 0.34397152066230774,
"learning_rate": 9.381132949795861e-05,
"loss": 0.0343,
"step": 53300
},
{
"epoch": 2.4449709831393154,
"grad_norm": 0.38366371393203735,
"learning_rate": 9.377276288761997e-05,
"loss": 0.0296,
"step": 53400
},
{
"epoch": 2.449549580486018,
"grad_norm": 0.15570569038391113,
"learning_rate": 9.373408445886798e-05,
"loss": 0.0294,
"step": 53500
},
{
"epoch": 2.454128177832721,
"grad_norm": 0.2775089144706726,
"learning_rate": 9.369529431050743e-05,
"loss": 0.0301,
"step": 53600
},
{
"epoch": 2.4587067751794236,
"grad_norm": 0.20707450807094574,
"learning_rate": 9.365639254162854e-05,
"loss": 0.0301,
"step": 53700
},
{
"epoch": 2.463285372526127,
"grad_norm": 0.16948607563972473,
"learning_rate": 9.36173792516067e-05,
"loss": 0.0329,
"step": 53800
},
{
"epoch": 2.4678639698728295,
"grad_norm": 0.4837573766708374,
"learning_rate": 9.357825454010213e-05,
"loss": 0.0299,
"step": 53900
},
{
"epoch": 2.4724425672195323,
"grad_norm": 0.4705110490322113,
"learning_rate": 9.353901850705972e-05,
"loss": 0.0312,
"step": 54000
},
{
"epoch": 2.477021164566235,
"grad_norm": 0.6251786947250366,
"learning_rate": 9.349967125270871e-05,
"loss": 0.0306,
"step": 54100
},
{
"epoch": 2.481599761912938,
"grad_norm": 0.27536630630493164,
"learning_rate": 9.346021287756246e-05,
"loss": 0.03,
"step": 54200
},
{
"epoch": 2.486178359259641,
"grad_norm": 0.581510066986084,
"learning_rate": 9.342064348241818e-05,
"loss": 0.0326,
"step": 54300
},
{
"epoch": 2.4907569566063437,
"grad_norm": 0.4884732961654663,
"learning_rate": 9.338096316835671e-05,
"loss": 0.0281,
"step": 54400
},
{
"epoch": 2.4953355539530464,
"grad_norm": 0.34184959530830383,
"learning_rate": 9.334117203674219e-05,
"loss": 0.0308,
"step": 54500
},
{
"epoch": 2.499914151299749,
"grad_norm": 1.2529618740081787,
"learning_rate": 9.330127018922194e-05,
"loss": 0.0286,
"step": 54600
},
{
"epoch": 2.5044927486464523,
"grad_norm": 0.3773830831050873,
"learning_rate": 9.326125772772597e-05,
"loss": 0.0313,
"step": 54700
},
{
"epoch": 2.509071345993155,
"grad_norm": 0.5453410744667053,
"learning_rate": 9.322113475446698e-05,
"loss": 0.029,
"step": 54800
},
{
"epoch": 2.513649943339858,
"grad_norm": 0.4246394634246826,
"learning_rate": 9.318090137193988e-05,
"loss": 0.0333,
"step": 54900
},
{
"epoch": 2.5182285406865605,
"grad_norm": 0.46837061643600464,
"learning_rate": 9.314055768292169e-05,
"loss": 0.0311,
"step": 55000
},
{
"epoch": 2.5228071380332633,
"grad_norm": 0.31000879406929016,
"learning_rate": 9.310010379047119e-05,
"loss": 0.0288,
"step": 55100
},
{
"epoch": 2.5273857353799665,
"grad_norm": 0.36738067865371704,
"learning_rate": 9.305953979792865e-05,
"loss": 0.0318,
"step": 55200
},
{
"epoch": 2.531964332726669,
"grad_norm": 0.29930517077445984,
"learning_rate": 9.301886580891562e-05,
"loss": 0.0285,
"step": 55300
},
{
"epoch": 2.536542930073372,
"grad_norm": 0.22497807443141937,
"learning_rate": 9.297808192733464e-05,
"loss": 0.0283,
"step": 55400
},
{
"epoch": 2.541121527420075,
"grad_norm": 0.6719942688941956,
"learning_rate": 9.293718825736897e-05,
"loss": 0.0283,
"step": 55500
},
{
"epoch": 2.5457001247667774,
"grad_norm": 0.32624194025993347,
"learning_rate": 9.289618490348228e-05,
"loss": 0.0309,
"step": 55600
},
{
"epoch": 2.5502787221134806,
"grad_norm": 0.5497521162033081,
"learning_rate": 9.285507197041853e-05,
"loss": 0.0288,
"step": 55700
},
{
"epoch": 2.5548573194601834,
"grad_norm": 0.8471511006355286,
"learning_rate": 9.281384956320153e-05,
"loss": 0.0302,
"step": 55800
},
{
"epoch": 2.559435916806886,
"grad_norm": 0.40366891026496887,
"learning_rate": 9.277251778713474e-05,
"loss": 0.0279,
"step": 55900
},
{
"epoch": 2.5640145141535893,
"grad_norm": 0.10733508318662643,
"learning_rate": 9.273107674780102e-05,
"loss": 0.0285,
"step": 56000
},
{
"epoch": 2.568593111500292,
"grad_norm": 0.2292618602514267,
"learning_rate": 9.268952655106236e-05,
"loss": 0.0266,
"step": 56100
},
{
"epoch": 2.5731717088469948,
"grad_norm": 0.4450601637363434,
"learning_rate": 9.26478673030596e-05,
"loss": 0.0297,
"step": 56200
},
{
"epoch": 2.5777503061936975,
"grad_norm": 1.0813257694244385,
"learning_rate": 9.260609911021209e-05,
"loss": 0.0319,
"step": 56300
},
{
"epoch": 2.5823289035404002,
"grad_norm": 0.3026310205459595,
"learning_rate": 9.256422207921757e-05,
"loss": 0.0315,
"step": 56400
},
{
"epoch": 2.5869075008871034,
"grad_norm": 0.23144447803497314,
"learning_rate": 9.252223631705175e-05,
"loss": 0.0294,
"step": 56500
},
{
"epoch": 2.591486098233806,
"grad_norm": 0.38160964846611023,
"learning_rate": 9.248014193096811e-05,
"loss": 0.031,
"step": 56600
},
{
"epoch": 2.596064695580509,
"grad_norm": 0.2660236060619354,
"learning_rate": 9.243793902849763e-05,
"loss": 0.0279,
"step": 56700
},
{
"epoch": 2.6006432929272116,
"grad_norm": 0.7620320320129395,
"learning_rate": 9.239562771744848e-05,
"loss": 0.0318,
"step": 56800
},
{
"epoch": 2.6052218902739144,
"grad_norm": 0.5840933918952942,
"learning_rate": 9.235320810590575e-05,
"loss": 0.0317,
"step": 56900
},
{
"epoch": 2.6098004876206176,
"grad_norm": 0.3403662443161011,
"learning_rate": 9.231068030223122e-05,
"loss": 0.0322,
"step": 57000
},
{
"epoch": 2.6143790849673203,
"grad_norm": 0.2513747811317444,
"learning_rate": 9.226804441506302e-05,
"loss": 0.0295,
"step": 57100
},
{
"epoch": 2.618957682314023,
"grad_norm": 0.433006227016449,
"learning_rate": 9.22253005533154e-05,
"loss": 0.0308,
"step": 57200
},
{
"epoch": 2.623536279660726,
"grad_norm": 0.3026902675628662,
"learning_rate": 9.218244882617842e-05,
"loss": 0.0253,
"step": 57300
},
{
"epoch": 2.6281148770074285,
"grad_norm": 0.4516427516937256,
"learning_rate": 9.213948934311767e-05,
"loss": 0.0295,
"step": 57400
},
{
"epoch": 2.6326934743541317,
"grad_norm": 0.26671695709228516,
"learning_rate": 9.209642221387405e-05,
"loss": 0.028,
"step": 57500
},
{
"epoch": 2.6372720717008344,
"grad_norm": 0.5790793299674988,
"learning_rate": 9.20532475484634e-05,
"loss": 0.0301,
"step": 57600
},
{
"epoch": 2.641850669047537,
"grad_norm": 0.37218374013900757,
"learning_rate": 9.200996545717629e-05,
"loss": 0.0302,
"step": 57700
},
{
"epoch": 2.6464292663942404,
"grad_norm": 0.5596415400505066,
"learning_rate": 9.196657605057769e-05,
"loss": 0.0332,
"step": 57800
},
{
"epoch": 2.651007863740943,
"grad_norm": 0.6026178002357483,
"learning_rate": 9.192307943950675e-05,
"loss": 0.0297,
"step": 57900
},
{
"epoch": 2.655586461087646,
"grad_norm": 0.13471604883670807,
"learning_rate": 9.187947573507642e-05,
"loss": 0.031,
"step": 58000
},
{
"epoch": 2.6601650584343486,
"grad_norm": 0.7578465342521667,
"learning_rate": 9.183576504867327e-05,
"loss": 0.0255,
"step": 58100
},
{
"epoch": 2.6647436557810513,
"grad_norm": 0.32717058062553406,
"learning_rate": 9.179194749195713e-05,
"loss": 0.0295,
"step": 58200
},
{
"epoch": 2.6693222531277545,
"grad_norm": 0.4371168911457062,
"learning_rate": 9.174802317686084e-05,
"loss": 0.0297,
"step": 58300
},
{
"epoch": 2.6739008504744572,
"grad_norm": 0.3458854854106903,
"learning_rate": 9.170399221558995e-05,
"loss": 0.0252,
"step": 58400
},
{
"epoch": 2.67847944782116,
"grad_norm": 0.8889488577842712,
"learning_rate": 9.165985472062246e-05,
"loss": 0.0292,
"step": 58500
},
{
"epoch": 2.6830580451678627,
"grad_norm": 0.6179521679878235,
"learning_rate": 9.161561080470847e-05,
"loss": 0.0304,
"step": 58600
},
{
"epoch": 2.6876366425145655,
"grad_norm": 0.3913422226905823,
"learning_rate": 9.157126058087e-05,
"loss": 0.0321,
"step": 58700
},
{
"epoch": 2.6922152398612686,
"grad_norm": 0.31714576482772827,
"learning_rate": 9.152680416240059e-05,
"loss": 0.0306,
"step": 58800
},
{
"epoch": 2.6967938372079714,
"grad_norm": 0.16598474979400635,
"learning_rate": 9.148224166286506e-05,
"loss": 0.0308,
"step": 58900
},
{
"epoch": 2.701372434554674,
"grad_norm": 0.4751458466053009,
"learning_rate": 9.14375731960992e-05,
"loss": 0.0328,
"step": 59000
},
{
"epoch": 2.705951031901377,
"grad_norm": 0.8825288414955139,
"learning_rate": 9.139279887620955e-05,
"loss": 0.0288,
"step": 59100
},
{
"epoch": 2.7105296292480796,
"grad_norm": 0.4172840714454651,
"learning_rate": 9.1347918817573e-05,
"loss": 0.0301,
"step": 59200
},
{
"epoch": 2.715108226594783,
"grad_norm": 0.3465460538864136,
"learning_rate": 9.13029331348366e-05,
"loss": 0.0252,
"step": 59300
},
{
"epoch": 2.7196868239414855,
"grad_norm": 1.264923095703125,
"learning_rate": 9.125784194291717e-05,
"loss": 0.0272,
"step": 59400
},
{
"epoch": 2.7242654212881883,
"grad_norm": 0.2547473907470703,
"learning_rate": 9.121264535700107e-05,
"loss": 0.0332,
"step": 59500
},
{
"epoch": 2.7288440186348915,
"grad_norm": 0.508148193359375,
"learning_rate": 9.116734349254393e-05,
"loss": 0.0317,
"step": 59600
},
{
"epoch": 2.733422615981594,
"grad_norm": 0.6783300638198853,
"learning_rate": 9.112193646527024e-05,
"loss": 0.0331,
"step": 59700
},
{
"epoch": 2.738001213328297,
"grad_norm": 0.16436424851417542,
"learning_rate": 9.107642439117321e-05,
"loss": 0.0288,
"step": 59800
},
{
"epoch": 2.7425798106749997,
"grad_norm": 0.4682653546333313,
"learning_rate": 9.103080738651434e-05,
"loss": 0.0287,
"step": 59900
},
{
"epoch": 2.7471584080217024,
"grad_norm": 0.6873565912246704,
"learning_rate": 9.09850855678232e-05,
"loss": 0.0337,
"step": 60000
},
{
"epoch": 2.7517370053684056,
"grad_norm": 0.6117233037948608,
"learning_rate": 9.093925905189713e-05,
"loss": 0.0298,
"step": 60100
},
{
"epoch": 2.7563156027151083,
"grad_norm": 0.17423506081104279,
"learning_rate": 9.089332795580086e-05,
"loss": 0.03,
"step": 60200
},
{
"epoch": 2.760894200061811,
"grad_norm": 0.5828815698623657,
"learning_rate": 9.084729239686633e-05,
"loss": 0.0289,
"step": 60300
},
{
"epoch": 2.765472797408514,
"grad_norm": 0.2698822021484375,
"learning_rate": 9.080115249269232e-05,
"loss": 0.0298,
"step": 60400
},
{
"epoch": 2.7700513947552166,
"grad_norm": 0.5367493629455566,
"learning_rate": 9.075490836114413e-05,
"loss": 0.0322,
"step": 60500
},
{
"epoch": 2.7746299921019197,
"grad_norm": 0.4073825478553772,
"learning_rate": 9.070856012035336e-05,
"loss": 0.0292,
"step": 60600
},
{
"epoch": 2.7792085894486225,
"grad_norm": 0.22106589376926422,
"learning_rate": 9.066210788871751e-05,
"loss": 0.0308,
"step": 60700
},
{
"epoch": 2.783787186795325,
"grad_norm": 0.575246274471283,
"learning_rate": 9.061555178489978e-05,
"loss": 0.0284,
"step": 60800
},
{
"epoch": 2.788365784142028,
"grad_norm": 0.44034871459007263,
"learning_rate": 9.056889192782866e-05,
"loss": 0.0277,
"step": 60900
},
{
"epoch": 2.7929443814887307,
"grad_norm": 0.2914714217185974,
"learning_rate": 9.05221284366977e-05,
"loss": 0.0298,
"step": 61000
},
{
"epoch": 2.797522978835434,
"grad_norm": 0.411410391330719,
"learning_rate": 9.04752614309652e-05,
"loss": 0.0256,
"step": 61100
},
{
"epoch": 2.8021015761821366,
"grad_norm": 0.172648623585701,
"learning_rate": 9.04282910303539e-05,
"loss": 0.0326,
"step": 61200
},
{
"epoch": 2.8066801735288394,
"grad_norm": 0.279862642288208,
"learning_rate": 9.038121735485062e-05,
"loss": 0.0275,
"step": 61300
},
{
"epoch": 2.8112587708755425,
"grad_norm": 0.2992120385169983,
"learning_rate": 9.033404052470602e-05,
"loss": 0.0287,
"step": 61400
},
{
"epoch": 2.815837368222245,
"grad_norm": 0.3917059004306793,
"learning_rate": 9.028676066043428e-05,
"loss": 0.0316,
"step": 61500
},
{
"epoch": 2.820415965568948,
"grad_norm": 0.5848602056503296,
"learning_rate": 9.023937788281278e-05,
"loss": 0.0303,
"step": 61600
},
{
"epoch": 2.8249945629156508,
"grad_norm": 0.4045267701148987,
"learning_rate": 9.019189231288176e-05,
"loss": 0.0282,
"step": 61700
},
{
"epoch": 2.8295731602623535,
"grad_norm": 0.38309866189956665,
"learning_rate": 9.014430407194413e-05,
"loss": 0.0287,
"step": 61800
},
{
"epoch": 2.8341517576090567,
"grad_norm": 0.7173412442207336,
"learning_rate": 9.009661328156498e-05,
"loss": 0.0274,
"step": 61900
},
{
"epoch": 2.8387303549557594,
"grad_norm": 0.37477946281433105,
"learning_rate": 9.00488200635714e-05,
"loss": 0.0303,
"step": 62000
},
{
"epoch": 2.843308952302462,
"grad_norm": 0.26493415236473083,
"learning_rate": 9.000092454005216e-05,
"loss": 0.0289,
"step": 62100
},
{
"epoch": 2.847887549649165,
"grad_norm": 0.15275776386260986,
"learning_rate": 8.995292683335733e-05,
"loss": 0.0304,
"step": 62200
},
{
"epoch": 2.8524661469958676,
"grad_norm": 0.2792358994483948,
"learning_rate": 8.990482706609805e-05,
"loss": 0.0311,
"step": 62300
},
{
"epoch": 2.857044744342571,
"grad_norm": 0.4240334630012512,
"learning_rate": 8.985662536114613e-05,
"loss": 0.0304,
"step": 62400
},
{
"epoch": 2.8616233416892736,
"grad_norm": 0.137941375374794,
"learning_rate": 8.980832184163382e-05,
"loss": 0.0309,
"step": 62500
},
{
"epoch": 2.8662019390359763,
"grad_norm": 0.2340019941329956,
"learning_rate": 8.975991663095344e-05,
"loss": 0.0296,
"step": 62600
},
{
"epoch": 2.870780536382679,
"grad_norm": 0.39523446559906006,
"learning_rate": 8.97114098527571e-05,
"loss": 0.0284,
"step": 62700
},
{
"epoch": 2.875359133729382,
"grad_norm": 0.5535847544670105,
"learning_rate": 8.966280163095633e-05,
"loss": 0.0325,
"step": 62800
},
{
"epoch": 2.879937731076085,
"grad_norm": 0.4570659101009369,
"learning_rate": 8.961409208972182e-05,
"loss": 0.0237,
"step": 62900
},
{
"epoch": 2.8845163284227877,
"grad_norm": 0.5584346055984497,
"learning_rate": 8.95652813534831e-05,
"loss": 0.0358,
"step": 63000
},
{
"epoch": 2.8890949257694905,
"grad_norm": 0.961768388748169,
"learning_rate": 8.951636954692819e-05,
"loss": 0.0299,
"step": 63100
},
{
"epoch": 2.893673523116193,
"grad_norm": 0.24575570225715637,
"learning_rate": 8.94673567950033e-05,
"loss": 0.0282,
"step": 63200
},
{
"epoch": 2.898252120462896,
"grad_norm": 0.32376107573509216,
"learning_rate": 8.941824322291246e-05,
"loss": 0.0263,
"step": 63300
},
{
"epoch": 2.902830717809599,
"grad_norm": 0.20682887732982635,
"learning_rate": 8.936902895611732e-05,
"loss": 0.0313,
"step": 63400
},
{
"epoch": 2.907409315156302,
"grad_norm": 0.29019802808761597,
"learning_rate": 8.931971412033673e-05,
"loss": 0.0327,
"step": 63500
},
{
"epoch": 2.9119879125030046,
"grad_norm": 0.6069703102111816,
"learning_rate": 8.927029884154646e-05,
"loss": 0.0272,
"step": 63600
},
{
"epoch": 2.9165665098497078,
"grad_norm": 0.5670173168182373,
"learning_rate": 8.922078324597879e-05,
"loss": 0.0317,
"step": 63700
},
{
"epoch": 2.9211451071964105,
"grad_norm": 0.29881516098976135,
"learning_rate": 8.917116746012235e-05,
"loss": 0.0283,
"step": 63800
},
{
"epoch": 2.9257237045431133,
"grad_norm": 0.722374439239502,
"learning_rate": 8.91214516107217e-05,
"loss": 0.0295,
"step": 63900
},
{
"epoch": 2.930302301889816,
"grad_norm": 0.4505271315574646,
"learning_rate": 8.907163582477693e-05,
"loss": 0.0282,
"step": 64000
},
{
"epoch": 2.9348808992365187,
"grad_norm": 0.9996728301048279,
"learning_rate": 8.902172022954353e-05,
"loss": 0.0283,
"step": 64100
},
{
"epoch": 2.939459496583222,
"grad_norm": 0.5205316543579102,
"learning_rate": 8.897170495253187e-05,
"loss": 0.0281,
"step": 64200
},
{
"epoch": 2.9440380939299247,
"grad_norm": 0.6521015763282776,
"learning_rate": 8.892159012150701e-05,
"loss": 0.0279,
"step": 64300
},
{
"epoch": 2.9486166912766274,
"grad_norm": 0.8637863397598267,
"learning_rate": 8.88713758644883e-05,
"loss": 0.0277,
"step": 64400
},
{
"epoch": 2.95319528862333,
"grad_norm": 0.9392446875572205,
"learning_rate": 8.88210623097491e-05,
"loss": 0.0256,
"step": 64500
},
{
"epoch": 2.957773885970033,
"grad_norm": 0.23240399360656738,
"learning_rate": 8.877064958581636e-05,
"loss": 0.0276,
"step": 64600
},
{
"epoch": 2.962352483316736,
"grad_norm": 0.5640022158622742,
"learning_rate": 8.872013782147047e-05,
"loss": 0.0294,
"step": 64700
},
{
"epoch": 2.966931080663439,
"grad_norm": 0.254486620426178,
"learning_rate": 8.86695271457447e-05,
"loss": 0.0267,
"step": 64800
},
{
"epoch": 2.9715096780101415,
"grad_norm": 0.4906103014945984,
"learning_rate": 8.86188176879251e-05,
"loss": 0.0279,
"step": 64900
},
{
"epoch": 2.9760882753568443,
"grad_norm": 0.3822503387928009,
"learning_rate": 8.856800957755e-05,
"loss": 0.0299,
"step": 65000
},
{
"epoch": 2.980666872703547,
"grad_norm": 0.4109038710594177,
"learning_rate": 8.851710294440973e-05,
"loss": 0.0297,
"step": 65100
},
{
"epoch": 2.98524547005025,
"grad_norm": 0.4413500130176544,
"learning_rate": 8.846609791854633e-05,
"loss": 0.0272,
"step": 65200
},
{
"epoch": 2.989824067396953,
"grad_norm": 0.762428879737854,
"learning_rate": 8.84149946302532e-05,
"loss": 0.0279,
"step": 65300
},
{
"epoch": 2.9944026647436557,
"grad_norm": 0.9755131602287292,
"learning_rate": 8.83637932100747e-05,
"loss": 0.0294,
"step": 65400
},
{
"epoch": 2.998981262090359,
"grad_norm": 0.3907323181629181,
"learning_rate": 8.831249378880591e-05,
"loss": 0.0312,
"step": 65500
},
{
"epoch": 2.9999885535066335,
"eval_loss": 0.14470230042934418,
"eval_runtime": 251.7148,
"eval_samples_per_second": 21.85,
"eval_steps_per_second": 21.85,
"step": 65522
},
{
"epoch": 3.0035598594370616,
"grad_norm": 0.09335774928331375,
"learning_rate": 8.826109649749224e-05,
"loss": 0.024,
"step": 65600
},
{
"epoch": 3.0081384567837643,
"grad_norm": 0.43074119091033936,
"learning_rate": 8.820960146742913e-05,
"loss": 0.0205,
"step": 65700
},
{
"epoch": 3.012717054130467,
"grad_norm": 0.5296483635902405,
"learning_rate": 8.815800883016168e-05,
"loss": 0.0223,
"step": 65800
},
{
"epoch": 3.01729565147717,
"grad_norm": 0.3759153187274933,
"learning_rate": 8.810631871748432e-05,
"loss": 0.0207,
"step": 65900
},
{
"epoch": 3.021874248823873,
"grad_norm": 0.6265881657600403,
"learning_rate": 8.805453126144047e-05,
"loss": 0.0218,
"step": 66000
},
{
"epoch": 3.0264528461705758,
"grad_norm": 1.2174720764160156,
"learning_rate": 8.800264659432232e-05,
"loss": 0.0217,
"step": 66100
},
{
"epoch": 3.0310314435172785,
"grad_norm": 0.9290931224822998,
"learning_rate": 8.795066484867023e-05,
"loss": 0.0199,
"step": 66200
},
{
"epoch": 3.0356100408639812,
"grad_norm": 0.6158362030982971,
"learning_rate": 8.789858615727265e-05,
"loss": 0.0182,
"step": 66300
},
{
"epoch": 3.040188638210684,
"grad_norm": 0.35175448656082153,
"learning_rate": 8.784641065316567e-05,
"loss": 0.0192,
"step": 66400
},
{
"epoch": 3.044767235557387,
"grad_norm": 0.6219223141670227,
"learning_rate": 8.779413846963267e-05,
"loss": 0.0174,
"step": 66500
},
{
"epoch": 3.04934583290409,
"grad_norm": 0.1079217791557312,
"learning_rate": 8.7741769740204e-05,
"loss": 0.0213,
"step": 66600
},
{
"epoch": 3.0539244302507926,
"grad_norm": 0.4346974790096283,
"learning_rate": 8.768930459865665e-05,
"loss": 0.0207,
"step": 66700
},
{
"epoch": 3.0585030275974954,
"grad_norm": 0.26265600323677063,
"learning_rate": 8.76367431790139e-05,
"loss": 0.0213,
"step": 66800
},
{
"epoch": 3.0630816249441986,
"grad_norm": 0.536638617515564,
"learning_rate": 8.758408561554495e-05,
"loss": 0.0207,
"step": 66900
},
{
"epoch": 3.0676602222909013,
"grad_norm": 0.4859350025653839,
"learning_rate": 8.753133204276462e-05,
"loss": 0.0208,
"step": 67000
},
{
"epoch": 3.072238819637604,
"grad_norm": 0.03394511342048645,
"learning_rate": 8.7478482595433e-05,
"loss": 0.0202,
"step": 67100
},
{
"epoch": 3.0768174169843068,
"grad_norm": 1.2979317903518677,
"learning_rate": 8.742553740855506e-05,
"loss": 0.02,
"step": 67200
},
{
"epoch": 3.0813960143310095,
"grad_norm": 0.7448957562446594,
"learning_rate": 8.737249661738036e-05,
"loss": 0.02,
"step": 67300
},
{
"epoch": 3.0859746116777127,
"grad_norm": 0.45380228757858276,
"learning_rate": 8.731936035740269e-05,
"loss": 0.0214,
"step": 67400
},
{
"epoch": 3.0905532090244154,
"grad_norm": 0.49080690741539,
"learning_rate": 8.726612876435972e-05,
"loss": 0.0206,
"step": 67500
},
{
"epoch": 3.095131806371118,
"grad_norm": 0.2271386682987213,
"learning_rate": 8.721280197423258e-05,
"loss": 0.0218,
"step": 67600
},
{
"epoch": 3.099710403717821,
"grad_norm": 0.7691048383712769,
"learning_rate": 8.71593801232457e-05,
"loss": 0.0184,
"step": 67700
},
{
"epoch": 3.104289001064524,
"grad_norm": 0.37762150168418884,
"learning_rate": 8.710586334786627e-05,
"loss": 0.0196,
"step": 67800
},
{
"epoch": 3.108867598411227,
"grad_norm": 0.4796387255191803,
"learning_rate": 8.705225178480398e-05,
"loss": 0.0194,
"step": 67900
},
{
"epoch": 3.1134461957579296,
"grad_norm": 0.1666077822446823,
"learning_rate": 8.699854557101063e-05,
"loss": 0.0215,
"step": 68000
},
{
"epoch": 3.1180247931046323,
"grad_norm": 0.287124365568161,
"learning_rate": 8.69447448436799e-05,
"loss": 0.0184,
"step": 68100
},
{
"epoch": 3.122603390451335,
"grad_norm": 0.2599179744720459,
"learning_rate": 8.689084974024677e-05,
"loss": 0.0185,
"step": 68200
},
{
"epoch": 3.1271819877980382,
"grad_norm": 0.33696624636650085,
"learning_rate": 8.683686039838742e-05,
"loss": 0.0199,
"step": 68300
},
{
"epoch": 3.131760585144741,
"grad_norm": 0.4512630105018616,
"learning_rate": 8.678277695601872e-05,
"loss": 0.0189,
"step": 68400
},
{
"epoch": 3.1363391824914437,
"grad_norm": 1.3083339929580688,
"learning_rate": 8.67285995512979e-05,
"loss": 0.0205,
"step": 68500
},
{
"epoch": 3.1409177798381465,
"grad_norm": 0.5254839658737183,
"learning_rate": 8.66743283226223e-05,
"loss": 0.021,
"step": 68600
},
{
"epoch": 3.145496377184849,
"grad_norm": 0.37214428186416626,
"learning_rate": 8.66199634086288e-05,
"loss": 0.0214,
"step": 68700
},
{
"epoch": 3.1500749745315524,
"grad_norm": 0.39814454317092896,
"learning_rate": 8.656550494819373e-05,
"loss": 0.0215,
"step": 68800
},
{
"epoch": 3.154653571878255,
"grad_norm": 0.7737843990325928,
"learning_rate": 8.651095308043232e-05,
"loss": 0.0199,
"step": 68900
},
{
"epoch": 3.159232169224958,
"grad_norm": 0.32976606488227844,
"learning_rate": 8.645630794469843e-05,
"loss": 0.0232,
"step": 69000
},
{
"epoch": 3.1638107665716606,
"grad_norm": 0.23388764262199402,
"learning_rate": 8.640156968058417e-05,
"loss": 0.0197,
"step": 69100
},
{
"epoch": 3.168389363918364,
"grad_norm": 0.15984760224819183,
"learning_rate": 8.634673842791956e-05,
"loss": 0.0212,
"step": 69200
},
{
"epoch": 3.1729679612650665,
"grad_norm": 0.20868225395679474,
"learning_rate": 8.629181432677213e-05,
"loss": 0.02,
"step": 69300
},
{
"epoch": 3.1775465586117693,
"grad_norm": 0.12190031260251999,
"learning_rate": 8.623679751744662e-05,
"loss": 0.0195,
"step": 69400
},
{
"epoch": 3.182125155958472,
"grad_norm": 0.7357327342033386,
"learning_rate": 8.61816881404846e-05,
"loss": 0.0212,
"step": 69500
},
{
"epoch": 3.186703753305175,
"grad_norm": 0.231657475233078,
"learning_rate": 8.612648633666406e-05,
"loss": 0.0181,
"step": 69600
},
{
"epoch": 3.191282350651878,
"grad_norm": 0.9028156995773315,
"learning_rate": 8.607119224699919e-05,
"loss": 0.0216,
"step": 69700
},
{
"epoch": 3.1958609479985807,
"grad_norm": 0.30773207545280457,
"learning_rate": 8.601580601273982e-05,
"loss": 0.0189,
"step": 69800
},
{
"epoch": 3.2004395453452834,
"grad_norm": 0.15716642141342163,
"learning_rate": 8.596032777537123e-05,
"loss": 0.022,
"step": 69900
},
{
"epoch": 3.205018142691986,
"grad_norm": 0.2637390196323395,
"learning_rate": 8.59047576766137e-05,
"loss": 0.0174,
"step": 70000
},
{
"epoch": 3.2095967400386893,
"grad_norm": 0.29018816351890564,
"learning_rate": 8.584909585842218e-05,
"loss": 0.0205,
"step": 70100
},
{
"epoch": 3.214175337385392,
"grad_norm": 0.6676698327064514,
"learning_rate": 8.579334246298593e-05,
"loss": 0.0176,
"step": 70200
},
{
"epoch": 3.218753934732095,
"grad_norm": 0.3571256101131439,
"learning_rate": 8.573749763272811e-05,
"loss": 0.0229,
"step": 70300
},
{
"epoch": 3.2233325320787976,
"grad_norm": 0.7378453016281128,
"learning_rate": 8.568156151030549e-05,
"loss": 0.0185,
"step": 70400
},
{
"epoch": 3.2279111294255003,
"grad_norm": 0.533330500125885,
"learning_rate": 8.562553423860802e-05,
"loss": 0.0207,
"step": 70500
},
{
"epoch": 3.2324897267722035,
"grad_norm": 0.28255611658096313,
"learning_rate": 8.556941596075852e-05,
"loss": 0.0185,
"step": 70600
},
{
"epoch": 3.237068324118906,
"grad_norm": 0.37244170904159546,
"learning_rate": 8.551320682011228e-05,
"loss": 0.0217,
"step": 70700
},
{
"epoch": 3.241646921465609,
"grad_norm": 0.16496537625789642,
"learning_rate": 8.545690696025666e-05,
"loss": 0.0238,
"step": 70800
},
{
"epoch": 3.2462255188123117,
"grad_norm": 1.0030924081802368,
"learning_rate": 8.540051652501082e-05,
"loss": 0.0213,
"step": 70900
},
{
"epoch": 3.250804116159015,
"grad_norm": 0.7419716715812683,
"learning_rate": 8.534403565842528e-05,
"loss": 0.0225,
"step": 71000
},
{
"epoch": 3.2553827135057176,
"grad_norm": 0.2792261242866516,
"learning_rate": 8.528746450478156e-05,
"loss": 0.0187,
"step": 71100
},
{
"epoch": 3.2599613108524204,
"grad_norm": 0.0836094543337822,
"learning_rate": 8.523080320859181e-05,
"loss": 0.0221,
"step": 71200
},
{
"epoch": 3.264539908199123,
"grad_norm": 0.10340839624404907,
"learning_rate": 8.517405191459847e-05,
"loss": 0.0213,
"step": 71300
},
{
"epoch": 3.2691185055458263,
"grad_norm": 0.7118562459945679,
"learning_rate": 8.511721076777389e-05,
"loss": 0.0193,
"step": 71400
},
{
"epoch": 3.273697102892529,
"grad_norm": 0.12246321886777878,
"learning_rate": 8.50602799133199e-05,
"loss": 0.0223,
"step": 71500
},
{
"epoch": 3.2782757002392318,
"grad_norm": 0.2873895764350891,
"learning_rate": 8.500325949666755e-05,
"loss": 0.0213,
"step": 71600
},
{
"epoch": 3.2828542975859345,
"grad_norm": 0.5243780016899109,
"learning_rate": 8.494614966347668e-05,
"loss": 0.0201,
"step": 71700
},
{
"epoch": 3.2874328949326372,
"grad_norm": 0.28602150082588196,
"learning_rate": 8.488895055963546e-05,
"loss": 0.0209,
"step": 71800
},
{
"epoch": 3.2920114922793404,
"grad_norm": 0.35241249203681946,
"learning_rate": 8.483166233126022e-05,
"loss": 0.0217,
"step": 71900
},
{
"epoch": 3.296590089626043,
"grad_norm": 0.6958779096603394,
"learning_rate": 8.477428512469488e-05,
"loss": 0.023,
"step": 72000
},
{
"epoch": 3.301168686972746,
"grad_norm": 0.13842323422431946,
"learning_rate": 8.471681908651067e-05,
"loss": 0.0202,
"step": 72100
},
{
"epoch": 3.3057472843194486,
"grad_norm": 0.21349883079528809,
"learning_rate": 8.46592643635058e-05,
"loss": 0.0209,
"step": 72200
},
{
"epoch": 3.3103258816661514,
"grad_norm": 0.3605678975582123,
"learning_rate": 8.460162110270494e-05,
"loss": 0.0241,
"step": 72300
},
{
"epoch": 3.3149044790128546,
"grad_norm": 0.46661075949668884,
"learning_rate": 8.454388945135895e-05,
"loss": 0.0193,
"step": 72400
},
{
"epoch": 3.3194830763595573,
"grad_norm": 0.24211075901985168,
"learning_rate": 8.448606955694457e-05,
"loss": 0.0214,
"step": 72500
},
{
"epoch": 3.32406167370626,
"grad_norm": 0.3622238337993622,
"learning_rate": 8.442816156716385e-05,
"loss": 0.0213,
"step": 72600
},
{
"epoch": 3.328640271052963,
"grad_norm": 1.0499359369277954,
"learning_rate": 8.437016562994397e-05,
"loss": 0.0196,
"step": 72700
},
{
"epoch": 3.3332188683996655,
"grad_norm": 0.2845001816749573,
"learning_rate": 8.43120818934367e-05,
"loss": 0.0202,
"step": 72800
},
{
"epoch": 3.3377974657463687,
"grad_norm": 0.5690521001815796,
"learning_rate": 8.42539105060181e-05,
"loss": 0.0209,
"step": 72900
},
{
"epoch": 3.3423760630930714,
"grad_norm": 0.09998586773872375,
"learning_rate": 8.419565161628823e-05,
"loss": 0.018,
"step": 73000
},
{
"epoch": 3.346954660439774,
"grad_norm": 0.9970934391021729,
"learning_rate": 8.413730537307056e-05,
"loss": 0.0213,
"step": 73100
},
{
"epoch": 3.351533257786477,
"grad_norm": 1.1385819911956787,
"learning_rate": 8.407887192541177e-05,
"loss": 0.0198,
"step": 73200
},
{
"epoch": 3.35611185513318,
"grad_norm": 0.6288115382194519,
"learning_rate": 8.402035142258131e-05,
"loss": 0.0211,
"step": 73300
},
{
"epoch": 3.360690452479883,
"grad_norm": 0.35352623462677,
"learning_rate": 8.396174401407095e-05,
"loss": 0.0189,
"step": 73400
},
{
"epoch": 3.3652690498265856,
"grad_norm": 0.5127176642417908,
"learning_rate": 8.390304984959454e-05,
"loss": 0.0195,
"step": 73500
},
{
"epoch": 3.3698476471732883,
"grad_norm": 0.9110797643661499,
"learning_rate": 8.384426907908754e-05,
"loss": 0.0219,
"step": 73600
},
{
"epoch": 3.3744262445199915,
"grad_norm": 0.22417746484279633,
"learning_rate": 8.378540185270656e-05,
"loss": 0.0194,
"step": 73700
},
{
"epoch": 3.3790048418666943,
"grad_norm": 0.49265140295028687,
"learning_rate": 8.372644832082917e-05,
"loss": 0.0205,
"step": 73800
},
{
"epoch": 3.383583439213397,
"grad_norm": 0.7536473870277405,
"learning_rate": 8.366740863405336e-05,
"loss": 0.0222,
"step": 73900
},
{
"epoch": 3.3881620365600997,
"grad_norm": 0.2447548657655716,
"learning_rate": 8.360828294319721e-05,
"loss": 0.0205,
"step": 74000
},
{
"epoch": 3.3927406339068025,
"grad_norm": 0.3335092067718506,
"learning_rate": 8.354907139929851e-05,
"loss": 0.0208,
"step": 74100
},
{
"epoch": 3.3973192312535057,
"grad_norm": 0.6961463689804077,
"learning_rate": 8.348977415361434e-05,
"loss": 0.018,
"step": 74200
},
{
"epoch": 3.4018978286002084,
"grad_norm": 0.4184730648994446,
"learning_rate": 8.343039135762071e-05,
"loss": 0.0198,
"step": 74300
},
{
"epoch": 3.406476425946911,
"grad_norm": 0.6484507918357849,
"learning_rate": 8.337092316301223e-05,
"loss": 0.0203,
"step": 74400
},
{
"epoch": 3.411055023293614,
"grad_norm": 0.31808891892433167,
"learning_rate": 8.331136972170155e-05,
"loss": 0.0202,
"step": 74500
},
{
"epoch": 3.4156336206403166,
"grad_norm": 0.6552246809005737,
"learning_rate": 8.325173118581919e-05,
"loss": 0.0198,
"step": 74600
},
{
"epoch": 3.42021221798702,
"grad_norm": 0.5105406641960144,
"learning_rate": 8.319200770771298e-05,
"loss": 0.0197,
"step": 74700
},
{
"epoch": 3.4247908153337225,
"grad_norm": 0.9565762877464294,
"learning_rate": 8.313219943994777e-05,
"loss": 0.019,
"step": 74800
},
{
"epoch": 3.4293694126804253,
"grad_norm": 0.7772880792617798,
"learning_rate": 8.3072306535305e-05,
"loss": 0.0207,
"step": 74900
},
{
"epoch": 3.433948010027128,
"grad_norm": 0.6711807250976562,
"learning_rate": 8.30123291467823e-05,
"loss": 0.0222,
"step": 75000
},
{
"epoch": 3.438526607373831,
"grad_norm": 0.10591955482959747,
"learning_rate": 8.295226742759315e-05,
"loss": 0.0199,
"step": 75100
},
{
"epoch": 3.443105204720534,
"grad_norm": 0.5128488540649414,
"learning_rate": 8.289212153116642e-05,
"loss": 0.0219,
"step": 75200
},
{
"epoch": 3.4476838020672367,
"grad_norm": 0.24297969043254852,
"learning_rate": 8.283189161114602e-05,
"loss": 0.0205,
"step": 75300
},
{
"epoch": 3.4522623994139394,
"grad_norm": 0.9164755344390869,
"learning_rate": 8.27715778213905e-05,
"loss": 0.0224,
"step": 75400
},
{
"epoch": 3.4568409967606426,
"grad_norm": 0.493466317653656,
"learning_rate": 8.271118031597271e-05,
"loss": 0.0204,
"step": 75500
},
{
"epoch": 3.4614195941073453,
"grad_norm": 0.27884870767593384,
"learning_rate": 8.265069924917925e-05,
"loss": 0.0199,
"step": 75600
},
{
"epoch": 3.465998191454048,
"grad_norm": 0.2624457776546478,
"learning_rate": 8.259013477551027e-05,
"loss": 0.0223,
"step": 75700
},
{
"epoch": 3.470576788800751,
"grad_norm": 0.6593875885009766,
"learning_rate": 8.252948704967896e-05,
"loss": 0.0186,
"step": 75800
},
{
"epoch": 3.4751553861474536,
"grad_norm": 0.398616760969162,
"learning_rate": 8.246875622661113e-05,
"loss": 0.0199,
"step": 75900
},
{
"epoch": 3.4797339834941567,
"grad_norm": 0.2612878978252411,
"learning_rate": 8.240794246144492e-05,
"loss": 0.0207,
"step": 76000
},
{
"epoch": 3.4843125808408595,
"grad_norm": 0.21333344280719757,
"learning_rate": 8.234704590953033e-05,
"loss": 0.0205,
"step": 76100
},
{
"epoch": 3.4888911781875622,
"grad_norm": 1.0213849544525146,
"learning_rate": 8.228606672642884e-05,
"loss": 0.0199,
"step": 76200
},
{
"epoch": 3.493469775534265,
"grad_norm": 0.29667162895202637,
"learning_rate": 8.222500506791304e-05,
"loss": 0.0215,
"step": 76300
},
{
"epoch": 3.4980483728809677,
"grad_norm": 0.20311638712882996,
"learning_rate": 8.216386108996614e-05,
"loss": 0.0219,
"step": 76400
},
{
"epoch": 3.502626970227671,
"grad_norm": 0.8317406177520752,
"learning_rate": 8.21026349487817e-05,
"loss": 0.0215,
"step": 76500
},
{
"epoch": 3.5072055675743736,
"grad_norm": 0.4841706156730652,
"learning_rate": 8.204132680076312e-05,
"loss": 0.0207,
"step": 76600
},
{
"epoch": 3.5117841649210764,
"grad_norm": 0.5647122263908386,
"learning_rate": 8.197993680252334e-05,
"loss": 0.0217,
"step": 76700
},
{
"epoch": 3.516362762267779,
"grad_norm": 0.9369067549705505,
"learning_rate": 8.191846511088435e-05,
"loss": 0.0215,
"step": 76800
},
{
"epoch": 3.520941359614482,
"grad_norm": 0.7805814743041992,
"learning_rate": 8.185691188287684e-05,
"loss": 0.0219,
"step": 76900
},
{
"epoch": 3.525519956961185,
"grad_norm": 1.2135581970214844,
"learning_rate": 8.179527727573975e-05,
"loss": 0.0193,
"step": 77000
},
{
"epoch": 3.5300985543078878,
"grad_norm": 0.14101019501686096,
"learning_rate": 8.173356144691999e-05,
"loss": 0.0211,
"step": 77100
},
{
"epoch": 3.5346771516545905,
"grad_norm": 0.7078022956848145,
"learning_rate": 8.167176455407187e-05,
"loss": 0.0204,
"step": 77200
},
{
"epoch": 3.5392557490012937,
"grad_norm": 1.2366012334823608,
"learning_rate": 8.160988675505679e-05,
"loss": 0.0183,
"step": 77300
},
{
"epoch": 3.5438343463479964,
"grad_norm": 0.26279062032699585,
"learning_rate": 8.15479282079429e-05,
"loss": 0.02,
"step": 77400
},
{
"epoch": 3.548412943694699,
"grad_norm": 0.21293646097183228,
"learning_rate": 8.148588907100454e-05,
"loss": 0.0203,
"step": 77500
},
{
"epoch": 3.552991541041402,
"grad_norm": 0.48216012120246887,
"learning_rate": 8.142376950272193e-05,
"loss": 0.0192,
"step": 77600
},
{
"epoch": 3.5575701383881047,
"grad_norm": 0.1273164004087448,
"learning_rate": 8.136156966178081e-05,
"loss": 0.0183,
"step": 77700
},
{
"epoch": 3.562148735734808,
"grad_norm": 0.621910035610199,
"learning_rate": 8.12992897070719e-05,
"loss": 0.0217,
"step": 77800
},
{
"epoch": 3.5667273330815106,
"grad_norm": 0.3813430964946747,
"learning_rate": 8.123692979769064e-05,
"loss": 0.0184,
"step": 77900
},
{
"epoch": 3.5713059304282133,
"grad_norm": 0.3676023781299591,
"learning_rate": 8.117449009293668e-05,
"loss": 0.0175,
"step": 78000
},
{
"epoch": 3.575884527774916,
"grad_norm": 0.41113948822021484,
"learning_rate": 8.111197075231351e-05,
"loss": 0.0194,
"step": 78100
},
{
"epoch": 3.580463125121619,
"grad_norm": 0.2245587855577469,
"learning_rate": 8.104937193552806e-05,
"loss": 0.0212,
"step": 78200
},
{
"epoch": 3.585041722468322,
"grad_norm": 0.08874198794364929,
"learning_rate": 8.098669380249029e-05,
"loss": 0.0192,
"step": 78300
},
{
"epoch": 3.5896203198150247,
"grad_norm": 0.29562532901763916,
"learning_rate": 8.092393651331275e-05,
"loss": 0.022,
"step": 78400
},
{
"epoch": 3.5941989171617275,
"grad_norm": 0.47509998083114624,
"learning_rate": 8.086110022831023e-05,
"loss": 0.0202,
"step": 78500
},
{
"epoch": 3.59877751450843,
"grad_norm": 0.41073593497276306,
"learning_rate": 8.079818510799928e-05,
"loss": 0.0214,
"step": 78600
},
{
"epoch": 3.603356111855133,
"grad_norm": 0.2985229790210724,
"learning_rate": 8.073519131309786e-05,
"loss": 0.0165,
"step": 78700
},
{
"epoch": 3.607934709201836,
"grad_norm": 0.7368443012237549,
"learning_rate": 8.067211900452492e-05,
"loss": 0.0177,
"step": 78800
},
{
"epoch": 3.612513306548539,
"grad_norm": 0.46281248331069946,
"learning_rate": 8.060896834339993e-05,
"loss": 0.0221,
"step": 78900
},
{
"epoch": 3.6170919038952416,
"grad_norm": 0.18318797647953033,
"learning_rate": 8.054573949104253e-05,
"loss": 0.0191,
"step": 79000
},
{
"epoch": 3.621670501241945,
"grad_norm": 0.19009487330913544,
"learning_rate": 8.048243260897217e-05,
"loss": 0.0212,
"step": 79100
},
{
"epoch": 3.6262490985886475,
"grad_norm": 0.38268911838531494,
"learning_rate": 8.041904785890749e-05,
"loss": 0.0197,
"step": 79200
},
{
"epoch": 3.6308276959353503,
"grad_norm": 0.3892700672149658,
"learning_rate": 8.035558540276618e-05,
"loss": 0.0214,
"step": 79300
},
{
"epoch": 3.635406293282053,
"grad_norm": 0.6497855186462402,
"learning_rate": 8.029204540266434e-05,
"loss": 0.0192,
"step": 79400
},
{
"epoch": 3.6399848906287557,
"grad_norm": 0.20039434731006622,
"learning_rate": 8.022842802091623e-05,
"loss": 0.0188,
"step": 79500
},
{
"epoch": 3.644563487975459,
"grad_norm": 0.19965870678424835,
"learning_rate": 8.016473342003372e-05,
"loss": 0.0204,
"step": 79600
},
{
"epoch": 3.6491420853221617,
"grad_norm": 0.12873798608779907,
"learning_rate": 8.010096176272595e-05,
"loss": 0.0189,
"step": 79700
},
{
"epoch": 3.6537206826688644,
"grad_norm": 0.26886749267578125,
"learning_rate": 8.003711321189895e-05,
"loss": 0.0206,
"step": 79800
},
{
"epoch": 3.658299280015567,
"grad_norm": 0.4891631305217743,
"learning_rate": 7.997318793065513e-05,
"loss": 0.0204,
"step": 79900
},
{
"epoch": 3.66287787736227,
"grad_norm": 0.2781907021999359,
"learning_rate": 7.99091860822929e-05,
"loss": 0.0192,
"step": 80000
},
{
"epoch": 3.667456474708973,
"grad_norm": 0.3009509742259979,
"learning_rate": 7.984510783030632e-05,
"loss": 0.0185,
"step": 80100
},
{
"epoch": 3.672035072055676,
"grad_norm": 0.5892056822776794,
"learning_rate": 7.978095333838457e-05,
"loss": 0.0191,
"step": 80200
},
{
"epoch": 3.6766136694023785,
"grad_norm": 0.3318547308444977,
"learning_rate": 7.97167227704116e-05,
"loss": 0.0194,
"step": 80300
},
{
"epoch": 3.6811922667490813,
"grad_norm": 0.4608217179775238,
"learning_rate": 7.965241629046571e-05,
"loss": 0.0215,
"step": 80400
},
{
"epoch": 3.685770864095784,
"grad_norm": 0.39660006761550903,
"learning_rate": 7.95880340628191e-05,
"loss": 0.0172,
"step": 80500
},
{
"epoch": 3.690349461442487,
"grad_norm": 0.182856485247612,
"learning_rate": 7.952357625193749e-05,
"loss": 0.0184,
"step": 80600
},
{
"epoch": 3.69492805878919,
"grad_norm": 0.6444191932678223,
"learning_rate": 7.945904302247969e-05,
"loss": 0.0179,
"step": 80700
},
{
"epoch": 3.6995066561358927,
"grad_norm": 0.4182109534740448,
"learning_rate": 7.939443453929712e-05,
"loss": 0.0217,
"step": 80800
},
{
"epoch": 3.704085253482596,
"grad_norm": 4.025650501251221,
"learning_rate": 7.932975096743346e-05,
"loss": 0.0203,
"step": 80900
},
{
"epoch": 3.708663850829298,
"grad_norm": 0.665017306804657,
"learning_rate": 7.926499247212422e-05,
"loss": 0.0186,
"step": 81000
},
{
"epoch": 3.7132424481760014,
"grad_norm": 0.12548814713954926,
"learning_rate": 7.920015921879631e-05,
"loss": 0.0182,
"step": 81100
},
{
"epoch": 3.717821045522704,
"grad_norm": 0.33034953474998474,
"learning_rate": 7.913525137306756e-05,
"loss": 0.0225,
"step": 81200
},
{
"epoch": 3.722399642869407,
"grad_norm": 0.2771977186203003,
"learning_rate": 7.907026910074643e-05,
"loss": 0.0206,
"step": 81300
},
{
"epoch": 3.72697824021611,
"grad_norm": 0.1603299379348755,
"learning_rate": 7.900521256783143e-05,
"loss": 0.0191,
"step": 81400
},
{
"epoch": 3.7315568375628128,
"grad_norm": 0.29296520352363586,
"learning_rate": 7.894008194051077e-05,
"loss": 0.0199,
"step": 81500
},
{
"epoch": 3.7361354349095155,
"grad_norm": 0.3158813416957855,
"learning_rate": 7.8874877385162e-05,
"loss": 0.0216,
"step": 81600
},
{
"epoch": 3.7407140322562182,
"grad_norm": 0.42911648750305176,
"learning_rate": 7.880959906835148e-05,
"loss": 0.0174,
"step": 81700
},
{
"epoch": 3.745292629602921,
"grad_norm": 0.3854501247406006,
"learning_rate": 7.8744247156834e-05,
"loss": 0.0217,
"step": 81800
},
{
"epoch": 3.749871226949624,
"grad_norm": 0.1661909967660904,
"learning_rate": 7.86788218175523e-05,
"loss": 0.0185,
"step": 81900
},
{
"epoch": 3.754449824296327,
"grad_norm": 0.3275599479675293,
"learning_rate": 7.861332321763682e-05,
"loss": 0.0172,
"step": 82000
},
{
"epoch": 3.7590284216430296,
"grad_norm": 0.4914777874946594,
"learning_rate": 7.854775152440501e-05,
"loss": 0.0206,
"step": 82100
},
{
"epoch": 3.7636070189897324,
"grad_norm": 0.6310822367668152,
"learning_rate": 7.84821069053611e-05,
"loss": 0.0193,
"step": 82200
},
{
"epoch": 3.768185616336435,
"grad_norm": 0.33729735016822815,
"learning_rate": 7.841638952819563e-05,
"loss": 0.0209,
"step": 82300
},
{
"epoch": 3.7727642136831383,
"grad_norm": 0.6020189523696899,
"learning_rate": 7.835059956078494e-05,
"loss": 0.0194,
"step": 82400
},
{
"epoch": 3.777342811029841,
"grad_norm": 0.3810158669948578,
"learning_rate": 7.828473717119088e-05,
"loss": 0.0199,
"step": 82500
},
{
"epoch": 3.781921408376544,
"grad_norm": 0.6647739410400391,
"learning_rate": 7.821880252766025e-05,
"loss": 0.0211,
"step": 82600
},
{
"epoch": 3.7865000057232465,
"grad_norm": 0.5358772873878479,
"learning_rate": 7.815279579862442e-05,
"loss": 0.0196,
"step": 82700
},
{
"epoch": 3.7910786030699493,
"grad_norm": 0.26241055130958557,
"learning_rate": 7.808671715269896e-05,
"loss": 0.0206,
"step": 82800
},
{
"epoch": 3.7956572004166524,
"grad_norm": 0.24061718583106995,
"learning_rate": 7.802056675868306e-05,
"loss": 0.0186,
"step": 82900
},
{
"epoch": 3.800235797763355,
"grad_norm": 0.16280798614025116,
"learning_rate": 7.79543447855593e-05,
"loss": 0.0185,
"step": 83000
},
{
"epoch": 3.804814395110058,
"grad_norm": 0.7385302186012268,
"learning_rate": 7.788805140249302e-05,
"loss": 0.0207,
"step": 83100
},
{
"epoch": 3.809392992456761,
"grad_norm": 0.20743854343891144,
"learning_rate": 7.782168677883206e-05,
"loss": 0.0177,
"step": 83200
},
{
"epoch": 3.813971589803464,
"grad_norm": 0.3482532501220703,
"learning_rate": 7.775525108410615e-05,
"loss": 0.0216,
"step": 83300
},
{
"epoch": 3.8185501871501666,
"grad_norm": 0.42130351066589355,
"learning_rate": 7.768874448802665e-05,
"loss": 0.0207,
"step": 83400
},
{
"epoch": 3.8231287844968693,
"grad_norm": 0.44204580783843994,
"learning_rate": 7.762216716048602e-05,
"loss": 0.0215,
"step": 83500
},
{
"epoch": 3.827707381843572,
"grad_norm": 0.20962856709957123,
"learning_rate": 7.755551927155739e-05,
"loss": 0.0183,
"step": 83600
},
{
"epoch": 3.8322859791902752,
"grad_norm": 0.19921015202999115,
"learning_rate": 7.748880099149415e-05,
"loss": 0.02,
"step": 83700
},
{
"epoch": 3.836864576536978,
"grad_norm": 0.2693636119365692,
"learning_rate": 7.742201249072948e-05,
"loss": 0.019,
"step": 83800
},
{
"epoch": 3.8414431738836807,
"grad_norm": 0.677135705947876,
"learning_rate": 7.735515393987602e-05,
"loss": 0.0195,
"step": 83900
},
{
"epoch": 3.8460217712303835,
"grad_norm": 0.34260210394859314,
"learning_rate": 7.728822550972523e-05,
"loss": 0.0194,
"step": 84000
},
{
"epoch": 3.850600368577086,
"grad_norm": 0.83556067943573,
"learning_rate": 7.72212273712472e-05,
"loss": 0.0226,
"step": 84100
},
{
"epoch": 3.8551789659237894,
"grad_norm": 0.22360268235206604,
"learning_rate": 7.715415969559002e-05,
"loss": 0.0177,
"step": 84200
},
{
"epoch": 3.859757563270492,
"grad_norm": 0.32109469175338745,
"learning_rate": 7.708702265407941e-05,
"loss": 0.0197,
"step": 84300
},
{
"epoch": 3.864336160617195,
"grad_norm": 0.4577140212059021,
"learning_rate": 7.701981641821834e-05,
"loss": 0.0173,
"step": 84400
},
{
"epoch": 3.8689147579638976,
"grad_norm": 0.30675482749938965,
"learning_rate": 7.695254115968648e-05,
"loss": 0.0198,
"step": 84500
},
{
"epoch": 3.8734933553106004,
"grad_norm": 0.6526969075202942,
"learning_rate": 7.688519705033989e-05,
"loss": 0.0222,
"step": 84600
},
{
"epoch": 3.8780719526573035,
"grad_norm": 0.09654036164283752,
"learning_rate": 7.681778426221042e-05,
"loss": 0.0194,
"step": 84700
},
{
"epoch": 3.8826505500040063,
"grad_norm": 0.2337755411863327,
"learning_rate": 7.675030296750542e-05,
"loss": 0.019,
"step": 84800
},
{
"epoch": 3.887229147350709,
"grad_norm": 0.05356181785464287,
"learning_rate": 7.668275333860724e-05,
"loss": 0.0202,
"step": 84900
},
{
"epoch": 3.891807744697412,
"grad_norm": 0.4630540907382965,
"learning_rate": 7.66151355480728e-05,
"loss": 0.0182,
"step": 85000
},
{
"epoch": 3.896386342044115,
"grad_norm": 0.21360653638839722,
"learning_rate": 7.65474497686331e-05,
"loss": 0.0198,
"step": 85100
},
{
"epoch": 3.9009649393908177,
"grad_norm": 0.2991812229156494,
"learning_rate": 7.647969617319282e-05,
"loss": 0.0201,
"step": 85200
},
{
"epoch": 3.9055435367375204,
"grad_norm": 0.214981809258461,
"learning_rate": 7.641187493482995e-05,
"loss": 0.0164,
"step": 85300
},
{
"epoch": 3.910122134084223,
"grad_norm": 0.48418205976486206,
"learning_rate": 7.634398622679517e-05,
"loss": 0.0192,
"step": 85400
},
{
"epoch": 3.9147007314309263,
"grad_norm": 0.5781142711639404,
"learning_rate": 7.62760302225116e-05,
"loss": 0.0199,
"step": 85500
},
{
"epoch": 3.919279328777629,
"grad_norm": 0.7809280157089233,
"learning_rate": 7.620800709557421e-05,
"loss": 0.0186,
"step": 85600
},
{
"epoch": 3.923857926124332,
"grad_norm": 0.1833581030368805,
"learning_rate": 7.61399170197495e-05,
"loss": 0.0189,
"step": 85700
},
{
"epoch": 3.9284365234710346,
"grad_norm": 0.3215663433074951,
"learning_rate": 7.60717601689749e-05,
"loss": 0.0168,
"step": 85800
},
{
"epoch": 3.9330151208177373,
"grad_norm": 0.41018444299697876,
"learning_rate": 7.600353671735853e-05,
"loss": 0.0208,
"step": 85900
},
{
"epoch": 3.9375937181644405,
"grad_norm": 0.34082677960395813,
"learning_rate": 7.593524683917854e-05,
"loss": 0.0191,
"step": 86000
},
{
"epoch": 3.942172315511143,
"grad_norm": 0.39426901936531067,
"learning_rate": 7.586689070888284e-05,
"loss": 0.0199,
"step": 86100
},
{
"epoch": 3.946750912857846,
"grad_norm": 0.4446451663970947,
"learning_rate": 7.579846850108855e-05,
"loss": 0.0204,
"step": 86200
},
{
"epoch": 3.9513295102045487,
"grad_norm": 0.3159216344356537,
"learning_rate": 7.572998039058159e-05,
"loss": 0.0183,
"step": 86300
},
{
"epoch": 3.9559081075512514,
"grad_norm": 0.3799346387386322,
"learning_rate": 7.566142655231622e-05,
"loss": 0.019,
"step": 86400
},
{
"epoch": 3.9604867048979546,
"grad_norm": 0.4832625687122345,
"learning_rate": 7.559280716141463e-05,
"loss": 0.0179,
"step": 86500
},
{
"epoch": 3.9650653022446574,
"grad_norm": 0.2456403523683548,
"learning_rate": 7.552412239316645e-05,
"loss": 0.0184,
"step": 86600
},
{
"epoch": 3.96964389959136,
"grad_norm": 0.3314709961414337,
"learning_rate": 7.545537242302829e-05,
"loss": 0.0177,
"step": 86700
},
{
"epoch": 3.9742224969380633,
"grad_norm": 0.4336375892162323,
"learning_rate": 7.53865574266234e-05,
"loss": 0.0187,
"step": 86800
},
{
"epoch": 3.978801094284766,
"grad_norm": 0.7629146575927734,
"learning_rate": 7.531767757974104e-05,
"loss": 0.0199,
"step": 86900
},
{
"epoch": 3.9833796916314688,
"grad_norm": 0.16511370241641998,
"learning_rate": 7.52487330583362e-05,
"loss": 0.0182,
"step": 87000
},
{
"epoch": 3.9879582889781715,
"grad_norm": 0.29885396361351013,
"learning_rate": 7.517972403852905e-05,
"loss": 0.0193,
"step": 87100
},
{
"epoch": 3.9925368863248742,
"grad_norm": 0.4066375494003296,
"learning_rate": 7.511065069660458e-05,
"loss": 0.0191,
"step": 87200
},
{
"epoch": 3.9971154836715774,
"grad_norm": 0.44243311882019043,
"learning_rate": 7.504151320901199e-05,
"loss": 0.0203,
"step": 87300
},
{
"epoch": 4.0,
"eval_loss": 0.15203019976615906,
"eval_runtime": 258.1696,
"eval_samples_per_second": 21.304,
"eval_steps_per_second": 21.304,
"step": 87363
},
{
"epoch": 4.00169408101828,
"grad_norm": 0.2750494182109833,
"learning_rate": 7.497231175236442e-05,
"loss": 0.0174,
"step": 87400
},
{
"epoch": 4.006272678364983,
"grad_norm": 0.4887785315513611,
"learning_rate": 7.490304650343841e-05,
"loss": 0.0131,
"step": 87500
},
{
"epoch": 4.010851275711686,
"grad_norm": 0.21974627673625946,
"learning_rate": 7.483371763917345e-05,
"loss": 0.0141,
"step": 87600
},
{
"epoch": 4.015429873058388,
"grad_norm": 0.34770917892456055,
"learning_rate": 7.476432533667151e-05,
"loss": 0.0139,
"step": 87700
},
{
"epoch": 4.020008470405092,
"grad_norm": 0.2878529727458954,
"learning_rate": 7.469486977319665e-05,
"loss": 0.0118,
"step": 87800
},
{
"epoch": 4.024587067751794,
"grad_norm": 0.6604347229003906,
"learning_rate": 7.462535112617452e-05,
"loss": 0.0128,
"step": 87900
},
{
"epoch": 4.029165665098497,
"grad_norm": 0.4288138747215271,
"learning_rate": 7.455576957319194e-05,
"loss": 0.0145,
"step": 88000
},
{
"epoch": 4.0337442624452,
"grad_norm": 0.19010120630264282,
"learning_rate": 7.448612529199637e-05,
"loss": 0.0114,
"step": 88100
},
{
"epoch": 4.0383228597919025,
"grad_norm": 0.3835040032863617,
"learning_rate": 7.441641846049556e-05,
"loss": 0.0152,
"step": 88200
},
{
"epoch": 4.042901457138606,
"grad_norm": 2.3910844326019287,
"learning_rate": 7.434664925675702e-05,
"loss": 0.0153,
"step": 88300
},
{
"epoch": 4.047480054485308,
"grad_norm": 0.40593621134757996,
"learning_rate": 7.427681785900761e-05,
"loss": 0.0143,
"step": 88400
},
{
"epoch": 4.052058651832011,
"grad_norm": 0.08815860003232956,
"learning_rate": 7.420692444563305e-05,
"loss": 0.014,
"step": 88500
},
{
"epoch": 4.056637249178714,
"grad_norm": 0.33992356061935425,
"learning_rate": 7.413696919517749e-05,
"loss": 0.0135,
"step": 88600
},
{
"epoch": 4.061215846525417,
"grad_norm": 0.32726776599884033,
"learning_rate": 7.406695228634305e-05,
"loss": 0.0131,
"step": 88700
},
{
"epoch": 4.06579444387212,
"grad_norm": 0.3524836301803589,
"learning_rate": 7.399687389798933e-05,
"loss": 0.0136,
"step": 88800
},
{
"epoch": 4.070373041218823,
"grad_norm": 0.18603968620300293,
"learning_rate": 7.3926734209133e-05,
"loss": 0.0123,
"step": 88900
},
{
"epoch": 4.074951638565525,
"grad_norm": 0.4780280888080597,
"learning_rate": 7.385653339894733e-05,
"loss": 0.0142,
"step": 89000
},
{
"epoch": 4.0795302359122285,
"grad_norm": 0.22851374745368958,
"learning_rate": 7.378627164676173e-05,
"loss": 0.013,
"step": 89100
},
{
"epoch": 4.084108833258931,
"grad_norm": 0.4251825511455536,
"learning_rate": 7.371594913206124e-05,
"loss": 0.0153,
"step": 89200
},
{
"epoch": 4.088687430605634,
"grad_norm": 0.3959885239601135,
"learning_rate": 7.364556603448619e-05,
"loss": 0.0166,
"step": 89300
},
{
"epoch": 4.093266027952337,
"grad_norm": 0.8459362387657166,
"learning_rate": 7.357512253383162e-05,
"loss": 0.0152,
"step": 89400
},
{
"epoch": 4.0978446252990395,
"grad_norm": 0.5725641250610352,
"learning_rate": 7.35046188100469e-05,
"loss": 0.0135,
"step": 89500
},
{
"epoch": 4.102423222645743,
"grad_norm": 0.2906801402568817,
"learning_rate": 7.343405504323519e-05,
"loss": 0.013,
"step": 89600
},
{
"epoch": 4.107001819992445,
"grad_norm": 0.10050017386674881,
"learning_rate": 7.33634314136531e-05,
"loss": 0.0114,
"step": 89700
},
{
"epoch": 4.111580417339148,
"grad_norm": 0.6948938965797424,
"learning_rate": 7.329274810171014e-05,
"loss": 0.0138,
"step": 89800
},
{
"epoch": 4.116159014685851,
"grad_norm": 0.4069768190383911,
"learning_rate": 7.322200528796822e-05,
"loss": 0.0124,
"step": 89900
},
{
"epoch": 4.120737612032554,
"grad_norm": 0.09699010848999023,
"learning_rate": 7.315120315314134e-05,
"loss": 0.0128,
"step": 90000
},
{
"epoch": 4.125316209379257,
"grad_norm": 0.3347591161727905,
"learning_rate": 7.308034187809498e-05,
"loss": 0.0166,
"step": 90100
},
{
"epoch": 4.129894806725959,
"grad_norm": 0.22168204188346863,
"learning_rate": 7.300942164384571e-05,
"loss": 0.0151,
"step": 90200
},
{
"epoch": 4.134473404072662,
"grad_norm": 0.5564683675765991,
"learning_rate": 7.293844263156072e-05,
"loss": 0.0126,
"step": 90300
},
{
"epoch": 4.1390520014193655,
"grad_norm": 0.32226261496543884,
"learning_rate": 7.28674050225573e-05,
"loss": 0.0131,
"step": 90400
},
{
"epoch": 4.143630598766068,
"grad_norm": 0.36912479996681213,
"learning_rate": 7.279630899830252e-05,
"loss": 0.0143,
"step": 90500
},
{
"epoch": 4.148209196112771,
"grad_norm": 0.2860753834247589,
"learning_rate": 7.272515474041259e-05,
"loss": 0.0152,
"step": 90600
},
{
"epoch": 4.152787793459474,
"grad_norm": 0.3625887930393219,
"learning_rate": 7.265394243065253e-05,
"loss": 0.0143,
"step": 90700
},
{
"epoch": 4.157366390806176,
"grad_norm": 0.24506491422653198,
"learning_rate": 7.258267225093563e-05,
"loss": 0.015,
"step": 90800
},
{
"epoch": 4.16194498815288,
"grad_norm": 0.03290629759430885,
"learning_rate": 7.251134438332299e-05,
"loss": 0.0126,
"step": 90900
},
{
"epoch": 4.166523585499582,
"grad_norm": 0.4261631667613983,
"learning_rate": 7.243995901002312e-05,
"loss": 0.0148,
"step": 91000
},
{
"epoch": 4.171102182846285,
"grad_norm": 0.14463308453559875,
"learning_rate": 7.23685163133914e-05,
"loss": 0.0113,
"step": 91100
},
{
"epoch": 4.175680780192988,
"grad_norm": 0.53131502866745,
"learning_rate": 7.229701647592966e-05,
"loss": 0.0136,
"step": 91200
},
{
"epoch": 4.180259377539691,
"grad_norm": 0.30526795983314514,
"learning_rate": 7.222545968028569e-05,
"loss": 0.0142,
"step": 91300
},
{
"epoch": 4.184837974886394,
"grad_norm": 0.07798325270414352,
"learning_rate": 7.215384610925278e-05,
"loss": 0.0134,
"step": 91400
},
{
"epoch": 4.189416572233096,
"grad_norm": 0.164367213845253,
"learning_rate": 7.208217594576923e-05,
"loss": 0.0127,
"step": 91500
},
{
"epoch": 4.193995169579799,
"grad_norm": 0.0945630893111229,
"learning_rate": 7.201044937291797e-05,
"loss": 0.0118,
"step": 91600
},
{
"epoch": 4.198573766926502,
"grad_norm": 0.38682791590690613,
"learning_rate": 7.193866657392597e-05,
"loss": 0.0141,
"step": 91700
},
{
"epoch": 4.203152364273205,
"grad_norm": 0.49326708912849426,
"learning_rate": 7.186682773216384e-05,
"loss": 0.0125,
"step": 91800
},
{
"epoch": 4.207730961619908,
"grad_norm": 0.2276126593351364,
"learning_rate": 7.179493303114537e-05,
"loss": 0.014,
"step": 91900
},
{
"epoch": 4.21230955896661,
"grad_norm": 0.5109021067619324,
"learning_rate": 7.172298265452706e-05,
"loss": 0.0138,
"step": 92000
},
{
"epoch": 4.216888156313313,
"grad_norm": 0.23471687734127045,
"learning_rate": 7.165097678610759e-05,
"loss": 0.014,
"step": 92100
},
{
"epoch": 4.221466753660017,
"grad_norm": 0.4894104301929474,
"learning_rate": 7.15789156098274e-05,
"loss": 0.0155,
"step": 92200
},
{
"epoch": 4.226045351006719,
"grad_norm": 0.1319025456905365,
"learning_rate": 7.150679930976825e-05,
"loss": 0.0135,
"step": 92300
},
{
"epoch": 4.230623948353422,
"grad_norm": 0.32496750354766846,
"learning_rate": 7.143462807015271e-05,
"loss": 0.0136,
"step": 92400
},
{
"epoch": 4.235202545700124,
"grad_norm": 0.380876749753952,
"learning_rate": 7.136240207534365e-05,
"loss": 0.0148,
"step": 92500
},
{
"epoch": 4.2397811430468275,
"grad_norm": 0.18530067801475525,
"learning_rate": 7.129012150984387e-05,
"loss": 0.0143,
"step": 92600
},
{
"epoch": 4.244359740393531,
"grad_norm": 0.9411688446998596,
"learning_rate": 7.121778655829554e-05,
"loss": 0.0115,
"step": 92700
},
{
"epoch": 4.248938337740233,
"grad_norm": 0.22460629045963287,
"learning_rate": 7.114539740547974e-05,
"loss": 0.0159,
"step": 92800
},
{
"epoch": 4.253516935086936,
"grad_norm": 0.19735155999660492,
"learning_rate": 7.107295423631606e-05,
"loss": 0.0133,
"step": 92900
},
{
"epoch": 4.258095532433639,
"grad_norm": 0.2656545341014862,
"learning_rate": 7.100045723586204e-05,
"loss": 0.0125,
"step": 93000
},
{
"epoch": 4.262674129780342,
"grad_norm": 1.059777021408081,
"learning_rate": 7.092790658931273e-05,
"loss": 0.0148,
"step": 93100
},
{
"epoch": 4.267252727127045,
"grad_norm": 0.3590608835220337,
"learning_rate": 7.085530248200027e-05,
"loss": 0.0139,
"step": 93200
},
{
"epoch": 4.271831324473747,
"grad_norm": 0.133284792304039,
"learning_rate": 7.07826450993933e-05,
"loss": 0.0153,
"step": 93300
},
{
"epoch": 4.27640992182045,
"grad_norm": 0.3305582106113434,
"learning_rate": 7.070993462709656e-05,
"loss": 0.0129,
"step": 93400
},
{
"epoch": 4.2809885191671535,
"grad_norm": 0.4209526777267456,
"learning_rate": 7.06371712508505e-05,
"loss": 0.0125,
"step": 93500
},
{
"epoch": 4.285567116513856,
"grad_norm": 0.10924796760082245,
"learning_rate": 7.056435515653059e-05,
"loss": 0.0162,
"step": 93600
},
{
"epoch": 4.290145713860559,
"grad_norm": 0.4727434515953064,
"learning_rate": 7.049148653014702e-05,
"loss": 0.0126,
"step": 93700
},
{
"epoch": 4.294724311207261,
"grad_norm": 0.5440820455551147,
"learning_rate": 7.041856555784421e-05,
"loss": 0.0131,
"step": 93800
},
{
"epoch": 4.2993029085539645,
"grad_norm": 0.07101954519748688,
"learning_rate": 7.034559242590027e-05,
"loss": 0.0163,
"step": 93900
},
{
"epoch": 4.303881505900668,
"grad_norm": 1.4522393941879272,
"learning_rate": 7.027256732072651e-05,
"loss": 0.014,
"step": 94000
},
{
"epoch": 4.30846010324737,
"grad_norm": 0.1080670952796936,
"learning_rate": 7.019949042886708e-05,
"loss": 0.013,
"step": 94100
},
{
"epoch": 4.313038700594073,
"grad_norm": 0.4725320339202881,
"learning_rate": 7.012636193699837e-05,
"loss": 0.0133,
"step": 94200
},
{
"epoch": 4.317617297940776,
"grad_norm": 0.7752532362937927,
"learning_rate": 7.005318203192864e-05,
"loss": 0.0136,
"step": 94300
},
{
"epoch": 4.322195895287479,
"grad_norm": 0.39167362451553345,
"learning_rate": 6.997995090059739e-05,
"loss": 0.0132,
"step": 94400
},
{
"epoch": 4.326774492634182,
"grad_norm": 0.16077743470668793,
"learning_rate": 6.990666873007505e-05,
"loss": 0.0126,
"step": 94500
},
{
"epoch": 4.331353089980884,
"grad_norm": 0.20132170617580414,
"learning_rate": 6.983333570756245e-05,
"loss": 0.0125,
"step": 94600
},
{
"epoch": 4.335931687327587,
"grad_norm": 0.4036431610584259,
"learning_rate": 6.975995202039025e-05,
"loss": 0.0149,
"step": 94700
},
{
"epoch": 4.34051028467429,
"grad_norm": 0.8535305261611938,
"learning_rate": 6.968651785601859e-05,
"loss": 0.0136,
"step": 94800
},
{
"epoch": 4.345088882020993,
"grad_norm": 0.3927995562553406,
"learning_rate": 6.961303340203653e-05,
"loss": 0.0146,
"step": 94900
},
{
"epoch": 4.349667479367696,
"grad_norm": 0.371528297662735,
"learning_rate": 6.953949884616162e-05,
"loss": 0.0124,
"step": 95000
},
{
"epoch": 4.354246076714398,
"grad_norm": 0.06207489222288132,
"learning_rate": 6.946591437623934e-05,
"loss": 0.0129,
"step": 95100
},
{
"epoch": 4.358824674061101,
"grad_norm": 0.05522959679365158,
"learning_rate": 6.939228018024275e-05,
"loss": 0.0133,
"step": 95200
},
{
"epoch": 4.363403271407805,
"grad_norm": 0.5625087022781372,
"learning_rate": 6.931859644627189e-05,
"loss": 0.0141,
"step": 95300
},
{
"epoch": 4.367981868754507,
"grad_norm": 0.13779932260513306,
"learning_rate": 6.924486336255337e-05,
"loss": 0.0135,
"step": 95400
},
{
"epoch": 4.37256046610121,
"grad_norm": 1.0762056112289429,
"learning_rate": 6.917108111743984e-05,
"loss": 0.0142,
"step": 95500
},
{
"epoch": 4.377139063447912,
"grad_norm": 0.22283124923706055,
"learning_rate": 6.909724989940953e-05,
"loss": 0.0133,
"step": 95600
},
{
"epoch": 4.3817176607946156,
"grad_norm": 0.5186660289764404,
"learning_rate": 6.902336989706581e-05,
"loss": 0.0136,
"step": 95700
},
{
"epoch": 4.386296258141319,
"grad_norm": 0.47632691264152527,
"learning_rate": 6.894944129913667e-05,
"loss": 0.0147,
"step": 95800
},
{
"epoch": 4.390874855488021,
"grad_norm": 1.1676534414291382,
"learning_rate": 6.887546429447419e-05,
"loss": 0.0128,
"step": 95900
},
{
"epoch": 4.395453452834724,
"grad_norm": 1.0476038455963135,
"learning_rate": 6.880143907205411e-05,
"loss": 0.0132,
"step": 96000
},
{
"epoch": 4.4000320501814265,
"grad_norm": 0.656058669090271,
"learning_rate": 6.872736582097541e-05,
"loss": 0.0152,
"step": 96100
},
{
"epoch": 4.40461064752813,
"grad_norm": 0.3963877856731415,
"learning_rate": 6.86532447304597e-05,
"loss": 0.0122,
"step": 96200
},
{
"epoch": 4.409189244874833,
"grad_norm": 0.23698298633098602,
"learning_rate": 6.857907598985081e-05,
"loss": 0.0135,
"step": 96300
},
{
"epoch": 4.413767842221535,
"grad_norm": 0.20948071777820587,
"learning_rate": 6.850485978861431e-05,
"loss": 0.0136,
"step": 96400
},
{
"epoch": 4.418346439568238,
"grad_norm": 0.3551422357559204,
"learning_rate": 6.843059631633699e-05,
"loss": 0.0143,
"step": 96500
},
{
"epoch": 4.4229250369149415,
"grad_norm": 0.21045321226119995,
"learning_rate": 6.835628576272638e-05,
"loss": 0.0149,
"step": 96600
},
{
"epoch": 4.427503634261644,
"grad_norm": 1.5752928256988525,
"learning_rate": 6.828192831761033e-05,
"loss": 0.0151,
"step": 96700
},
{
"epoch": 4.432082231608347,
"grad_norm": 0.4416331350803375,
"learning_rate": 6.820752417093644e-05,
"loss": 0.0133,
"step": 96800
},
{
"epoch": 4.436660828955049,
"grad_norm": 0.44132721424102783,
"learning_rate": 6.81330735127716e-05,
"loss": 0.0101,
"step": 96900
},
{
"epoch": 4.4412394263017525,
"grad_norm": 0.2506002187728882,
"learning_rate": 6.805857653330156e-05,
"loss": 0.0128,
"step": 97000
},
{
"epoch": 4.445818023648456,
"grad_norm": 0.11981073021888733,
"learning_rate": 6.798403342283034e-05,
"loss": 0.0127,
"step": 97100
},
{
"epoch": 4.450396620995158,
"grad_norm": 0.9063414335250854,
"learning_rate": 6.790944437177984e-05,
"loss": 0.0136,
"step": 97200
},
{
"epoch": 4.454975218341861,
"grad_norm": 1.0382390022277832,
"learning_rate": 6.783480957068934e-05,
"loss": 0.0116,
"step": 97300
},
{
"epoch": 4.4595538156885635,
"grad_norm": 0.22426804900169373,
"learning_rate": 6.776012921021492e-05,
"loss": 0.0149,
"step": 97400
},
{
"epoch": 4.464132413035267,
"grad_norm": 0.4911547899246216,
"learning_rate": 6.768540348112907e-05,
"loss": 0.0123,
"step": 97500
},
{
"epoch": 4.46871101038197,
"grad_norm": 0.6653274893760681,
"learning_rate": 6.761063257432023e-05,
"loss": 0.0121,
"step": 97600
},
{
"epoch": 4.473289607728672,
"grad_norm": 0.37786972522735596,
"learning_rate": 6.753581668079219e-05,
"loss": 0.0133,
"step": 97700
},
{
"epoch": 4.477868205075375,
"grad_norm": 0.15616688132286072,
"learning_rate": 6.746095599166362e-05,
"loss": 0.013,
"step": 97800
},
{
"epoch": 4.482446802422078,
"grad_norm": 0.11935741454362869,
"learning_rate": 6.738605069816775e-05,
"loss": 0.0148,
"step": 97900
},
{
"epoch": 4.487025399768781,
"grad_norm": 0.18721537292003632,
"learning_rate": 6.731110099165164e-05,
"loss": 0.0139,
"step": 98000
},
{
"epoch": 4.491603997115484,
"grad_norm": 0.3637322783470154,
"learning_rate": 6.723610706357582e-05,
"loss": 0.0148,
"step": 98100
},
{
"epoch": 4.496182594462186,
"grad_norm": 0.1633034497499466,
"learning_rate": 6.716106910551385e-05,
"loss": 0.0127,
"step": 98200
},
{
"epoch": 4.5007611918088894,
"grad_norm": 0.19283847510814667,
"learning_rate": 6.708598730915168e-05,
"loss": 0.0132,
"step": 98300
},
{
"epoch": 4.505339789155592,
"grad_norm": 0.17327933013439178,
"learning_rate": 6.701086186628732e-05,
"loss": 0.0156,
"step": 98400
},
{
"epoch": 4.509918386502295,
"grad_norm": 0.06521926075220108,
"learning_rate": 6.693569296883022e-05,
"loss": 0.0137,
"step": 98500
},
{
"epoch": 4.514496983848998,
"grad_norm": 0.4145078659057617,
"learning_rate": 6.686048080880086e-05,
"loss": 0.0144,
"step": 98600
},
{
"epoch": 4.5190755811957,
"grad_norm": 0.5390291810035706,
"learning_rate": 6.678522557833024e-05,
"loss": 0.0132,
"step": 98700
},
{
"epoch": 4.523654178542404,
"grad_norm": 0.2249838411808014,
"learning_rate": 6.670992746965938e-05,
"loss": 0.0122,
"step": 98800
},
{
"epoch": 4.528232775889107,
"grad_norm": 0.09684702008962631,
"learning_rate": 6.663458667513882e-05,
"loss": 0.0122,
"step": 98900
},
{
"epoch": 4.532811373235809,
"grad_norm": 0.5852058529853821,
"learning_rate": 6.655920338722816e-05,
"loss": 0.014,
"step": 99000
},
{
"epoch": 4.537389970582512,
"grad_norm": 0.4523356258869171,
"learning_rate": 6.648377779849554e-05,
"loss": 0.0129,
"step": 99100
},
{
"epoch": 4.5419685679292146,
"grad_norm": 0.4520733058452606,
"learning_rate": 6.640831010161716e-05,
"loss": 0.0123,
"step": 99200
},
{
"epoch": 4.546547165275918,
"grad_norm": 0.42760178446769714,
"learning_rate": 6.633280048937678e-05,
"loss": 0.0171,
"step": 99300
},
{
"epoch": 4.551125762622621,
"grad_norm": 0.27447327971458435,
"learning_rate": 6.625724915466526e-05,
"loss": 0.0136,
"step": 99400
},
{
"epoch": 4.555704359969323,
"grad_norm": 0.30612578988075256,
"learning_rate": 6.618165629048e-05,
"loss": 0.0133,
"step": 99500
},
{
"epoch": 4.560282957316026,
"grad_norm": 0.48825210332870483,
"learning_rate": 6.610602208992454e-05,
"loss": 0.0123,
"step": 99600
},
{
"epoch": 4.564861554662729,
"grad_norm": 0.43417781591415405,
"learning_rate": 6.603034674620794e-05,
"loss": 0.0149,
"step": 99700
},
{
"epoch": 4.569440152009432,
"grad_norm": 0.6489459276199341,
"learning_rate": 6.595463045264445e-05,
"loss": 0.0118,
"step": 99800
},
{
"epoch": 4.574018749356135,
"grad_norm": 0.29751142859458923,
"learning_rate": 6.587887340265286e-05,
"loss": 0.0122,
"step": 99900
},
{
"epoch": 4.578597346702837,
"grad_norm": 0.1352328062057495,
"learning_rate": 6.580307578975608e-05,
"loss": 0.0139,
"step": 100000
},
{
"epoch": 4.5831759440495405,
"grad_norm": 0.2985703945159912,
"learning_rate": 6.572723780758069e-05,
"loss": 0.0121,
"step": 100100
},
{
"epoch": 4.587754541396244,
"grad_norm": 0.1775195151567459,
"learning_rate": 6.565135964985634e-05,
"loss": 0.0139,
"step": 100200
},
{
"epoch": 4.592333138742946,
"grad_norm": 0.41841718554496765,
"learning_rate": 6.557544151041531e-05,
"loss": 0.0146,
"step": 100300
},
{
"epoch": 4.596911736089649,
"grad_norm": 0.07853005081415176,
"learning_rate": 6.549948358319206e-05,
"loss": 0.0138,
"step": 100400
},
{
"epoch": 4.6014903334363515,
"grad_norm": 0.39813074469566345,
"learning_rate": 6.542348606222266e-05,
"loss": 0.0127,
"step": 100500
},
{
"epoch": 4.606068930783055,
"grad_norm": 0.3754967749118805,
"learning_rate": 6.53474491416443e-05,
"loss": 0.0156,
"step": 100600
},
{
"epoch": 4.610647528129757,
"grad_norm": 0.6578196287155151,
"learning_rate": 6.527137301569486e-05,
"loss": 0.0125,
"step": 100700
},
{
"epoch": 4.61522612547646,
"grad_norm": 0.7814628481864929,
"learning_rate": 6.519525787871235e-05,
"loss": 0.0142,
"step": 100800
},
{
"epoch": 4.619804722823163,
"grad_norm": 0.23694345355033875,
"learning_rate": 6.511910392513443e-05,
"loss": 0.0115,
"step": 100900
},
{
"epoch": 4.624383320169866,
"grad_norm": 0.18302284181118011,
"learning_rate": 6.504291134949792e-05,
"loss": 0.0138,
"step": 101000
},
{
"epoch": 4.628961917516569,
"grad_norm": 0.5445951223373413,
"learning_rate": 6.496668034643831e-05,
"loss": 0.0149,
"step": 101100
},
{
"epoch": 4.633540514863272,
"grad_norm": 0.6721272468566895,
"learning_rate": 6.489041111068926e-05,
"loss": 0.014,
"step": 101200
},
{
"epoch": 4.638119112209974,
"grad_norm": 0.40816518664360046,
"learning_rate": 6.481410383708206e-05,
"loss": 0.012,
"step": 101300
},
{
"epoch": 4.6426977095566775,
"grad_norm": 0.28873249888420105,
"learning_rate": 6.473775872054521e-05,
"loss": 0.0148,
"step": 101400
},
{
"epoch": 4.64727630690338,
"grad_norm": 0.5939431190490723,
"learning_rate": 6.466137595610388e-05,
"loss": 0.0124,
"step": 101500
},
{
"epoch": 4.651854904250083,
"grad_norm": 0.08564829081296921,
"learning_rate": 6.458495573887933e-05,
"loss": 0.0128,
"step": 101600
},
{
"epoch": 4.656433501596786,
"grad_norm": 0.8717368245124817,
"learning_rate": 6.450849826408865e-05,
"loss": 0.0137,
"step": 101700
},
{
"epoch": 4.6610120989434884,
"grad_norm": 0.02314877323806286,
"learning_rate": 6.443200372704395e-05,
"loss": 0.0151,
"step": 101800
},
{
"epoch": 4.665590696290192,
"grad_norm": 0.2785604000091553,
"learning_rate": 6.43554723231521e-05,
"loss": 0.0111,
"step": 101900
},
{
"epoch": 4.670169293636894,
"grad_norm": 0.14578752219676971,
"learning_rate": 6.427890424791415e-05,
"loss": 0.0131,
"step": 102000
},
{
"epoch": 4.674747890983597,
"grad_norm": 0.14544513821601868,
"learning_rate": 6.420229969692477e-05,
"loss": 0.0136,
"step": 102100
},
{
"epoch": 4.6793264883303,
"grad_norm": 0.36046111583709717,
"learning_rate": 6.412565886587185e-05,
"loss": 0.0135,
"step": 102200
},
{
"epoch": 4.683905085677003,
"grad_norm": 0.208379328250885,
"learning_rate": 6.404898195053597e-05,
"loss": 0.0132,
"step": 102300
},
{
"epoch": 4.688483683023706,
"grad_norm": 0.04505769535899162,
"learning_rate": 6.397226914678986e-05,
"loss": 0.014,
"step": 102400
},
{
"epoch": 4.693062280370409,
"grad_norm": 0.12393535673618317,
"learning_rate": 6.389552065059795e-05,
"loss": 0.0142,
"step": 102500
},
{
"epoch": 4.697640877717111,
"grad_norm": 0.14113786816596985,
"learning_rate": 6.381873665801581e-05,
"loss": 0.0146,
"step": 102600
},
{
"epoch": 4.702219475063814,
"grad_norm": 0.14992213249206543,
"learning_rate": 6.374191736518974e-05,
"loss": 0.01,
"step": 102700
},
{
"epoch": 4.706798072410517,
"grad_norm": 0.24738559126853943,
"learning_rate": 6.366506296835616e-05,
"loss": 0.0114,
"step": 102800
},
{
"epoch": 4.71137666975722,
"grad_norm": 0.6193427443504333,
"learning_rate": 6.358817366384122e-05,
"loss": 0.0139,
"step": 102900
},
{
"epoch": 4.715955267103923,
"grad_norm": 0.24367505311965942,
"learning_rate": 6.35112496480602e-05,
"loss": 0.0113,
"step": 103000
},
{
"epoch": 4.720533864450625,
"grad_norm": 0.11543486267328262,
"learning_rate": 6.343429111751704e-05,
"loss": 0.015,
"step": 103100
},
{
"epoch": 4.725112461797329,
"grad_norm": 0.23988036811351776,
"learning_rate": 6.33572982688039e-05,
"loss": 0.0121,
"step": 103200
},
{
"epoch": 4.729691059144031,
"grad_norm": 0.26978039741516113,
"learning_rate": 6.328027129860057e-05,
"loss": 0.0117,
"step": 103300
},
{
"epoch": 4.734269656490734,
"grad_norm": 0.047924984246492386,
"learning_rate": 6.3203210403674e-05,
"loss": 0.0141,
"step": 103400
},
{
"epoch": 4.738848253837437,
"grad_norm": 0.23787090182304382,
"learning_rate": 6.312611578087784e-05,
"loss": 0.0133,
"step": 103500
},
{
"epoch": 4.7434268511841395,
"grad_norm": 0.9701817035675049,
"learning_rate": 6.304898762715186e-05,
"loss": 0.0121,
"step": 103600
},
{
"epoch": 4.748005448530843,
"grad_norm": 0.5129296183586121,
"learning_rate": 6.29718261395215e-05,
"loss": 0.0161,
"step": 103700
},
{
"epoch": 4.752584045877546,
"grad_norm": 0.2481413185596466,
"learning_rate": 6.289463151509733e-05,
"loss": 0.0142,
"step": 103800
},
{
"epoch": 4.757162643224248,
"grad_norm": 0.4262784719467163,
"learning_rate": 6.281740395107462e-05,
"loss": 0.0152,
"step": 103900
},
{
"epoch": 4.761741240570951,
"grad_norm": 0.42060771584510803,
"learning_rate": 6.274014364473274e-05,
"loss": 0.0132,
"step": 104000
},
{
"epoch": 4.766319837917654,
"grad_norm": 0.2619081437587738,
"learning_rate": 6.26628507934347e-05,
"loss": 0.0124,
"step": 104100
},
{
"epoch": 4.770898435264357,
"grad_norm": 0.47017577290534973,
"learning_rate": 6.258552559462668e-05,
"loss": 0.0132,
"step": 104200
},
{
"epoch": 4.775477032611059,
"grad_norm": 0.5897017121315002,
"learning_rate": 6.250816824583747e-05,
"loss": 0.0134,
"step": 104300
},
{
"epoch": 4.780055629957762,
"grad_norm": 0.41096287965774536,
"learning_rate": 6.243077894467799e-05,
"loss": 0.0139,
"step": 104400
},
{
"epoch": 4.7846342273044655,
"grad_norm": 0.9277390241622925,
"learning_rate": 6.235335788884079e-05,
"loss": 0.0114,
"step": 104500
},
{
"epoch": 4.789212824651168,
"grad_norm": 0.4034029245376587,
"learning_rate": 6.227590527609952e-05,
"loss": 0.0117,
"step": 104600
},
{
"epoch": 4.793791421997871,
"grad_norm": 0.08527888357639313,
"learning_rate": 6.219842130430846e-05,
"loss": 0.0139,
"step": 104700
},
{
"epoch": 4.798370019344574,
"grad_norm": 0.43536534905433655,
"learning_rate": 6.2120906171402e-05,
"loss": 0.0136,
"step": 104800
},
{
"epoch": 4.8029486166912765,
"grad_norm": 0.14146916568279266,
"learning_rate": 6.204336007539412e-05,
"loss": 0.014,
"step": 104900
},
{
"epoch": 4.80752721403798,
"grad_norm": 0.2524791657924652,
"learning_rate": 6.19657832143779e-05,
"loss": 0.0149,
"step": 105000
},
{
"epoch": 4.812105811384682,
"grad_norm": 0.14325548708438873,
"learning_rate": 6.1888175786525e-05,
"loss": 0.0135,
"step": 105100
},
{
"epoch": 4.816684408731385,
"grad_norm": 0.08125073462724686,
"learning_rate": 6.181053799008519e-05,
"loss": 0.012,
"step": 105200
},
{
"epoch": 4.821263006078088,
"grad_norm": 0.765481173992157,
"learning_rate": 6.173287002338577e-05,
"loss": 0.0123,
"step": 105300
},
{
"epoch": 4.825841603424791,
"grad_norm": 0.6038789749145508,
"learning_rate": 6.165517208483117e-05,
"loss": 0.0135,
"step": 105400
},
{
"epoch": 4.830420200771494,
"grad_norm": 0.3267226219177246,
"learning_rate": 6.157744437290236e-05,
"loss": 0.012,
"step": 105500
},
{
"epoch": 4.834998798118196,
"grad_norm": 0.08704890310764313,
"learning_rate": 6.149968708615634e-05,
"loss": 0.0136,
"step": 105600
},
{
"epoch": 4.839577395464899,
"grad_norm": 0.22156277298927307,
"learning_rate": 6.142190042322569e-05,
"loss": 0.013,
"step": 105700
},
{
"epoch": 4.8441559928116025,
"grad_norm": 0.12729842960834503,
"learning_rate": 6.134408458281805e-05,
"loss": 0.014,
"step": 105800
},
{
"epoch": 4.848734590158305,
"grad_norm": 0.21868254244327545,
"learning_rate": 6.12662397637155e-05,
"loss": 0.0154,
"step": 105900
},
{
"epoch": 4.853313187505008,
"grad_norm": 0.3205544352531433,
"learning_rate": 6.118836616477427e-05,
"loss": 0.0132,
"step": 106000
},
{
"epoch": 4.857891784851711,
"grad_norm": 0.26208868622779846,
"learning_rate": 6.111046398492404e-05,
"loss": 0.0139,
"step": 106100
},
{
"epoch": 4.862470382198413,
"grad_norm": 0.6751037836074829,
"learning_rate": 6.103253342316753e-05,
"loss": 0.0129,
"step": 106200
},
{
"epoch": 4.867048979545117,
"grad_norm": 0.2062651365995407,
"learning_rate": 6.095457467857989e-05,
"loss": 0.0145,
"step": 106300
},
{
"epoch": 4.871627576891819,
"grad_norm": 0.18155290186405182,
"learning_rate": 6.087658795030837e-05,
"loss": 0.0127,
"step": 106400
},
{
"epoch": 4.876206174238522,
"grad_norm": 0.17720471322536469,
"learning_rate": 6.079857343757165e-05,
"loss": 0.0134,
"step": 106500
},
{
"epoch": 4.880784771585224,
"grad_norm": 0.09973806142807007,
"learning_rate": 6.072053133965938e-05,
"loss": 0.0116,
"step": 106600
},
{
"epoch": 4.885363368931928,
"grad_norm": 0.25288718938827515,
"learning_rate": 6.064246185593167e-05,
"loss": 0.0127,
"step": 106700
},
{
"epoch": 4.889941966278631,
"grad_norm": 0.19430892169475555,
"learning_rate": 6.056436518581864e-05,
"loss": 0.0147,
"step": 106800
},
{
"epoch": 4.894520563625333,
"grad_norm": 0.31932905316352844,
"learning_rate": 6.0486241528819795e-05,
"loss": 0.0127,
"step": 106900
},
{
"epoch": 4.899099160972036,
"grad_norm": 0.06558812409639359,
"learning_rate": 6.040809108450363e-05,
"loss": 0.0124,
"step": 107000
},
{
"epoch": 4.903677758318739,
"grad_norm": 0.20380474627017975,
"learning_rate": 6.032991405250702e-05,
"loss": 0.0147,
"step": 107100
},
{
"epoch": 4.908256355665442,
"grad_norm": 0.08541610836982727,
"learning_rate": 6.025171063253479e-05,
"loss": 0.014,
"step": 107200
},
{
"epoch": 4.912834953012145,
"grad_norm": 0.3804337978363037,
"learning_rate": 6.017348102435918e-05,
"loss": 0.0116,
"step": 107300
},
{
"epoch": 4.917413550358847,
"grad_norm": 0.3044677674770355,
"learning_rate": 6.00952254278193e-05,
"loss": 0.0141,
"step": 107400
},
{
"epoch": 4.92199214770555,
"grad_norm": 0.4350314438343048,
"learning_rate": 6.001694404282068e-05,
"loss": 0.0129,
"step": 107500
},
{
"epoch": 4.926570745052254,
"grad_norm": 0.19222760200500488,
"learning_rate": 5.993863706933468e-05,
"loss": 0.0124,
"step": 107600
},
{
"epoch": 4.931149342398956,
"grad_norm": 0.36865904927253723,
"learning_rate": 5.986030470739811e-05,
"loss": 0.0113,
"step": 107700
},
{
"epoch": 4.935727939745659,
"grad_norm": 0.20282283425331116,
"learning_rate": 5.9781947157112536e-05,
"loss": 0.013,
"step": 107800
},
{
"epoch": 4.940306537092361,
"grad_norm": 0.11859617382287979,
"learning_rate": 5.970356461864391e-05,
"loss": 0.0138,
"step": 107900
},
{
"epoch": 4.9448851344390645,
"grad_norm": 0.5312494039535522,
"learning_rate": 5.962515729222208e-05,
"loss": 0.0128,
"step": 108000
},
{
"epoch": 4.949463731785768,
"grad_norm": 0.40164250135421753,
"learning_rate": 5.95467253781401e-05,
"loss": 0.0117,
"step": 108100
},
{
"epoch": 4.95404232913247,
"grad_norm": 0.11808757483959198,
"learning_rate": 5.9468269076753894e-05,
"loss": 0.0121,
"step": 108200
},
{
"epoch": 4.958620926479173,
"grad_norm": 0.20174367725849152,
"learning_rate": 5.938978858848171e-05,
"loss": 0.0122,
"step": 108300
},
{
"epoch": 4.963199523825876,
"grad_norm": 0.33299440145492554,
"learning_rate": 5.9311284113803524e-05,
"loss": 0.0115,
"step": 108400
},
{
"epoch": 4.967778121172579,
"grad_norm": 0.6904717683792114,
"learning_rate": 5.9232755853260635e-05,
"loss": 0.0139,
"step": 108500
},
{
"epoch": 4.972356718519282,
"grad_norm": 0.17567585408687592,
"learning_rate": 5.915420400745507e-05,
"loss": 0.0118,
"step": 108600
},
{
"epoch": 4.976935315865984,
"grad_norm": 0.16880100965499878,
"learning_rate": 5.907562877704912e-05,
"loss": 0.015,
"step": 108700
},
{
"epoch": 4.981513913212687,
"grad_norm": 0.2917187213897705,
"learning_rate": 5.899703036276482e-05,
"loss": 0.0135,
"step": 108800
},
{
"epoch": 4.9860925105593905,
"grad_norm": 0.028255263343453407,
"learning_rate": 5.891840896538339e-05,
"loss": 0.0112,
"step": 108900
},
{
"epoch": 4.990671107906093,
"grad_norm": 0.2152412086725235,
"learning_rate": 5.883976478574482e-05,
"loss": 0.014,
"step": 109000
},
{
"epoch": 4.995249705252796,
"grad_norm": 0.3723663091659546,
"learning_rate": 5.876109802474725e-05,
"loss": 0.0123,
"step": 109100
},
{
"epoch": 4.999828302599498,
"grad_norm": 0.6162732243537903,
"learning_rate": 5.868240888334653e-05,
"loss": 0.0161,
"step": 109200
},
{
"epoch": 4.9999656605199,
"eval_loss": 0.17184050381183624,
"eval_runtime": 244.2658,
"eval_samples_per_second": 22.516,
"eval_steps_per_second": 22.516,
"step": 109203
},
{
"epoch": 5.0044068999462015,
"grad_norm": 0.5017980337142944,
"learning_rate": 5.860369756255566e-05,
"loss": 0.0083,
"step": 109300
},
{
"epoch": 5.008985497292905,
"grad_norm": 0.14825376868247986,
"learning_rate": 5.8524964263444324e-05,
"loss": 0.0097,
"step": 109400
},
{
"epoch": 5.013564094639607,
"grad_norm": 1.7440462112426758,
"learning_rate": 5.8446209187138324e-05,
"loss": 0.0083,
"step": 109500
},
{
"epoch": 5.01814269198631,
"grad_norm": 0.25318461656570435,
"learning_rate": 5.8367432534819124e-05,
"loss": 0.0094,
"step": 109600
},
{
"epoch": 5.022721289333012,
"grad_norm": 0.0751919150352478,
"learning_rate": 5.8288634507723274e-05,
"loss": 0.0089,
"step": 109700
},
{
"epoch": 5.027299886679716,
"grad_norm": 0.3842028081417084,
"learning_rate": 5.820981530714191e-05,
"loss": 0.0088,
"step": 109800
},
{
"epoch": 5.031878484026419,
"grad_norm": 0.11625286936759949,
"learning_rate": 5.813097513442035e-05,
"loss": 0.008,
"step": 109900
},
{
"epoch": 5.036457081373121,
"grad_norm": 0.25438615679740906,
"learning_rate": 5.805211419095736e-05,
"loss": 0.009,
"step": 110000
},
{
"epoch": 5.041035678719824,
"grad_norm": 0.13749825954437256,
"learning_rate": 5.797323267820484e-05,
"loss": 0.0092,
"step": 110100
},
{
"epoch": 5.0456142760665275,
"grad_norm": 0.06733408570289612,
"learning_rate": 5.789433079766723e-05,
"loss": 0.0097,
"step": 110200
},
{
"epoch": 5.05019287341323,
"grad_norm": 0.2959531843662262,
"learning_rate": 5.7815408750900993e-05,
"loss": 0.0071,
"step": 110300
},
{
"epoch": 5.054771470759933,
"grad_norm": 0.10893545299768448,
"learning_rate": 5.773646673951406e-05,
"loss": 0.0096,
"step": 110400
},
{
"epoch": 5.059350068106635,
"grad_norm": 0.9517889618873596,
"learning_rate": 5.765750496516547e-05,
"loss": 0.0108,
"step": 110500
},
{
"epoch": 5.063928665453338,
"grad_norm": 0.31945428252220154,
"learning_rate": 5.757852362956463e-05,
"loss": 0.0107,
"step": 110600
},
{
"epoch": 5.068507262800042,
"grad_norm": 0.2407699078321457,
"learning_rate": 5.7499522934470994e-05,
"loss": 0.0083,
"step": 110700
},
{
"epoch": 5.073085860146744,
"grad_norm": 0.15435832738876343,
"learning_rate": 5.7420503081693446e-05,
"loss": 0.0086,
"step": 110800
},
{
"epoch": 5.077664457493447,
"grad_norm": 0.4791698455810547,
"learning_rate": 5.734146427308979e-05,
"loss": 0.0072,
"step": 110900
},
{
"epoch": 5.082243054840149,
"grad_norm": 0.14484897255897522,
"learning_rate": 5.7262406710566296e-05,
"loss": 0.0105,
"step": 111000
},
{
"epoch": 5.086821652186853,
"grad_norm": 0.5574690103530884,
"learning_rate": 5.71833305960771e-05,
"loss": 0.0092,
"step": 111100
},
{
"epoch": 5.091400249533556,
"grad_norm": 0.3678722679615021,
"learning_rate": 5.7104236131623736e-05,
"loss": 0.0099,
"step": 111200
},
{
"epoch": 5.095978846880258,
"grad_norm": 0.8227113485336304,
"learning_rate": 5.702512351925464e-05,
"loss": 0.008,
"step": 111300
},
{
"epoch": 5.100557444226961,
"grad_norm": 0.13089661300182343,
"learning_rate": 5.6945992961064586e-05,
"loss": 0.0081,
"step": 111400
},
{
"epoch": 5.1051360415736635,
"grad_norm": 0.008971684612333775,
"learning_rate": 5.6866844659194185e-05,
"loss": 0.0084,
"step": 111500
},
{
"epoch": 5.109714638920367,
"grad_norm": 0.0824974775314331,
"learning_rate": 5.6787678815829404e-05,
"loss": 0.0098,
"step": 111600
},
{
"epoch": 5.11429323626707,
"grad_norm": 0.17469094693660736,
"learning_rate": 5.6708495633200964e-05,
"loss": 0.0078,
"step": 111700
},
{
"epoch": 5.118871833613772,
"grad_norm": 0.13333024084568024,
"learning_rate": 5.6629295313583974e-05,
"loss": 0.0082,
"step": 111800
},
{
"epoch": 5.123450430960475,
"grad_norm": 0.43794387578964233,
"learning_rate": 5.6550078059297205e-05,
"loss": 0.0089,
"step": 111900
},
{
"epoch": 5.1280290283071785,
"grad_norm": 0.37814435362815857,
"learning_rate": 5.6470844072702764e-05,
"loss": 0.0105,
"step": 112000
},
{
"epoch": 5.132607625653881,
"grad_norm": 0.3779330253601074,
"learning_rate": 5.639159355620551e-05,
"loss": 0.0084,
"step": 112100
},
{
"epoch": 5.137186223000584,
"grad_norm": 0.30869078636169434,
"learning_rate": 5.631232671225247e-05,
"loss": 0.0093,
"step": 112200
},
{
"epoch": 5.141764820347286,
"grad_norm": 0.3333792984485626,
"learning_rate": 5.623304374333239e-05,
"loss": 0.0108,
"step": 112300
},
{
"epoch": 5.1463434176939895,
"grad_norm": 1.2692680358886719,
"learning_rate": 5.6153744851975274e-05,
"loss": 0.0081,
"step": 112400
},
{
"epoch": 5.150922015040693,
"grad_norm": 0.017233431339263916,
"learning_rate": 5.607443024075173e-05,
"loss": 0.0075,
"step": 112500
},
{
"epoch": 5.155500612387395,
"grad_norm": 0.46397635340690613,
"learning_rate": 5.5995100112272545e-05,
"loss": 0.0108,
"step": 112600
},
{
"epoch": 5.160079209734098,
"grad_norm": 0.23527605831623077,
"learning_rate": 5.591575466918816e-05,
"loss": 0.0094,
"step": 112700
},
{
"epoch": 5.1646578070808005,
"grad_norm": 0.19655343890190125,
"learning_rate": 5.583639411418811e-05,
"loss": 0.0092,
"step": 112800
},
{
"epoch": 5.169236404427504,
"grad_norm": 0.6157360076904297,
"learning_rate": 5.575701865000054e-05,
"loss": 0.0085,
"step": 112900
},
{
"epoch": 5.173815001774207,
"grad_norm": 0.4467610716819763,
"learning_rate": 5.56776284793917e-05,
"loss": 0.0092,
"step": 113000
},
{
"epoch": 5.178393599120909,
"grad_norm": 0.16839289665222168,
"learning_rate": 5.559822380516539e-05,
"loss": 0.0093,
"step": 113100
},
{
"epoch": 5.182972196467612,
"grad_norm": 0.08081818372011185,
"learning_rate": 5.551880483016248e-05,
"loss": 0.0088,
"step": 113200
},
{
"epoch": 5.187550793814315,
"grad_norm": 0.7287288308143616,
"learning_rate": 5.543937175726035e-05,
"loss": 0.0084,
"step": 113300
},
{
"epoch": 5.192129391161018,
"grad_norm": 0.18267770111560822,
"learning_rate": 5.5359924789372396e-05,
"loss": 0.0083,
"step": 113400
},
{
"epoch": 5.196707988507721,
"grad_norm": 0.3210001587867737,
"learning_rate": 5.528046412944752e-05,
"loss": 0.0094,
"step": 113500
},
{
"epoch": 5.201286585854423,
"grad_norm": 0.21997089684009552,
"learning_rate": 5.520098998046958e-05,
"loss": 0.0089,
"step": 113600
},
{
"epoch": 5.2058651832011265,
"grad_norm": 0.24578975141048431,
"learning_rate": 5.5121502545456925e-05,
"loss": 0.0095,
"step": 113700
},
{
"epoch": 5.210443780547829,
"grad_norm": 1.2959401607513428,
"learning_rate": 5.504200202746182e-05,
"loss": 0.0085,
"step": 113800
},
{
"epoch": 5.215022377894532,
"grad_norm": 0.12553347647190094,
"learning_rate": 5.496248862956994e-05,
"loss": 0.0089,
"step": 113900
},
{
"epoch": 5.219600975241235,
"grad_norm": 0.7202230095863342,
"learning_rate": 5.488296255489991e-05,
"loss": 0.008,
"step": 114000
},
{
"epoch": 5.224179572587937,
"grad_norm": 0.7170085310935974,
"learning_rate": 5.480342400660268e-05,
"loss": 0.0104,
"step": 114100
},
{
"epoch": 5.228758169934641,
"grad_norm": 0.029888896271586418,
"learning_rate": 5.4723873187861085e-05,
"loss": 0.0092,
"step": 114200
},
{
"epoch": 5.233336767281344,
"grad_norm": 0.2950020730495453,
"learning_rate": 5.4644310301889334e-05,
"loss": 0.0089,
"step": 114300
},
{
"epoch": 5.237915364628046,
"grad_norm": 0.12343444675207138,
"learning_rate": 5.456473555193242e-05,
"loss": 0.008,
"step": 114400
},
{
"epoch": 5.242493961974749,
"grad_norm": 0.5347928404808044,
"learning_rate": 5.4485149141265667e-05,
"loss": 0.0079,
"step": 114500
},
{
"epoch": 5.247072559321452,
"grad_norm": 0.9914150834083557,
"learning_rate": 5.440555127319418e-05,
"loss": 0.0111,
"step": 114600
},
{
"epoch": 5.251651156668155,
"grad_norm": 0.24366235733032227,
"learning_rate": 5.432594215105234e-05,
"loss": 0.0085,
"step": 114700
},
{
"epoch": 5.256229754014858,
"grad_norm": 0.1021379604935646,
"learning_rate": 5.424632197820324e-05,
"loss": 0.0091,
"step": 114800
},
{
"epoch": 5.26080835136156,
"grad_norm": 0.11071757227182388,
"learning_rate": 5.4166690958038265e-05,
"loss": 0.0082,
"step": 114900
},
{
"epoch": 5.265386948708263,
"grad_norm": 1.4259638786315918,
"learning_rate": 5.408704929397648e-05,
"loss": 0.0085,
"step": 115000
},
{
"epoch": 5.269965546054966,
"grad_norm": 0.2681211531162262,
"learning_rate": 5.4007397189464105e-05,
"loss": 0.0108,
"step": 115100
},
{
"epoch": 5.274544143401669,
"grad_norm": 0.4776928126811981,
"learning_rate": 5.3927734847974064e-05,
"loss": 0.008,
"step": 115200
},
{
"epoch": 5.279122740748372,
"grad_norm": 0.38615280389785767,
"learning_rate": 5.3848062473005464e-05,
"loss": 0.0092,
"step": 115300
},
{
"epoch": 5.283701338095074,
"grad_norm": 0.23448576033115387,
"learning_rate": 5.376838026808298e-05,
"loss": 0.0099,
"step": 115400
},
{
"epoch": 5.2882799354417775,
"grad_norm": 0.11435823887586594,
"learning_rate": 5.368868843675642e-05,
"loss": 0.0093,
"step": 115500
},
{
"epoch": 5.29285853278848,
"grad_norm": 0.22706013917922974,
"learning_rate": 5.360898718260021e-05,
"loss": 0.0085,
"step": 115600
},
{
"epoch": 5.297437130135183,
"grad_norm": 0.04221300780773163,
"learning_rate": 5.3529276709212816e-05,
"loss": 0.0084,
"step": 115700
},
{
"epoch": 5.302015727481886,
"grad_norm": 0.3892548382282257,
"learning_rate": 5.344955722021624e-05,
"loss": 0.0101,
"step": 115800
},
{
"epoch": 5.3065943248285885,
"grad_norm": 0.13219723105430603,
"learning_rate": 5.336982891925559e-05,
"loss": 0.0087,
"step": 115900
},
{
"epoch": 5.311172922175292,
"grad_norm": 0.18125391006469727,
"learning_rate": 5.32900920099984e-05,
"loss": 0.0097,
"step": 116000
},
{
"epoch": 5.315751519521994,
"grad_norm": 0.14028698205947876,
"learning_rate": 5.321034669613422e-05,
"loss": 0.0088,
"step": 116100
},
{
"epoch": 5.320330116868697,
"grad_norm": 0.1114293709397316,
"learning_rate": 5.31305931813741e-05,
"loss": 0.0086,
"step": 116200
},
{
"epoch": 5.3249087142154,
"grad_norm": 0.20969901978969574,
"learning_rate": 5.3050831669450005e-05,
"loss": 0.0082,
"step": 116300
},
{
"epoch": 5.329487311562103,
"grad_norm": 0.07742590457201004,
"learning_rate": 5.297106236411432e-05,
"loss": 0.0083,
"step": 116400
},
{
"epoch": 5.334065908908806,
"grad_norm": 0.259859174489975,
"learning_rate": 5.2891285469139395e-05,
"loss": 0.0087,
"step": 116500
},
{
"epoch": 5.338644506255509,
"grad_norm": 0.3085865080356598,
"learning_rate": 5.2811501188316915e-05,
"loss": 0.0103,
"step": 116600
},
{
"epoch": 5.343223103602211,
"grad_norm": 0.27554938197135925,
"learning_rate": 5.2731709725457434e-05,
"loss": 0.0084,
"step": 116700
},
{
"epoch": 5.3478017009489145,
"grad_norm": 0.36539149284362793,
"learning_rate": 5.2651911284389896e-05,
"loss": 0.0085,
"step": 116800
},
{
"epoch": 5.352380298295617,
"grad_norm": 0.47007834911346436,
"learning_rate": 5.2572106068961026e-05,
"loss": 0.0106,
"step": 116900
},
{
"epoch": 5.35695889564232,
"grad_norm": 0.22008706629276276,
"learning_rate": 5.249229428303486e-05,
"loss": 0.0086,
"step": 117000
},
{
"epoch": 5.361537492989023,
"grad_norm": 0.02755674161016941,
"learning_rate": 5.241247613049225e-05,
"loss": 0.0093,
"step": 117100
},
{
"epoch": 5.3661160903357255,
"grad_norm": 0.11869332939386368,
"learning_rate": 5.233265181523028e-05,
"loss": 0.0086,
"step": 117200
},
{
"epoch": 5.370694687682429,
"grad_norm": 0.6038843393325806,
"learning_rate": 5.225282154116179e-05,
"loss": 0.0089,
"step": 117300
},
{
"epoch": 5.375273285029131,
"grad_norm": 0.34202539920806885,
"learning_rate": 5.217298551221483e-05,
"loss": 0.0101,
"step": 117400
},
{
"epoch": 5.379851882375834,
"grad_norm": 0.18048258125782013,
"learning_rate": 5.2093143932332176e-05,
"loss": 0.0089,
"step": 117500
},
{
"epoch": 5.384430479722537,
"grad_norm": 0.14283466339111328,
"learning_rate": 5.201329700547076e-05,
"loss": 0.0076,
"step": 117600
},
{
"epoch": 5.38900907706924,
"grad_norm": 0.5224958658218384,
"learning_rate": 5.193344493560117e-05,
"loss": 0.0091,
"step": 117700
},
{
"epoch": 5.393587674415943,
"grad_norm": 0.0608445443212986,
"learning_rate": 5.185358792670718e-05,
"loss": 0.0091,
"step": 117800
},
{
"epoch": 5.398166271762646,
"grad_norm": 0.3700086176395416,
"learning_rate": 5.177372618278511e-05,
"loss": 0.0087,
"step": 117900
},
{
"epoch": 5.402744869109348,
"grad_norm": 0.20753388106822968,
"learning_rate": 5.16938599078434e-05,
"loss": 0.0099,
"step": 118000
},
{
"epoch": 5.407323466456051,
"grad_norm": 0.13068944215774536,
"learning_rate": 5.161398930590212e-05,
"loss": 0.0099,
"step": 118100
},
{
"epoch": 5.411902063802754,
"grad_norm": 0.34820255637168884,
"learning_rate": 5.153411458099231e-05,
"loss": 0.0087,
"step": 118200
},
{
"epoch": 5.416480661149457,
"grad_norm": 0.3474198281764984,
"learning_rate": 5.145423593715557e-05,
"loss": 0.0104,
"step": 118300
},
{
"epoch": 5.42105925849616,
"grad_norm": 0.11103557795286179,
"learning_rate": 5.137435357844357e-05,
"loss": 0.0065,
"step": 118400
},
{
"epoch": 5.425637855842862,
"grad_norm": 0.08837764710187912,
"learning_rate": 5.129446770891738e-05,
"loss": 0.0078,
"step": 118500
},
{
"epoch": 5.430216453189566,
"grad_norm": 0.07470713555812836,
"learning_rate": 5.121457853264708e-05,
"loss": 0.0074,
"step": 118600
},
{
"epoch": 5.434795050536268,
"grad_norm": 0.08019549399614334,
"learning_rate": 5.1134686253711215e-05,
"loss": 0.0104,
"step": 118700
},
{
"epoch": 5.439373647882971,
"grad_norm": 0.1745513528585434,
"learning_rate": 5.105479107619624e-05,
"loss": 0.009,
"step": 118800
},
{
"epoch": 5.443952245229674,
"grad_norm": 0.07470156252384186,
"learning_rate": 5.097489320419598e-05,
"loss": 0.0083,
"step": 118900
},
{
"epoch": 5.4485308425763765,
"grad_norm": 0.5151394605636597,
"learning_rate": 5.089499284181122e-05,
"loss": 0.0083,
"step": 119000
},
{
"epoch": 5.45310943992308,
"grad_norm": 0.11218901723623276,
"learning_rate": 5.081509019314902e-05,
"loss": 0.0097,
"step": 119100
},
{
"epoch": 5.457688037269782,
"grad_norm": 0.25493118166923523,
"learning_rate": 5.073518546232234e-05,
"loss": 0.0084,
"step": 119200
},
{
"epoch": 5.462266634616485,
"grad_norm": 0.39373013377189636,
"learning_rate": 5.065527885344944e-05,
"loss": 0.0098,
"step": 119300
},
{
"epoch": 5.466845231963188,
"grad_norm": 0.5648688673973083,
"learning_rate": 5.057537057065338e-05,
"loss": 0.009,
"step": 119400
},
{
"epoch": 5.471423829309891,
"grad_norm": 0.2762792408466339,
"learning_rate": 5.049546081806149e-05,
"loss": 0.0077,
"step": 119500
},
{
"epoch": 5.476002426656594,
"grad_norm": 0.10117408633232117,
"learning_rate": 5.041554979980486e-05,
"loss": 0.0078,
"step": 119600
},
{
"epoch": 5.480581024003296,
"grad_norm": 0.7319039106369019,
"learning_rate": 5.0335637720017817e-05,
"loss": 0.0085,
"step": 119700
},
{
"epoch": 5.485159621349999,
"grad_norm": 0.4741845428943634,
"learning_rate": 5.025572478283738e-05,
"loss": 0.0084,
"step": 119800
},
{
"epoch": 5.4897382186967025,
"grad_norm": 0.2592092752456665,
"learning_rate": 5.0175811192402767e-05,
"loss": 0.0075,
"step": 119900
},
{
"epoch": 5.494316816043405,
"grad_norm": 0.03605992719531059,
"learning_rate": 5.009589715285492e-05,
"loss": 0.0056,
"step": 120000
},
{
"epoch": 5.498895413390108,
"grad_norm": 0.518429696559906,
"learning_rate": 5.0015982868335834e-05,
"loss": 0.0104,
"step": 120100
},
{
"epoch": 5.503474010736811,
"grad_norm": 0.42362892627716064,
"learning_rate": 4.993606854298817e-05,
"loss": 0.0106,
"step": 120200
},
{
"epoch": 5.5080526080835135,
"grad_norm": 0.27914491295814514,
"learning_rate": 4.985615438095473e-05,
"loss": 0.008,
"step": 120300
},
{
"epoch": 5.512631205430217,
"grad_norm": 0.12702660262584686,
"learning_rate": 4.977624058637783e-05,
"loss": 0.0094,
"step": 120400
},
{
"epoch": 5.517209802776919,
"grad_norm": 0.06755949556827545,
"learning_rate": 4.969632736339893e-05,
"loss": 0.0089,
"step": 120500
},
{
"epoch": 5.521788400123622,
"grad_norm": 0.2052990347146988,
"learning_rate": 4.961641491615794e-05,
"loss": 0.0079,
"step": 120600
},
{
"epoch": 5.526366997470325,
"grad_norm": 0.27255722880363464,
"learning_rate": 4.953650344879286e-05,
"loss": 0.0076,
"step": 120700
},
{
"epoch": 5.530945594817028,
"grad_norm": 0.10563024878501892,
"learning_rate": 4.945659316543916e-05,
"loss": 0.0087,
"step": 120800
},
{
"epoch": 5.535524192163731,
"grad_norm": 0.31879550218582153,
"learning_rate": 4.9376684270229254e-05,
"loss": 0.009,
"step": 120900
},
{
"epoch": 5.540102789510433,
"grad_norm": 0.21383854746818542,
"learning_rate": 4.929677696729207e-05,
"loss": 0.0085,
"step": 121000
},
{
"epoch": 5.544681386857136,
"grad_norm": 0.2081623524427414,
"learning_rate": 4.921687146075244e-05,
"loss": 0.0095,
"step": 121100
},
{
"epoch": 5.5492599842038395,
"grad_norm": 0.12125098705291748,
"learning_rate": 4.913696795473058e-05,
"loss": 0.0084,
"step": 121200
},
{
"epoch": 5.553838581550542,
"grad_norm": 0.17820671200752258,
"learning_rate": 4.905706665334165e-05,
"loss": 0.0081,
"step": 121300
},
{
"epoch": 5.558417178897245,
"grad_norm": 0.2230408489704132,
"learning_rate": 4.897716776069512e-05,
"loss": 0.0079,
"step": 121400
},
{
"epoch": 5.562995776243948,
"grad_norm": 0.3595784604549408,
"learning_rate": 4.889727148089439e-05,
"loss": 0.0104,
"step": 121500
},
{
"epoch": 5.56757437359065,
"grad_norm": 0.08180402964353561,
"learning_rate": 4.8817378018036073e-05,
"loss": 0.008,
"step": 121600
},
{
"epoch": 5.572152970937354,
"grad_norm": 0.13690640032291412,
"learning_rate": 4.873748757620967e-05,
"loss": 0.0093,
"step": 121700
},
{
"epoch": 5.576731568284056,
"grad_norm": 0.048987165093421936,
"learning_rate": 4.865760035949695e-05,
"loss": 0.0088,
"step": 121800
},
{
"epoch": 5.581310165630759,
"grad_norm": 0.7239773869514465,
"learning_rate": 4.857771657197142e-05,
"loss": 0.0098,
"step": 121900
},
{
"epoch": 5.585888762977461,
"grad_norm": 0.13404466211795807,
"learning_rate": 4.849783641769783e-05,
"loss": 0.0095,
"step": 122000
},
{
"epoch": 5.590467360324165,
"grad_norm": 0.30230358242988586,
"learning_rate": 4.8417960100731706e-05,
"loss": 0.0076,
"step": 122100
},
{
"epoch": 5.595045957670868,
"grad_norm": 0.169099822640419,
"learning_rate": 4.8338087825118675e-05,
"loss": 0.009,
"step": 122200
},
{
"epoch": 5.59962455501757,
"grad_norm": 0.7153336405754089,
"learning_rate": 4.8258219794894095e-05,
"loss": 0.0088,
"step": 122300
},
{
"epoch": 5.604203152364273,
"grad_norm": 0.167174831032753,
"learning_rate": 4.817835621408251e-05,
"loss": 0.0076,
"step": 122400
},
{
"epoch": 5.608781749710976,
"grad_norm": 0.16803164780139923,
"learning_rate": 4.809849728669702e-05,
"loss": 0.0079,
"step": 122500
},
{
"epoch": 5.613360347057679,
"grad_norm": 0.645155131816864,
"learning_rate": 4.80186432167389e-05,
"loss": 0.008,
"step": 122600
},
{
"epoch": 5.617938944404382,
"grad_norm": 0.1512228399515152,
"learning_rate": 4.7938794208197005e-05,
"loss": 0.0091,
"step": 122700
},
{
"epoch": 5.622517541751084,
"grad_norm": 0.28644976019859314,
"learning_rate": 4.7858950465047224e-05,
"loss": 0.0081,
"step": 122800
},
{
"epoch": 5.627096139097787,
"grad_norm": 0.5135303735733032,
"learning_rate": 4.7779112191252054e-05,
"loss": 0.0092,
"step": 122900
},
{
"epoch": 5.631674736444491,
"grad_norm": 0.38240012526512146,
"learning_rate": 4.769927959075999e-05,
"loss": 0.0105,
"step": 123000
},
{
"epoch": 5.636253333791193,
"grad_norm": 0.565757155418396,
"learning_rate": 4.761945286750499e-05,
"loss": 0.0093,
"step": 123100
},
{
"epoch": 5.640831931137896,
"grad_norm": 0.12311606109142303,
"learning_rate": 4.7539632225406095e-05,
"loss": 0.0076,
"step": 123200
},
{
"epoch": 5.645410528484598,
"grad_norm": 0.2507004737854004,
"learning_rate": 4.745981786836672e-05,
"loss": 0.0088,
"step": 123300
},
{
"epoch": 5.6499891258313015,
"grad_norm": 0.3408881425857544,
"learning_rate": 4.738001000027431e-05,
"loss": 0.0088,
"step": 123400
},
{
"epoch": 5.654567723178005,
"grad_norm": 0.6254268884658813,
"learning_rate": 4.730020882499964e-05,
"loss": 0.0091,
"step": 123500
},
{
"epoch": 5.659146320524707,
"grad_norm": 0.046281538903713226,
"learning_rate": 4.722041454639645e-05,
"loss": 0.0084,
"step": 123600
},
{
"epoch": 5.66372491787141,
"grad_norm": 0.12148924171924591,
"learning_rate": 4.714062736830088e-05,
"loss": 0.0078,
"step": 123700
},
{
"epoch": 5.668303515218113,
"grad_norm": 0.06817379593849182,
"learning_rate": 4.706084749453085e-05,
"loss": 0.0078,
"step": 123800
},
{
"epoch": 5.672882112564816,
"grad_norm": 0.11472304165363312,
"learning_rate": 4.6981075128885693e-05,
"loss": 0.0092,
"step": 123900
},
{
"epoch": 5.677460709911519,
"grad_norm": 0.7873682975769043,
"learning_rate": 4.690131047514556e-05,
"loss": 0.0082,
"step": 124000
},
{
"epoch": 5.682039307258221,
"grad_norm": 0.34170079231262207,
"learning_rate": 4.6821553737070856e-05,
"loss": 0.008,
"step": 124100
},
{
"epoch": 5.686617904604924,
"grad_norm": 0.562393844127655,
"learning_rate": 4.674180511840178e-05,
"loss": 0.0079,
"step": 124200
},
{
"epoch": 5.691196501951627,
"grad_norm": 0.32295772433280945,
"learning_rate": 4.6662064822857844e-05,
"loss": 0.0088,
"step": 124300
},
{
"epoch": 5.69577509929833,
"grad_norm": 0.09313233196735382,
"learning_rate": 4.658233305413722e-05,
"loss": 0.0083,
"step": 124400
},
{
"epoch": 5.700353696645033,
"grad_norm": 0.27240103483200073,
"learning_rate": 4.650261001591633e-05,
"loss": 0.0076,
"step": 124500
},
{
"epoch": 5.704932293991735,
"grad_norm": 0.5987135767936707,
"learning_rate": 4.642289591184934e-05,
"loss": 0.0072,
"step": 124600
},
{
"epoch": 5.7095108913384385,
"grad_norm": 0.044540900737047195,
"learning_rate": 4.6343190945567504e-05,
"loss": 0.0084,
"step": 124700
},
{
"epoch": 5.714089488685142,
"grad_norm": 0.19168873131275177,
"learning_rate": 4.626349532067879e-05,
"loss": 0.0085,
"step": 124800
},
{
"epoch": 5.718668086031844,
"grad_norm": 0.3095737397670746,
"learning_rate": 4.6183809240767314e-05,
"loss": 0.0102,
"step": 124900
},
{
"epoch": 5.723246683378547,
"grad_norm": 0.34387272596359253,
"learning_rate": 4.6104132909392765e-05,
"loss": 0.0084,
"step": 125000
},
{
"epoch": 5.727825280725249,
"grad_norm": 0.18629814684391022,
"learning_rate": 4.602446653008997e-05,
"loss": 0.0091,
"step": 125100
},
{
"epoch": 5.732403878071953,
"grad_norm": 0.1663748174905777,
"learning_rate": 4.594481030636832e-05,
"loss": 0.0094,
"step": 125200
},
{
"epoch": 5.736982475418656,
"grad_norm": 0.21490418910980225,
"learning_rate": 4.586516444171122e-05,
"loss": 0.0083,
"step": 125300
},
{
"epoch": 5.741561072765358,
"grad_norm": 0.17258259654045105,
"learning_rate": 4.57855291395757e-05,
"loss": 0.0089,
"step": 125400
},
{
"epoch": 5.746139670112061,
"grad_norm": 0.25354665517807007,
"learning_rate": 4.5705904603391716e-05,
"loss": 0.0077,
"step": 125500
},
{
"epoch": 5.750718267458764,
"grad_norm": 0.28657224774360657,
"learning_rate": 4.562629103656183e-05,
"loss": 0.0074,
"step": 125600
},
{
"epoch": 5.755296864805467,
"grad_norm": 0.36166995763778687,
"learning_rate": 4.5546688642460446e-05,
"loss": 0.0091,
"step": 125700
},
{
"epoch": 5.75987546215217,
"grad_norm": 0.19394946098327637,
"learning_rate": 4.5467097624433524e-05,
"loss": 0.0097,
"step": 125800
},
{
"epoch": 5.764454059498872,
"grad_norm": 0.16516007483005524,
"learning_rate": 4.538751818579797e-05,
"loss": 0.0085,
"step": 125900
},
{
"epoch": 5.769032656845575,
"grad_norm": 0.2279433161020279,
"learning_rate": 4.530795052984104e-05,
"loss": 0.0078,
"step": 126000
},
{
"epoch": 5.773611254192279,
"grad_norm": 0.5914369225502014,
"learning_rate": 4.522839485981994e-05,
"loss": 0.0085,
"step": 126100
},
{
"epoch": 5.778189851538981,
"grad_norm": 0.06345394253730774,
"learning_rate": 4.514885137896127e-05,
"loss": 0.0096,
"step": 126200
},
{
"epoch": 5.782768448885684,
"grad_norm": 0.2646149694919586,
"learning_rate": 4.506932029046044e-05,
"loss": 0.0073,
"step": 126300
},
{
"epoch": 5.787347046232386,
"grad_norm": 0.8094835877418518,
"learning_rate": 4.498980179748123e-05,
"loss": 0.0082,
"step": 126400
},
{
"epoch": 5.79192564357909,
"grad_norm": 0.4164597988128662,
"learning_rate": 4.4910296103155296e-05,
"loss": 0.0079,
"step": 126500
},
{
"epoch": 5.796504240925793,
"grad_norm": 0.3092726469039917,
"learning_rate": 4.48308034105815e-05,
"loss": 0.0102,
"step": 126600
},
{
"epoch": 5.801082838272495,
"grad_norm": 0.2584327161312103,
"learning_rate": 4.475132392282556e-05,
"loss": 0.0084,
"step": 126700
},
{
"epoch": 5.805661435619198,
"grad_norm": 0.07558545470237732,
"learning_rate": 4.467185784291946e-05,
"loss": 0.008,
"step": 126800
},
{
"epoch": 5.8102400329659005,
"grad_norm": 0.1425691694021225,
"learning_rate": 4.459240537386089e-05,
"loss": 0.0095,
"step": 126900
},
{
"epoch": 5.814818630312604,
"grad_norm": 0.4250103235244751,
"learning_rate": 4.451296671861282e-05,
"loss": 0.009,
"step": 127000
},
{
"epoch": 5.819397227659307,
"grad_norm": 0.06756921857595444,
"learning_rate": 4.443354208010291e-05,
"loss": 0.0073,
"step": 127100
},
{
"epoch": 5.823975825006009,
"grad_norm": 0.2185693234205246,
"learning_rate": 4.4354131661222996e-05,
"loss": 0.0072,
"step": 127200
},
{
"epoch": 5.828554422352712,
"grad_norm": 0.3645274341106415,
"learning_rate": 4.427473566482863e-05,
"loss": 0.0106,
"step": 127300
},
{
"epoch": 5.8331330196994156,
"grad_norm": 0.26136744022369385,
"learning_rate": 4.4195354293738484e-05,
"loss": 0.0085,
"step": 127400
},
{
"epoch": 5.837711617046118,
"grad_norm": 0.1584431380033493,
"learning_rate": 4.4115987750733914e-05,
"loss": 0.0067,
"step": 127500
},
{
"epoch": 5.842290214392821,
"grad_norm": 0.3366251587867737,
"learning_rate": 4.4036636238558335e-05,
"loss": 0.0072,
"step": 127600
},
{
"epoch": 5.846868811739523,
"grad_norm": 0.1969982236623764,
"learning_rate": 4.39572999599168e-05,
"loss": 0.0099,
"step": 127700
},
{
"epoch": 5.8514474090862265,
"grad_norm": 0.178545281291008,
"learning_rate": 4.3877979117475486e-05,
"loss": 0.0063,
"step": 127800
},
{
"epoch": 5.856026006432929,
"grad_norm": 0.3591267168521881,
"learning_rate": 4.379867391386106e-05,
"loss": 0.0074,
"step": 127900
},
{
"epoch": 5.860604603779632,
"grad_norm": 0.11651629209518433,
"learning_rate": 4.371938455166028e-05,
"loss": 0.0079,
"step": 128000
},
{
"epoch": 5.865183201126335,
"grad_norm": 0.19086627662181854,
"learning_rate": 4.364011123341947e-05,
"loss": 0.0067,
"step": 128100
},
{
"epoch": 5.8697617984730375,
"grad_norm": 0.0712941512465477,
"learning_rate": 4.35608541616439e-05,
"loss": 0.0099,
"step": 128200
},
{
"epoch": 5.874340395819741,
"grad_norm": 0.26921433210372925,
"learning_rate": 4.348161353879737e-05,
"loss": 0.0107,
"step": 128300
},
{
"epoch": 5.878918993166444,
"grad_norm": 0.6659551858901978,
"learning_rate": 4.340238956730169e-05,
"loss": 0.0081,
"step": 128400
},
{
"epoch": 5.883497590513146,
"grad_norm": 1.7324509620666504,
"learning_rate": 4.3323182449536095e-05,
"loss": 0.0076,
"step": 128500
},
{
"epoch": 5.888076187859849,
"grad_norm": 0.4373182952404022,
"learning_rate": 4.3243992387836755e-05,
"loss": 0.0063,
"step": 128600
},
{
"epoch": 5.892654785206552,
"grad_norm": 0.45876213908195496,
"learning_rate": 4.316481958449634e-05,
"loss": 0.008,
"step": 128700
},
{
"epoch": 5.897233382553255,
"grad_norm": 0.18616245687007904,
"learning_rate": 4.308566424176336e-05,
"loss": 0.0072,
"step": 128800
},
{
"epoch": 5.901811979899958,
"grad_norm": 0.056702371686697006,
"learning_rate": 4.3006526561841725e-05,
"loss": 0.0086,
"step": 128900
},
{
"epoch": 5.90639057724666,
"grad_norm": 0.38554903864860535,
"learning_rate": 4.292740674689031e-05,
"loss": 0.0078,
"step": 129000
},
{
"epoch": 5.9109691745933635,
"grad_norm": 0.6524538397789001,
"learning_rate": 4.284830499902223e-05,
"loss": 0.0093,
"step": 129100
},
{
"epoch": 5.915547771940066,
"grad_norm": 0.3187253475189209,
"learning_rate": 4.276922152030454e-05,
"loss": 0.0075,
"step": 129200
},
{
"epoch": 5.920126369286769,
"grad_norm": 0.208381786942482,
"learning_rate": 4.269015651275761e-05,
"loss": 0.0073,
"step": 129300
},
{
"epoch": 5.924704966633472,
"grad_norm": 0.2706379294395447,
"learning_rate": 4.261111017835456e-05,
"loss": 0.0074,
"step": 129400
},
{
"epoch": 5.929283563980174,
"grad_norm": 0.8774177432060242,
"learning_rate": 4.253208271902091e-05,
"loss": 0.008,
"step": 129500
},
{
"epoch": 5.933862161326878,
"grad_norm": 0.22220508754253387,
"learning_rate": 4.245307433663388e-05,
"loss": 0.0078,
"step": 129600
},
{
"epoch": 5.938440758673581,
"grad_norm": 0.37277668714523315,
"learning_rate": 4.237408523302203e-05,
"loss": 0.0073,
"step": 129700
},
{
"epoch": 5.943019356020283,
"grad_norm": 0.1921541541814804,
"learning_rate": 4.229511560996459e-05,
"loss": 0.0082,
"step": 129800
},
{
"epoch": 5.947597953366986,
"grad_norm": 0.8308386206626892,
"learning_rate": 4.221616566919107e-05,
"loss": 0.0085,
"step": 129900
},
{
"epoch": 5.952176550713689,
"grad_norm": 0.11215928941965103,
"learning_rate": 4.213723561238074e-05,
"loss": 0.0081,
"step": 130000
},
{
"epoch": 5.956755148060392,
"grad_norm": 0.6458770632743835,
"learning_rate": 4.205832564116201e-05,
"loss": 0.0091,
"step": 130100
},
{
"epoch": 5.961333745407094,
"grad_norm": 0.2930019199848175,
"learning_rate": 4.197943595711198e-05,
"loss": 0.0059,
"step": 130200
},
{
"epoch": 5.965912342753797,
"grad_norm": 0.08667781949043274,
"learning_rate": 4.190056676175602e-05,
"loss": 0.0072,
"step": 130300
},
{
"epoch": 5.9704909401005,
"grad_norm": 0.34257155656814575,
"learning_rate": 4.1821718256567034e-05,
"loss": 0.0076,
"step": 130400
},
{
"epoch": 5.975069537447203,
"grad_norm": 0.2989988327026367,
"learning_rate": 4.174289064296514e-05,
"loss": 0.0104,
"step": 130500
},
{
"epoch": 5.979648134793906,
"grad_norm": 0.6057233810424805,
"learning_rate": 4.1664084122317124e-05,
"loss": 0.0065,
"step": 130600
},
{
"epoch": 5.984226732140609,
"grad_norm": 0.16379669308662415,
"learning_rate": 4.15852988959358e-05,
"loss": 0.0072,
"step": 130700
},
{
"epoch": 5.988805329487311,
"grad_norm": 0.061728738248348236,
"learning_rate": 4.150653516507964e-05,
"loss": 0.0076,
"step": 130800
},
{
"epoch": 5.9933839268340146,
"grad_norm": 0.19023200869560242,
"learning_rate": 4.142779313095223e-05,
"loss": 0.0074,
"step": 130900
},
{
"epoch": 5.997962524180718,
"grad_norm": 0.2615407109260559,
"learning_rate": 4.134907299470165e-05,
"loss": 0.0087,
"step": 131000
},
{
"epoch": 5.999977107013266,
"eval_loss": 0.13592004776000977,
"eval_runtime": 244.1354,
"eval_samples_per_second": 22.528,
"eval_steps_per_second": 22.528,
"step": 131044
},
{
"epoch": 6.00254112152742,
"grad_norm": 0.12518206238746643,
"learning_rate": 4.127037495742013e-05,
"loss": 0.0077,
"step": 131100
},
{
"epoch": 6.007119718874123,
"grad_norm": 0.1018320843577385,
"learning_rate": 4.119169922014339e-05,
"loss": 0.0043,
"step": 131200
},
{
"epoch": 6.0116983162208255,
"grad_norm": 0.09295986592769623,
"learning_rate": 4.111304598385018e-05,
"loss": 0.0061,
"step": 131300
},
{
"epoch": 6.016276913567529,
"grad_norm": 0.05357728898525238,
"learning_rate": 4.103441544946184e-05,
"loss": 0.0056,
"step": 131400
},
{
"epoch": 6.020855510914231,
"grad_norm": 0.08241847157478333,
"learning_rate": 4.095580781784162e-05,
"loss": 0.0059,
"step": 131500
},
{
"epoch": 6.025434108260934,
"grad_norm": 0.12265779078006744,
"learning_rate": 4.087722328979438e-05,
"loss": 0.0033,
"step": 131600
},
{
"epoch": 6.030012705607637,
"grad_norm": 0.11975305527448654,
"learning_rate": 4.079866206606582e-05,
"loss": 0.0061,
"step": 131700
},
{
"epoch": 6.03459130295434,
"grad_norm": 0.15824288129806519,
"learning_rate": 4.072012434734222e-05,
"loss": 0.0066,
"step": 131800
},
{
"epoch": 6.039169900301043,
"grad_norm": 0.2796044647693634,
"learning_rate": 4.06416103342498e-05,
"loss": 0.0055,
"step": 131900
},
{
"epoch": 6.043748497647746,
"grad_norm": 0.1359216570854187,
"learning_rate": 4.056312022735417e-05,
"loss": 0.006,
"step": 132000
},
{
"epoch": 6.048327094994448,
"grad_norm": 0.24055655300617218,
"learning_rate": 4.0484654227159914e-05,
"loss": 0.0072,
"step": 132100
},
{
"epoch": 6.0529056923411515,
"grad_norm": 0.4629483222961426,
"learning_rate": 4.040621253411004e-05,
"loss": 0.0059,
"step": 132200
},
{
"epoch": 6.057484289687854,
"grad_norm": 0.3944862186908722,
"learning_rate": 4.032779534858544e-05,
"loss": 0.0059,
"step": 132300
},
{
"epoch": 6.062062887034557,
"grad_norm": 0.45347368717193604,
"learning_rate": 4.0249402870904396e-05,
"loss": 0.0061,
"step": 132400
},
{
"epoch": 6.06664148438126,
"grad_norm": 0.587853729724884,
"learning_rate": 4.017103530132212e-05,
"loss": 0.0074,
"step": 132500
},
{
"epoch": 6.0712200817279625,
"grad_norm": 0.6638960242271423,
"learning_rate": 4.0092692840030134e-05,
"loss": 0.0071,
"step": 132600
},
{
"epoch": 6.075798679074666,
"grad_norm": 0.28217336535453796,
"learning_rate": 4.0014375687155844e-05,
"loss": 0.0055,
"step": 132700
},
{
"epoch": 6.080377276421368,
"grad_norm": 0.19813333451747894,
"learning_rate": 3.993608404276205e-05,
"loss": 0.0066,
"step": 132800
},
{
"epoch": 6.084955873768071,
"grad_norm": 0.06923089921474457,
"learning_rate": 3.985781810684631e-05,
"loss": 0.006,
"step": 132900
},
{
"epoch": 6.089534471114774,
"grad_norm": 0.5418972969055176,
"learning_rate": 3.9779578079340554e-05,
"loss": 0.0051,
"step": 133000
},
{
"epoch": 6.094113068461477,
"grad_norm": 0.1508362740278244,
"learning_rate": 3.970136416011056e-05,
"loss": 0.0049,
"step": 133100
},
{
"epoch": 6.09869166580818,
"grad_norm": 0.26092636585235596,
"learning_rate": 3.962317654895533e-05,
"loss": 0.0054,
"step": 133200
},
{
"epoch": 6.103270263154882,
"grad_norm": 0.3573048412799835,
"learning_rate": 3.9545015445606736e-05,
"loss": 0.007,
"step": 133300
},
{
"epoch": 6.107848860501585,
"grad_norm": 0.033060140907764435,
"learning_rate": 3.946688104972891e-05,
"loss": 0.0045,
"step": 133400
},
{
"epoch": 6.1124274578482884,
"grad_norm": 0.05039636045694351,
"learning_rate": 3.9388773560917724e-05,
"loss": 0.0048,
"step": 133500
},
{
"epoch": 6.117006055194991,
"grad_norm": 0.41924425959587097,
"learning_rate": 3.931069317870039e-05,
"loss": 0.0065,
"step": 133600
},
{
"epoch": 6.121584652541694,
"grad_norm": 0.33294302225112915,
"learning_rate": 3.9232640102534786e-05,
"loss": 0.0054,
"step": 133700
},
{
"epoch": 6.126163249888397,
"grad_norm": 0.025311682373285294,
"learning_rate": 3.915461453180914e-05,
"loss": 0.0048,
"step": 133800
},
{
"epoch": 6.130741847235099,
"grad_norm": 0.13680239021778107,
"learning_rate": 3.907661666584131e-05,
"loss": 0.0055,
"step": 133900
},
{
"epoch": 6.135320444581803,
"grad_norm": 0.2641524076461792,
"learning_rate": 3.899864670387844e-05,
"loss": 0.0063,
"step": 134000
},
{
"epoch": 6.139899041928505,
"grad_norm": 0.28719770908355713,
"learning_rate": 3.892070484509642e-05,
"loss": 0.0052,
"step": 134100
},
{
"epoch": 6.144477639275208,
"grad_norm": 0.19892792403697968,
"learning_rate": 3.884279128859927e-05,
"loss": 0.0045,
"step": 134200
},
{
"epoch": 6.149056236621911,
"grad_norm": 0.16647031903266907,
"learning_rate": 3.8764906233418775e-05,
"loss": 0.0062,
"step": 134300
},
{
"epoch": 6.1536348339686135,
"grad_norm": 0.18115417659282684,
"learning_rate": 3.86870498785139e-05,
"loss": 0.0053,
"step": 134400
},
{
"epoch": 6.158213431315317,
"grad_norm": 0.7846788167953491,
"learning_rate": 3.860922242277028e-05,
"loss": 0.006,
"step": 134500
},
{
"epoch": 6.162792028662019,
"grad_norm": 0.056557830423116684,
"learning_rate": 3.853142406499972e-05,
"loss": 0.0068,
"step": 134600
},
{
"epoch": 6.167370626008722,
"grad_norm": 0.37449362874031067,
"learning_rate": 3.845365500393974e-05,
"loss": 0.0055,
"step": 134700
},
{
"epoch": 6.171949223355425,
"grad_norm": 1.3642663955688477,
"learning_rate": 3.837591543825296e-05,
"loss": 0.0052,
"step": 134800
},
{
"epoch": 6.176527820702128,
"grad_norm": 0.022911841049790382,
"learning_rate": 3.8298205566526676e-05,
"loss": 0.0042,
"step": 134900
},
{
"epoch": 6.181106418048831,
"grad_norm": 0.028689689934253693,
"learning_rate": 3.8220525587272384e-05,
"loss": 0.0062,
"step": 135000
},
{
"epoch": 6.185685015395533,
"grad_norm": 0.3197433650493622,
"learning_rate": 3.814287569892512e-05,
"loss": 0.0059,
"step": 135100
},
{
"epoch": 6.190263612742236,
"grad_norm": 0.06785603612661362,
"learning_rate": 3.806525609984312e-05,
"loss": 0.0049,
"step": 135200
},
{
"epoch": 6.1948422100889395,
"grad_norm": 0.3414902985095978,
"learning_rate": 3.7987666988307244e-05,
"loss": 0.0053,
"step": 135300
},
{
"epoch": 6.199420807435642,
"grad_norm": 0.1857975274324417,
"learning_rate": 3.791010856252043e-05,
"loss": 0.0058,
"step": 135400
},
{
"epoch": 6.203999404782345,
"grad_norm": 0.1203976720571518,
"learning_rate": 3.7832581020607284e-05,
"loss": 0.0077,
"step": 135500
},
{
"epoch": 6.208578002129048,
"grad_norm": 0.3408762514591217,
"learning_rate": 3.7755084560613455e-05,
"loss": 0.0065,
"step": 135600
},
{
"epoch": 6.2131565994757505,
"grad_norm": 0.0590222142636776,
"learning_rate": 3.767761938050528e-05,
"loss": 0.0055,
"step": 135700
},
{
"epoch": 6.217735196822454,
"grad_norm": 0.662112295627594,
"learning_rate": 3.760018567816908e-05,
"loss": 0.0059,
"step": 135800
},
{
"epoch": 6.222313794169156,
"grad_norm": 0.005584437865763903,
"learning_rate": 3.752278365141084e-05,
"loss": 0.0067,
"step": 135900
},
{
"epoch": 6.226892391515859,
"grad_norm": 0.15262584388256073,
"learning_rate": 3.744541349795564e-05,
"loss": 0.0065,
"step": 136000
},
{
"epoch": 6.231470988862562,
"grad_norm": 0.644844114780426,
"learning_rate": 3.7368075415447086e-05,
"loss": 0.0047,
"step": 136100
},
{
"epoch": 6.236049586209265,
"grad_norm": 0.06777459383010864,
"learning_rate": 3.729076960144687e-05,
"loss": 0.0052,
"step": 136200
},
{
"epoch": 6.240628183555968,
"grad_norm": 0.7604510188102722,
"learning_rate": 3.721349625343431e-05,
"loss": 0.0054,
"step": 136300
},
{
"epoch": 6.24520678090267,
"grad_norm": 0.3464463949203491,
"learning_rate": 3.71362555688057e-05,
"loss": 0.0053,
"step": 136400
},
{
"epoch": 6.249785378249373,
"grad_norm": 0.032986678183078766,
"learning_rate": 3.705904774487396e-05,
"loss": 0.0053,
"step": 136500
},
{
"epoch": 6.2543639755960765,
"grad_norm": 0.270702987909317,
"learning_rate": 3.6981872978868065e-05,
"loss": 0.0056,
"step": 136600
},
{
"epoch": 6.258942572942779,
"grad_norm": 0.5898098945617676,
"learning_rate": 3.6904731467932493e-05,
"loss": 0.0047,
"step": 136700
},
{
"epoch": 6.263521170289482,
"grad_norm": 0.07433097064495087,
"learning_rate": 3.682762340912681e-05,
"loss": 0.0057,
"step": 136800
},
{
"epoch": 6.268099767636184,
"grad_norm": 0.208632692694664,
"learning_rate": 3.675054899942515e-05,
"loss": 0.0064,
"step": 136900
},
{
"epoch": 6.2726783649828874,
"grad_norm": 0.48827114701271057,
"learning_rate": 3.6673508435715634e-05,
"loss": 0.0056,
"step": 137000
},
{
"epoch": 6.277256962329591,
"grad_norm": 0.15773746371269226,
"learning_rate": 3.659650191479994e-05,
"loss": 0.0059,
"step": 137100
},
{
"epoch": 6.281835559676293,
"grad_norm": 0.46037283539772034,
"learning_rate": 3.651952963339282e-05,
"loss": 0.0052,
"step": 137200
},
{
"epoch": 6.286414157022996,
"grad_norm": 0.07779065519571304,
"learning_rate": 3.6442591788121505e-05,
"loss": 0.0051,
"step": 137300
},
{
"epoch": 6.290992754369698,
"grad_norm": 0.5138252377510071,
"learning_rate": 3.6365688575525315e-05,
"loss": 0.0053,
"step": 137400
},
{
"epoch": 6.295571351716402,
"grad_norm": 0.21173468232154846,
"learning_rate": 3.628882019205506e-05,
"loss": 0.0058,
"step": 137500
},
{
"epoch": 6.300149949063105,
"grad_norm": 0.4661062955856323,
"learning_rate": 3.621198683407258e-05,
"loss": 0.0053,
"step": 137600
},
{
"epoch": 6.304728546409807,
"grad_norm": 0.2002924084663391,
"learning_rate": 3.613518869785025e-05,
"loss": 0.0054,
"step": 137700
},
{
"epoch": 6.30930714375651,
"grad_norm": 0.24317267537117004,
"learning_rate": 3.6058425979570485e-05,
"loss": 0.0057,
"step": 137800
},
{
"epoch": 6.313885741103213,
"grad_norm": 0.14312195777893066,
"learning_rate": 3.598169887532521e-05,
"loss": 0.0059,
"step": 137900
},
{
"epoch": 6.318464338449916,
"grad_norm": 0.07625292241573334,
"learning_rate": 3.590500758111537e-05,
"loss": 0.006,
"step": 138000
},
{
"epoch": 6.323042935796619,
"grad_norm": 0.07330285757780075,
"learning_rate": 3.582835229285042e-05,
"loss": 0.0044,
"step": 138100
},
{
"epoch": 6.327621533143321,
"grad_norm": 0.025587473064661026,
"learning_rate": 3.5751733206347894e-05,
"loss": 0.0054,
"step": 138200
},
{
"epoch": 6.332200130490024,
"grad_norm": 0.09335857629776001,
"learning_rate": 3.567515051733277e-05,
"loss": 0.0062,
"step": 138300
},
{
"epoch": 6.336778727836728,
"grad_norm": 0.031704433262348175,
"learning_rate": 3.559860442143709e-05,
"loss": 0.0063,
"step": 138400
},
{
"epoch": 6.34135732518343,
"grad_norm": 0.09114887565374374,
"learning_rate": 3.552209511419943e-05,
"loss": 0.0045,
"step": 138500
},
{
"epoch": 6.345935922530133,
"grad_norm": 0.023929867893457413,
"learning_rate": 3.5445622791064356e-05,
"loss": 0.0053,
"step": 138600
},
{
"epoch": 6.350514519876835,
"grad_norm": 1.4821025133132935,
"learning_rate": 3.5369187647381974e-05,
"loss": 0.0056,
"step": 138700
},
{
"epoch": 6.3550931172235385,
"grad_norm": 0.16608977317810059,
"learning_rate": 3.529278987840744e-05,
"loss": 0.0055,
"step": 138800
},
{
"epoch": 6.359671714570242,
"grad_norm": 0.21598820388317108,
"learning_rate": 3.5216429679300376e-05,
"loss": 0.0051,
"step": 138900
},
{
"epoch": 6.364250311916944,
"grad_norm": 0.016882436349987984,
"learning_rate": 3.5140107245124476e-05,
"loss": 0.0052,
"step": 139000
},
{
"epoch": 6.368828909263647,
"grad_norm": 0.05500126630067825,
"learning_rate": 3.506382277084696e-05,
"loss": 0.0043,
"step": 139100
},
{
"epoch": 6.37340750661035,
"grad_norm": 0.03151680901646614,
"learning_rate": 3.4987576451338055e-05,
"loss": 0.0056,
"step": 139200
},
{
"epoch": 6.377986103957053,
"grad_norm": 0.07861995697021484,
"learning_rate": 3.491136848137053e-05,
"loss": 0.0044,
"step": 139300
},
{
"epoch": 6.382564701303756,
"grad_norm": 0.12852996587753296,
"learning_rate": 3.483519905561924e-05,
"loss": 0.0045,
"step": 139400
},
{
"epoch": 6.387143298650458,
"grad_norm": 0.6488528847694397,
"learning_rate": 3.475906836866046e-05,
"loss": 0.0043,
"step": 139500
},
{
"epoch": 6.391721895997161,
"grad_norm": 0.3967334032058716,
"learning_rate": 3.468297661497164e-05,
"loss": 0.0069,
"step": 139600
},
{
"epoch": 6.396300493343864,
"grad_norm": 0.1223519891500473,
"learning_rate": 3.460692398893068e-05,
"loss": 0.0054,
"step": 139700
},
{
"epoch": 6.400879090690567,
"grad_norm": 0.15460754930973053,
"learning_rate": 3.453091068481559e-05,
"loss": 0.0056,
"step": 139800
},
{
"epoch": 6.40545768803727,
"grad_norm": 0.040079645812511444,
"learning_rate": 3.445493689680388e-05,
"loss": 0.0055,
"step": 139900
},
{
"epoch": 6.410036285383972,
"grad_norm": 0.020937960594892502,
"learning_rate": 3.4379002818972124e-05,
"loss": 0.0054,
"step": 140000
},
{
"epoch": 6.4146148827306755,
"grad_norm": 0.21271613240242004,
"learning_rate": 3.43031086452955e-05,
"loss": 0.0046,
"step": 140100
},
{
"epoch": 6.419193480077379,
"grad_norm": 0.09579429775476456,
"learning_rate": 3.4227254569647205e-05,
"loss": 0.0051,
"step": 140200
},
{
"epoch": 6.423772077424081,
"grad_norm": 0.043937426060438156,
"learning_rate": 3.4151440785798004e-05,
"loss": 0.0053,
"step": 140300
},
{
"epoch": 6.428350674770784,
"grad_norm": 0.4987468123435974,
"learning_rate": 3.4075667487415785e-05,
"loss": 0.0058,
"step": 140400
},
{
"epoch": 6.432929272117486,
"grad_norm": 0.6420878171920776,
"learning_rate": 3.399993486806495e-05,
"loss": 0.0067,
"step": 140500
},
{
"epoch": 6.43750786946419,
"grad_norm": 0.39332908391952515,
"learning_rate": 3.392424312120601e-05,
"loss": 0.0052,
"step": 140600
},
{
"epoch": 6.442086466810893,
"grad_norm": 0.06132081523537636,
"learning_rate": 3.384859244019511e-05,
"loss": 0.0047,
"step": 140700
},
{
"epoch": 6.446665064157595,
"grad_norm": 0.23858435451984406,
"learning_rate": 3.377298301828343e-05,
"loss": 0.0047,
"step": 140800
},
{
"epoch": 6.451243661504298,
"grad_norm": 0.32406067848205566,
"learning_rate": 3.3697415048616765e-05,
"loss": 0.0055,
"step": 140900
},
{
"epoch": 6.455822258851001,
"grad_norm": 1.4054268598556519,
"learning_rate": 3.362188872423506e-05,
"loss": 0.0051,
"step": 141000
},
{
"epoch": 6.460400856197704,
"grad_norm": 0.05585220828652382,
"learning_rate": 3.354640423807183e-05,
"loss": 0.0062,
"step": 141100
},
{
"epoch": 6.464979453544407,
"grad_norm": 0.018032953143119812,
"learning_rate": 3.347096178295371e-05,
"loss": 0.0037,
"step": 141200
},
{
"epoch": 6.469558050891109,
"grad_norm": 0.907580554485321,
"learning_rate": 3.339556155160004e-05,
"loss": 0.006,
"step": 141300
},
{
"epoch": 6.474136648237812,
"grad_norm": 0.08372417092323303,
"learning_rate": 3.3320203736622184e-05,
"loss": 0.0057,
"step": 141400
},
{
"epoch": 6.478715245584516,
"grad_norm": 0.39907532930374146,
"learning_rate": 3.324488853052326e-05,
"loss": 0.0044,
"step": 141500
},
{
"epoch": 6.483293842931218,
"grad_norm": 1.1346983909606934,
"learning_rate": 3.3169616125697486e-05,
"loss": 0.0048,
"step": 141600
},
{
"epoch": 6.487872440277921,
"grad_norm": 0.8341863751411438,
"learning_rate": 3.3094386714429724e-05,
"loss": 0.0047,
"step": 141700
},
{
"epoch": 6.492451037624623,
"grad_norm": 0.07596173137426376,
"learning_rate": 3.301920048889506e-05,
"loss": 0.0041,
"step": 141800
},
{
"epoch": 6.497029634971327,
"grad_norm": 0.7775061130523682,
"learning_rate": 3.294405764115823e-05,
"loss": 0.0049,
"step": 141900
},
{
"epoch": 6.50160823231803,
"grad_norm": 0.30017733573913574,
"learning_rate": 3.286895836317319e-05,
"loss": 0.0043,
"step": 142000
},
{
"epoch": 6.506186829664732,
"grad_norm": 0.3541896343231201,
"learning_rate": 3.2793902846782534e-05,
"loss": 0.0055,
"step": 142100
},
{
"epoch": 6.510765427011435,
"grad_norm": 0.1445729285478592,
"learning_rate": 3.271889128371712e-05,
"loss": 0.0054,
"step": 142200
},
{
"epoch": 6.5153440243581375,
"grad_norm": 0.20019569993019104,
"learning_rate": 3.2643923865595536e-05,
"loss": 0.005,
"step": 142300
},
{
"epoch": 6.519922621704841,
"grad_norm": 0.21448808908462524,
"learning_rate": 3.2569000783923544e-05,
"loss": 0.0051,
"step": 142400
},
{
"epoch": 6.524501219051544,
"grad_norm": 0.13675835728645325,
"learning_rate": 3.249412223009368e-05,
"loss": 0.0046,
"step": 142500
},
{
"epoch": 6.529079816398246,
"grad_norm": 0.11393424868583679,
"learning_rate": 3.2419288395384785e-05,
"loss": 0.004,
"step": 142600
},
{
"epoch": 6.533658413744949,
"grad_norm": 0.2209634631872177,
"learning_rate": 3.234449947096135e-05,
"loss": 0.0042,
"step": 142700
},
{
"epoch": 6.538237011091653,
"grad_norm": 0.025969982147216797,
"learning_rate": 3.226975564787322e-05,
"loss": 0.0059,
"step": 142800
},
{
"epoch": 6.542815608438355,
"grad_norm": 0.8372477293014526,
"learning_rate": 3.2195057117055036e-05,
"loss": 0.0042,
"step": 142900
},
{
"epoch": 6.547394205785058,
"grad_norm": 0.8654465675354004,
"learning_rate": 3.212040406932569e-05,
"loss": 0.0046,
"step": 143000
},
{
"epoch": 6.55197280313176,
"grad_norm": 0.614989161491394,
"learning_rate": 3.204579669538792e-05,
"loss": 0.0052,
"step": 143100
},
{
"epoch": 6.5565514004784635,
"grad_norm": 0.31863656640052795,
"learning_rate": 3.19712351858278e-05,
"loss": 0.0053,
"step": 143200
},
{
"epoch": 6.561129997825166,
"grad_norm": 0.5576188564300537,
"learning_rate": 3.1896719731114186e-05,
"loss": 0.0053,
"step": 143300
},
{
"epoch": 6.565708595171869,
"grad_norm": 0.17591483891010284,
"learning_rate": 3.182225052159833e-05,
"loss": 0.0049,
"step": 143400
},
{
"epoch": 6.570287192518572,
"grad_norm": 0.29361802339553833,
"learning_rate": 3.174782774751338e-05,
"loss": 0.0053,
"step": 143500
},
{
"epoch": 6.5748657898652745,
"grad_norm": 0.046735215932130814,
"learning_rate": 3.167345159897378e-05,
"loss": 0.0047,
"step": 143600
},
{
"epoch": 6.579444387211978,
"grad_norm": 0.3643573224544525,
"learning_rate": 3.1599122265974946e-05,
"loss": 0.0041,
"step": 143700
},
{
"epoch": 6.584022984558681,
"grad_norm": 0.1328035444021225,
"learning_rate": 3.152483993839265e-05,
"loss": 0.0045,
"step": 143800
},
{
"epoch": 6.588601581905383,
"grad_norm": 0.11621426790952682,
"learning_rate": 3.145060480598263e-05,
"loss": 0.0045,
"step": 143900
},
{
"epoch": 6.593180179252086,
"grad_norm": 0.3485720753669739,
"learning_rate": 3.137641705838004e-05,
"loss": 0.0057,
"step": 144000
},
{
"epoch": 6.597758776598789,
"grad_norm": 0.7195566892623901,
"learning_rate": 3.1302276885098955e-05,
"loss": 0.0057,
"step": 144100
},
{
"epoch": 6.602337373945492,
"grad_norm": 0.07403887808322906,
"learning_rate": 3.122818447553201e-05,
"loss": 0.0052,
"step": 144200
},
{
"epoch": 6.606915971292195,
"grad_norm": 0.07861506193876266,
"learning_rate": 3.115414001894974e-05,
"loss": 0.0054,
"step": 144300
},
{
"epoch": 6.611494568638897,
"grad_norm": 0.20306392014026642,
"learning_rate": 3.108014370450021e-05,
"loss": 0.0063,
"step": 144400
},
{
"epoch": 6.6160731659856005,
"grad_norm": 0.5941068530082703,
"learning_rate": 3.100619572120854e-05,
"loss": 0.0052,
"step": 144500
},
{
"epoch": 6.620651763332303,
"grad_norm": 0.4385504126548767,
"learning_rate": 3.0932296257976336e-05,
"loss": 0.0049,
"step": 144600
},
{
"epoch": 6.625230360679006,
"grad_norm": 0.07183999568223953,
"learning_rate": 3.0858445503581266e-05,
"loss": 0.0054,
"step": 144700
},
{
"epoch": 6.629808958025709,
"grad_norm": 0.04285150766372681,
"learning_rate": 3.0784643646676635e-05,
"loss": 0.0046,
"step": 144800
},
{
"epoch": 6.634387555372411,
"grad_norm": 0.04266421124339104,
"learning_rate": 3.071089087579074e-05,
"loss": 0.0064,
"step": 144900
},
{
"epoch": 6.638966152719115,
"grad_norm": 0.4585297405719757,
"learning_rate": 3.063718737932655e-05,
"loss": 0.0053,
"step": 145000
},
{
"epoch": 6.643544750065818,
"grad_norm": 0.04796597734093666,
"learning_rate": 3.0563533345561155e-05,
"loss": 0.0052,
"step": 145100
},
{
"epoch": 6.64812334741252,
"grad_norm": 0.7312212586402893,
"learning_rate": 3.0489928962645275e-05,
"loss": 0.0047,
"step": 145200
},
{
"epoch": 6.652701944759223,
"grad_norm": 0.4970768988132477,
"learning_rate": 3.041637441860279e-05,
"loss": 0.005,
"step": 145300
},
{
"epoch": 6.657280542105926,
"grad_norm": 0.28591519594192505,
"learning_rate": 3.0342869901330313e-05,
"loss": 0.0047,
"step": 145400
},
{
"epoch": 6.661859139452629,
"grad_norm": 0.019086016342043877,
"learning_rate": 3.02694155985966e-05,
"loss": 0.0052,
"step": 145500
},
{
"epoch": 6.666437736799331,
"grad_norm": 0.06344935297966003,
"learning_rate": 3.019601169804216e-05,
"loss": 0.0054,
"step": 145600
},
{
"epoch": 6.671016334146034,
"grad_norm": 0.2660221755504608,
"learning_rate": 3.012265838717878e-05,
"loss": 0.0049,
"step": 145700
},
{
"epoch": 6.675594931492737,
"grad_norm": 0.22985537350177765,
"learning_rate": 3.0049355853388955e-05,
"loss": 0.0049,
"step": 145800
},
{
"epoch": 6.68017352883944,
"grad_norm": 0.06822630017995834,
"learning_rate": 2.9976104283925515e-05,
"loss": 0.004,
"step": 145900
},
{
"epoch": 6.684752126186143,
"grad_norm": 0.03378499671816826,
"learning_rate": 2.9902903865911068e-05,
"loss": 0.0062,
"step": 146000
},
{
"epoch": 6.689330723532846,
"grad_norm": 0.3799358904361725,
"learning_rate": 2.9829754786337603e-05,
"loss": 0.0056,
"step": 146100
},
{
"epoch": 6.693909320879548,
"grad_norm": 0.2396411895751953,
"learning_rate": 2.975665723206591e-05,
"loss": 0.0049,
"step": 146200
},
{
"epoch": 6.698487918226252,
"grad_norm": 0.19714663922786713,
"learning_rate": 2.9683611389825167e-05,
"loss": 0.0057,
"step": 146300
},
{
"epoch": 6.703066515572954,
"grad_norm": 0.07194243371486664,
"learning_rate": 2.9610617446212495e-05,
"loss": 0.0058,
"step": 146400
},
{
"epoch": 6.707645112919657,
"grad_norm": 0.567692220211029,
"learning_rate": 2.9537675587692382e-05,
"loss": 0.0045,
"step": 146500
},
{
"epoch": 6.71222371026636,
"grad_norm": 0.4618910551071167,
"learning_rate": 2.946478600059629e-05,
"loss": 0.0051,
"step": 146600
},
{
"epoch": 6.7168023076130625,
"grad_norm": 0.09115318953990936,
"learning_rate": 2.939194887112218e-05,
"loss": 0.0046,
"step": 146700
},
{
"epoch": 6.721380904959766,
"grad_norm": 0.1926048994064331,
"learning_rate": 2.9319164385333953e-05,
"loss": 0.0039,
"step": 146800
},
{
"epoch": 6.725959502306468,
"grad_norm": 0.5767799615859985,
"learning_rate": 2.9246432729161055e-05,
"loss": 0.0068,
"step": 146900
},
{
"epoch": 6.730538099653171,
"grad_norm": 0.5855737328529358,
"learning_rate": 2.917375408839803e-05,
"loss": 0.0039,
"step": 147000
},
{
"epoch": 6.735116696999874,
"grad_norm": 0.18008683621883392,
"learning_rate": 2.910112864870388e-05,
"loss": 0.0053,
"step": 147100
},
{
"epoch": 6.739695294346577,
"grad_norm": 0.013704614713788033,
"learning_rate": 2.9028556595601786e-05,
"loss": 0.0058,
"step": 147200
},
{
"epoch": 6.74427389169328,
"grad_norm": 0.2103748619556427,
"learning_rate": 2.895603811447858e-05,
"loss": 0.0053,
"step": 147300
},
{
"epoch": 6.748852489039983,
"grad_norm": 0.3199872374534607,
"learning_rate": 2.888357339058413e-05,
"loss": 0.0045,
"step": 147400
},
{
"epoch": 6.753431086386685,
"grad_norm": 0.1987699270248413,
"learning_rate": 2.8811162609031104e-05,
"loss": 0.0038,
"step": 147500
},
{
"epoch": 6.7580096837333885,
"grad_norm": 0.12141498178243637,
"learning_rate": 2.8738805954794295e-05,
"loss": 0.004,
"step": 147600
},
{
"epoch": 6.762588281080091,
"grad_norm": 0.24456042051315308,
"learning_rate": 2.8666503612710226e-05,
"loss": 0.0052,
"step": 147700
},
{
"epoch": 6.767166878426794,
"grad_norm": 0.21112100780010223,
"learning_rate": 2.8594255767476718e-05,
"loss": 0.0057,
"step": 147800
},
{
"epoch": 6.771745475773497,
"grad_norm": 0.395063579082489,
"learning_rate": 2.852206260365237e-05,
"loss": 0.0051,
"step": 147900
},
{
"epoch": 6.7763240731201995,
"grad_norm": 0.39365246891975403,
"learning_rate": 2.8449924305656107e-05,
"loss": 0.0043,
"step": 148000
},
{
"epoch": 6.780902670466903,
"grad_norm": 0.19307668507099152,
"learning_rate": 2.8377841057766624e-05,
"loss": 0.0057,
"step": 148100
},
{
"epoch": 6.785481267813605,
"grad_norm": 0.3313720226287842,
"learning_rate": 2.8305813044122097e-05,
"loss": 0.0054,
"step": 148200
},
{
"epoch": 6.790059865160308,
"grad_norm": 0.6470041871070862,
"learning_rate": 2.8233840448719532e-05,
"loss": 0.0048,
"step": 148300
},
{
"epoch": 6.794638462507011,
"grad_norm": 0.9007655382156372,
"learning_rate": 2.8161923455414367e-05,
"loss": 0.0055,
"step": 148400
},
{
"epoch": 6.799217059853714,
"grad_norm": 0.8383020758628845,
"learning_rate": 2.8090062247920045e-05,
"loss": 0.005,
"step": 148500
},
{
"epoch": 6.803795657200417,
"grad_norm": 0.2168063223361969,
"learning_rate": 2.80182570098075e-05,
"loss": 0.0045,
"step": 148600
},
{
"epoch": 6.80837425454712,
"grad_norm": 0.1763121336698532,
"learning_rate": 2.794650792450464e-05,
"loss": 0.0058,
"step": 148700
},
{
"epoch": 6.812952851893822,
"grad_norm": 0.022952038794755936,
"learning_rate": 2.7874815175296e-05,
"loss": 0.0043,
"step": 148800
},
{
"epoch": 6.8175314492405255,
"grad_norm": 0.01308775506913662,
"learning_rate": 2.7803178945322134e-05,
"loss": 0.0047,
"step": 148900
},
{
"epoch": 6.822110046587228,
"grad_norm": 0.0029964440036565065,
"learning_rate": 2.7731599417579245e-05,
"loss": 0.0052,
"step": 149000
},
{
"epoch": 6.826688643933931,
"grad_norm": 0.2904300093650818,
"learning_rate": 2.7660076774918708e-05,
"loss": 0.0039,
"step": 149100
},
{
"epoch": 6.831267241280633,
"grad_norm": 0.19335739314556122,
"learning_rate": 2.7588611200046592e-05,
"loss": 0.004,
"step": 149200
},
{
"epoch": 6.835845838627336,
"grad_norm": 0.04841936379671097,
"learning_rate": 2.7517202875523117e-05,
"loss": 0.0048,
"step": 149300
},
{
"epoch": 6.84042443597404,
"grad_norm": 0.12522141635417938,
"learning_rate": 2.7445851983762344e-05,
"loss": 0.004,
"step": 149400
},
{
"epoch": 6.845003033320742,
"grad_norm": 0.17885401844978333,
"learning_rate": 2.737455870703155e-05,
"loss": 0.0055,
"step": 149500
},
{
"epoch": 6.849581630667445,
"grad_norm": 0.47798067331314087,
"learning_rate": 2.7303323227450857e-05,
"loss": 0.005,
"step": 149600
},
{
"epoch": 6.854160228014148,
"grad_norm": 0.02613680437207222,
"learning_rate": 2.7232145726992752e-05,
"loss": 0.0065,
"step": 149700
},
{
"epoch": 6.8587388253608506,
"grad_norm": 0.2435833066701889,
"learning_rate": 2.7161026387481636e-05,
"loss": 0.0061,
"step": 149800
},
{
"epoch": 6.863317422707554,
"grad_norm": 0.08455272018909454,
"learning_rate": 2.7089965390593263e-05,
"loss": 0.0059,
"step": 149900
},
{
"epoch": 6.867896020054256,
"grad_norm": 0.08332820981740952,
"learning_rate": 2.7018962917854418e-05,
"loss": 0.0042,
"step": 150000
},
{
"epoch": 6.872474617400959,
"grad_norm": 0.34127193689346313,
"learning_rate": 2.6948019150642383e-05,
"loss": 0.0029,
"step": 150100
},
{
"epoch": 6.877053214747662,
"grad_norm": 0.1273692101240158,
"learning_rate": 2.6877134270184435e-05,
"loss": 0.0051,
"step": 150200
},
{
"epoch": 6.881631812094365,
"grad_norm": 0.14943909645080566,
"learning_rate": 2.6806308457557423e-05,
"loss": 0.0062,
"step": 150300
},
{
"epoch": 6.886210409441068,
"grad_norm": 0.14334943890571594,
"learning_rate": 2.6735541893687343e-05,
"loss": 0.0056,
"step": 150400
},
{
"epoch": 6.89078900678777,
"grad_norm": 0.15444263815879822,
"learning_rate": 2.666483475934885e-05,
"loss": 0.0045,
"step": 150500
},
{
"epoch": 6.895367604134473,
"grad_norm": 0.31661495566368103,
"learning_rate": 2.6594187235164713e-05,
"loss": 0.0063,
"step": 150600
},
{
"epoch": 6.8999462014811765,
"grad_norm": 0.4060909152030945,
"learning_rate": 2.65235995016055e-05,
"loss": 0.0052,
"step": 150700
},
{
"epoch": 6.904524798827879,
"grad_norm": 0.20253728330135345,
"learning_rate": 2.645307173898901e-05,
"loss": 0.0064,
"step": 150800
},
{
"epoch": 6.909103396174582,
"grad_norm": 0.7078954577445984,
"learning_rate": 2.6382604127479815e-05,
"loss": 0.0044,
"step": 150900
},
{
"epoch": 6.913681993521285,
"grad_norm": 0.14812923967838287,
"learning_rate": 2.6312196847088893e-05,
"loss": 0.0052,
"step": 151000
},
{
"epoch": 6.9182605908679875,
"grad_norm": 0.10642609000205994,
"learning_rate": 2.6241850077673087e-05,
"loss": 0.0052,
"step": 151100
},
{
"epoch": 6.922839188214691,
"grad_norm": 0.15536317229270935,
"learning_rate": 2.6171563998934605e-05,
"loss": 0.0053,
"step": 151200
},
{
"epoch": 6.927417785561393,
"grad_norm": 0.1677425354719162,
"learning_rate": 2.6101338790420715e-05,
"loss": 0.0048,
"step": 151300
},
{
"epoch": 6.931996382908096,
"grad_norm": 0.1952294558286667,
"learning_rate": 2.6031174631523118e-05,
"loss": 0.0059,
"step": 151400
},
{
"epoch": 6.9365749802547985,
"grad_norm": 0.171901673078537,
"learning_rate": 2.5961071701477567e-05,
"loss": 0.0049,
"step": 151500
},
{
"epoch": 6.941153577601502,
"grad_norm": 0.0632760226726532,
"learning_rate": 2.589103017936344e-05,
"loss": 0.0043,
"step": 151600
},
{
"epoch": 6.945732174948205,
"grad_norm": 0.14387081563472748,
"learning_rate": 2.582105024410325e-05,
"loss": 0.0046,
"step": 151700
},
{
"epoch": 6.950310772294907,
"grad_norm": 0.1789962351322174,
"learning_rate": 2.575113207446213e-05,
"loss": 0.0041,
"step": 151800
},
{
"epoch": 6.95488936964161,
"grad_norm": 0.05178796499967575,
"learning_rate": 2.5681275849047482e-05,
"loss": 0.0052,
"step": 151900
},
{
"epoch": 6.9594679669883135,
"grad_norm": 0.06059027463197708,
"learning_rate": 2.5611481746308473e-05,
"loss": 0.0049,
"step": 152000
},
{
"epoch": 6.964046564335016,
"grad_norm": 0.02760574221611023,
"learning_rate": 2.5541749944535554e-05,
"loss": 0.005,
"step": 152100
},
{
"epoch": 6.968625161681719,
"grad_norm": 0.5041255950927734,
"learning_rate": 2.547208062185999e-05,
"loss": 0.0034,
"step": 152200
},
{
"epoch": 6.973203759028421,
"grad_norm": 0.12533140182495117,
"learning_rate": 2.5402473956253515e-05,
"loss": 0.0059,
"step": 152300
},
{
"epoch": 6.9777823563751245,
"grad_norm": 0.0706457793712616,
"learning_rate": 2.5332930125527787e-05,
"loss": 0.006,
"step": 152400
},
{
"epoch": 6.982360953721828,
"grad_norm": 0.37089434266090393,
"learning_rate": 2.5263449307333908e-05,
"loss": 0.0052,
"step": 152500
},
{
"epoch": 6.98693955106853,
"grad_norm": 0.034625936299562454,
"learning_rate": 2.5194031679162067e-05,
"loss": 0.0048,
"step": 152600
},
{
"epoch": 6.991518148415233,
"grad_norm": 0.19594725966453552,
"learning_rate": 2.512467741834099e-05,
"loss": 0.0048,
"step": 152700
},
{
"epoch": 6.996096745761935,
"grad_norm": 0.09410729259252548,
"learning_rate": 2.505538670203754e-05,
"loss": 0.0043,
"step": 152800
},
{
"epoch": 6.9999885535066335,
"eval_loss": 0.1694260537624359,
"eval_runtime": 268.0609,
"eval_samples_per_second": 20.518,
"eval_steps_per_second": 20.518,
"step": 152885
},
{
"epoch": 7.000675343108639,
"grad_norm": 0.03295362740755081,
"learning_rate": 2.4986159707256274e-05,
"loss": 0.0039,
"step": 152900
},
{
"epoch": 7.005253940455342,
"grad_norm": 0.06891167163848877,
"learning_rate": 2.4916996610838973e-05,
"loss": 0.0035,
"step": 153000
},
{
"epoch": 7.009832537802044,
"grad_norm": 0.12001962214708328,
"learning_rate": 2.484789758946414e-05,
"loss": 0.0034,
"step": 153100
},
{
"epoch": 7.014411135148747,
"grad_norm": 0.2218388170003891,
"learning_rate": 2.477886281964667e-05,
"loss": 0.0026,
"step": 153200
},
{
"epoch": 7.01898973249545,
"grad_norm": 0.006664707791060209,
"learning_rate": 2.4709892477737262e-05,
"loss": 0.0028,
"step": 153300
},
{
"epoch": 7.023568329842153,
"grad_norm": 0.0030799272935837507,
"learning_rate": 2.464098673992205e-05,
"loss": 0.0035,
"step": 153400
},
{
"epoch": 7.028146927188856,
"grad_norm": 0.14536774158477783,
"learning_rate": 2.457214578222215e-05,
"loss": 0.0028,
"step": 153500
},
{
"epoch": 7.032725524535558,
"grad_norm": 0.14756208658218384,
"learning_rate": 2.450336978049322e-05,
"loss": 0.0031,
"step": 153600
},
{
"epoch": 7.037304121882261,
"grad_norm": 0.22443515062332153,
"learning_rate": 2.44346589104249e-05,
"loss": 0.0044,
"step": 153700
},
{
"epoch": 7.041882719228965,
"grad_norm": 0.05271737277507782,
"learning_rate": 2.4366013347540545e-05,
"loss": 0.0034,
"step": 153800
},
{
"epoch": 7.046461316575667,
"grad_norm": 0.10101396590471268,
"learning_rate": 2.4297433267196668e-05,
"loss": 0.0043,
"step": 153900
},
{
"epoch": 7.05103991392237,
"grad_norm": 0.042054641991853714,
"learning_rate": 2.422891884458241e-05,
"loss": 0.0034,
"step": 154000
},
{
"epoch": 7.055618511269072,
"grad_norm": 0.3988908529281616,
"learning_rate": 2.4160470254719285e-05,
"loss": 0.0033,
"step": 154100
},
{
"epoch": 7.0601971086157755,
"grad_norm": 0.02858237735927105,
"learning_rate": 2.4092087672460623e-05,
"loss": 0.004,
"step": 154200
},
{
"epoch": 7.064775705962479,
"grad_norm": 0.023146772757172585,
"learning_rate": 2.4023771272491125e-05,
"loss": 0.0033,
"step": 154300
},
{
"epoch": 7.069354303309181,
"grad_norm": 0.2908150553703308,
"learning_rate": 2.39555212293264e-05,
"loss": 0.0028,
"step": 154400
},
{
"epoch": 7.073932900655884,
"grad_norm": 0.15492355823516846,
"learning_rate": 2.38873377173126e-05,
"loss": 0.003,
"step": 154500
},
{
"epoch": 7.0785114980025865,
"grad_norm": 0.15679115056991577,
"learning_rate": 2.3819220910625882e-05,
"loss": 0.002,
"step": 154600
},
{
"epoch": 7.08309009534929,
"grad_norm": 0.08304117619991302,
"learning_rate": 2.3751170983272e-05,
"loss": 0.0037,
"step": 154700
},
{
"epoch": 7.087668692695993,
"grad_norm": 0.19912482798099518,
"learning_rate": 2.368318810908588e-05,
"loss": 0.0038,
"step": 154800
},
{
"epoch": 7.092247290042695,
"grad_norm": 0.163644939661026,
"learning_rate": 2.3615272461731186e-05,
"loss": 0.0046,
"step": 154900
},
{
"epoch": 7.096825887389398,
"grad_norm": 0.564191460609436,
"learning_rate": 2.3547424214699786e-05,
"loss": 0.0027,
"step": 155000
},
{
"epoch": 7.1014044847361015,
"grad_norm": 0.21988272666931152,
"learning_rate": 2.347964354131144e-05,
"loss": 0.0032,
"step": 155100
},
{
"epoch": 7.105983082082804,
"grad_norm": 0.028988847509026527,
"learning_rate": 2.3411930614713247e-05,
"loss": 0.0038,
"step": 155200
},
{
"epoch": 7.110561679429507,
"grad_norm": 0.23106561601161957,
"learning_rate": 2.3344285607879224e-05,
"loss": 0.0026,
"step": 155300
},
{
"epoch": 7.115140276776209,
"grad_norm": 0.08767526596784592,
"learning_rate": 2.3276708693609943e-05,
"loss": 0.0044,
"step": 155400
},
{
"epoch": 7.1197188741229125,
"grad_norm": 0.11939793080091476,
"learning_rate": 2.3209200044532027e-05,
"loss": 0.0028,
"step": 155500
},
{
"epoch": 7.124297471469616,
"grad_norm": 0.19785170257091522,
"learning_rate": 2.3141759833097653e-05,
"loss": 0.003,
"step": 155600
},
{
"epoch": 7.128876068816318,
"grad_norm": 0.889499843120575,
"learning_rate": 2.307438823158425e-05,
"loss": 0.0024,
"step": 155700
},
{
"epoch": 7.133454666163021,
"grad_norm": 0.5035886764526367,
"learning_rate": 2.300708541209393e-05,
"loss": 0.0039,
"step": 155800
},
{
"epoch": 7.1380332635097234,
"grad_norm": 0.04118403419852257,
"learning_rate": 2.2939851546553094e-05,
"loss": 0.0038,
"step": 155900
},
{
"epoch": 7.142611860856427,
"grad_norm": 0.3574579358100891,
"learning_rate": 2.2872686806712035e-05,
"loss": 0.0028,
"step": 156000
},
{
"epoch": 7.14719045820313,
"grad_norm": 0.005380525719374418,
"learning_rate": 2.2805591364144447e-05,
"loss": 0.0028,
"step": 156100
},
{
"epoch": 7.151769055549832,
"grad_norm": 0.0275371465831995,
"learning_rate": 2.273856539024703e-05,
"loss": 0.0029,
"step": 156200
},
{
"epoch": 7.156347652896535,
"grad_norm": 0.05387549847364426,
"learning_rate": 2.2671609056238952e-05,
"loss": 0.0026,
"step": 156300
},
{
"epoch": 7.160926250243238,
"grad_norm": 0.25863537192344666,
"learning_rate": 2.2604722533161572e-05,
"loss": 0.0022,
"step": 156400
},
{
"epoch": 7.165504847589941,
"grad_norm": 0.03250390663743019,
"learning_rate": 2.2537905991877855e-05,
"loss": 0.0026,
"step": 156500
},
{
"epoch": 7.170083444936644,
"grad_norm": 0.15917915105819702,
"learning_rate": 2.2471159603071995e-05,
"loss": 0.0047,
"step": 156600
},
{
"epoch": 7.174662042283346,
"grad_norm": 0.25873464345932007,
"learning_rate": 2.2404483537249023e-05,
"loss": 0.0041,
"step": 156700
},
{
"epoch": 7.179240639630049,
"grad_norm": 0.03446133807301521,
"learning_rate": 2.233787796473432e-05,
"loss": 0.0027,
"step": 156800
},
{
"epoch": 7.183819236976753,
"grad_norm": 0.39116761088371277,
"learning_rate": 2.2271343055673144e-05,
"loss": 0.0027,
"step": 156900
},
{
"epoch": 7.188397834323455,
"grad_norm": 0.005667871795594692,
"learning_rate": 2.22048789800303e-05,
"loss": 0.0032,
"step": 157000
},
{
"epoch": 7.192976431670158,
"grad_norm": 0.2927045226097107,
"learning_rate": 2.2138485907589613e-05,
"loss": 0.0033,
"step": 157100
},
{
"epoch": 7.19755502901686,
"grad_norm": 0.15872865915298462,
"learning_rate": 2.2072164007953517e-05,
"loss": 0.0029,
"step": 157200
},
{
"epoch": 7.202133626363564,
"grad_norm": 0.5440332293510437,
"learning_rate": 2.200591345054267e-05,
"loss": 0.0037,
"step": 157300
},
{
"epoch": 7.206712223710267,
"grad_norm": 0.2492242008447647,
"learning_rate": 2.193973440459549e-05,
"loss": 0.0029,
"step": 157400
},
{
"epoch": 7.211290821056969,
"grad_norm": 0.0664735659956932,
"learning_rate": 2.187362703916766e-05,
"loss": 0.0036,
"step": 157500
},
{
"epoch": 7.215869418403672,
"grad_norm": 0.006082352716475725,
"learning_rate": 2.1807591523131827e-05,
"loss": 0.0023,
"step": 157600
},
{
"epoch": 7.2204480157503745,
"grad_norm": 0.008978066965937614,
"learning_rate": 2.1741628025177036e-05,
"loss": 0.0031,
"step": 157700
},
{
"epoch": 7.225026613097078,
"grad_norm": 0.034269288182258606,
"learning_rate": 2.167573671380837e-05,
"loss": 0.005,
"step": 157800
},
{
"epoch": 7.229605210443781,
"grad_norm": 0.2424388974905014,
"learning_rate": 2.1609917757346542e-05,
"loss": 0.0031,
"step": 157900
},
{
"epoch": 7.234183807790483,
"grad_norm": 0.07712133228778839,
"learning_rate": 2.1544171323927415e-05,
"loss": 0.003,
"step": 158000
},
{
"epoch": 7.238762405137186,
"grad_norm": 0.4302210509777069,
"learning_rate": 2.1478497581501616e-05,
"loss": 0.0034,
"step": 158100
},
{
"epoch": 7.243341002483889,
"grad_norm": 0.09475143998861313,
"learning_rate": 2.141289669783401e-05,
"loss": 0.0028,
"step": 158200
},
{
"epoch": 7.247919599830592,
"grad_norm": 0.35180267691612244,
"learning_rate": 2.134736884050343e-05,
"loss": 0.0042,
"step": 158300
},
{
"epoch": 7.252498197177295,
"grad_norm": 0.013500731438398361,
"learning_rate": 2.1281914176902108e-05,
"loss": 0.0043,
"step": 158400
},
{
"epoch": 7.257076794523997,
"grad_norm": 0.23346847295761108,
"learning_rate": 2.1216532874235285e-05,
"loss": 0.0031,
"step": 158500
},
{
"epoch": 7.2616553918707005,
"grad_norm": 0.16270193457603455,
"learning_rate": 2.115122509952085e-05,
"loss": 0.004,
"step": 158600
},
{
"epoch": 7.266233989217403,
"grad_norm": 0.23846475780010223,
"learning_rate": 2.1085991019588863e-05,
"loss": 0.0027,
"step": 158700
},
{
"epoch": 7.270812586564106,
"grad_norm": 0.027605965733528137,
"learning_rate": 2.1020830801081077e-05,
"loss": 0.0026,
"step": 158800
},
{
"epoch": 7.275391183910809,
"grad_norm": 0.01757560484111309,
"learning_rate": 2.0955744610450618e-05,
"loss": 0.0036,
"step": 158900
},
{
"epoch": 7.2799697812575115,
"grad_norm": 0.02324344404041767,
"learning_rate": 2.0890732613961478e-05,
"loss": 0.0029,
"step": 159000
},
{
"epoch": 7.284548378604215,
"grad_norm": 0.01704220287501812,
"learning_rate": 2.0825794977688108e-05,
"loss": 0.0037,
"step": 159100
},
{
"epoch": 7.289126975950918,
"grad_norm": 0.08089294284582138,
"learning_rate": 2.0760931867515032e-05,
"loss": 0.0035,
"step": 159200
},
{
"epoch": 7.29370557329762,
"grad_norm": 0.15005187690258026,
"learning_rate": 2.0696143449136402e-05,
"loss": 0.0022,
"step": 159300
},
{
"epoch": 7.298284170644323,
"grad_norm": 0.0878557413816452,
"learning_rate": 2.063142988805552e-05,
"loss": 0.0035,
"step": 159400
},
{
"epoch": 7.302862767991026,
"grad_norm": 0.012229022569954395,
"learning_rate": 2.056679134958453e-05,
"loss": 0.0026,
"step": 159500
},
{
"epoch": 7.307441365337729,
"grad_norm": 0.053704481571912766,
"learning_rate": 2.050222799884387e-05,
"loss": 0.0036,
"step": 159600
},
{
"epoch": 7.312019962684432,
"grad_norm": 0.5345095992088318,
"learning_rate": 2.0437740000761925e-05,
"loss": 0.0038,
"step": 159700
},
{
"epoch": 7.316598560031134,
"grad_norm": 0.09854476153850555,
"learning_rate": 2.037332752007461e-05,
"loss": 0.0031,
"step": 159800
},
{
"epoch": 7.3211771573778375,
"grad_norm": 0.04005116969347,
"learning_rate": 2.0308990721324927e-05,
"loss": 0.0027,
"step": 159900
},
{
"epoch": 7.32575575472454,
"grad_norm": 1.264863133430481,
"learning_rate": 2.0244729768862518e-05,
"loss": 0.0034,
"step": 160000
},
{
"epoch": 7.330334352071243,
"grad_norm": 0.017268653959035873,
"learning_rate": 2.01805448268433e-05,
"loss": 0.0037,
"step": 160100
},
{
"epoch": 7.334912949417946,
"grad_norm": 0.10752640664577484,
"learning_rate": 2.0116436059229038e-05,
"loss": 0.0035,
"step": 160200
},
{
"epoch": 7.339491546764648,
"grad_norm": 0.43235811591148376,
"learning_rate": 2.0052403629786858e-05,
"loss": 0.0027,
"step": 160300
},
{
"epoch": 7.344070144111352,
"grad_norm": 0.014576783403754234,
"learning_rate": 1.9988447702088898e-05,
"loss": 0.0035,
"step": 160400
},
{
"epoch": 7.348648741458054,
"grad_norm": 0.1350947916507721,
"learning_rate": 1.9924568439511876e-05,
"loss": 0.0032,
"step": 160500
},
{
"epoch": 7.353227338804757,
"grad_norm": 0.24974310398101807,
"learning_rate": 1.98607660052367e-05,
"loss": 0.0038,
"step": 160600
},
{
"epoch": 7.35780593615146,
"grad_norm": 0.05233803018927574,
"learning_rate": 1.9797040562247948e-05,
"loss": 0.0041,
"step": 160700
},
{
"epoch": 7.362384533498163,
"grad_norm": 0.18822649121284485,
"learning_rate": 1.9733392273333596e-05,
"loss": 0.0037,
"step": 160800
},
{
"epoch": 7.366963130844866,
"grad_norm": 0.19756104052066803,
"learning_rate": 1.9669821301084475e-05,
"loss": 0.0027,
"step": 160900
},
{
"epoch": 7.371541728191568,
"grad_norm": 0.00448650261387229,
"learning_rate": 1.9606327807893902e-05,
"loss": 0.0032,
"step": 161000
},
{
"epoch": 7.376120325538271,
"grad_norm": 0.14489981532096863,
"learning_rate": 1.954291195595733e-05,
"loss": 0.0031,
"step": 161100
},
{
"epoch": 7.380698922884974,
"grad_norm": 0.0051267268136143684,
"learning_rate": 1.947957390727185e-05,
"loss": 0.003,
"step": 161200
},
{
"epoch": 7.385277520231677,
"grad_norm": 0.38486120104789734,
"learning_rate": 1.941631382363576e-05,
"loss": 0.0035,
"step": 161300
},
{
"epoch": 7.38985611757838,
"grad_norm": 0.004985155537724495,
"learning_rate": 1.9353131866648273e-05,
"loss": 0.0024,
"step": 161400
},
{
"epoch": 7.394434714925083,
"grad_norm": 0.002783630508929491,
"learning_rate": 1.929002819770896e-05,
"loss": 0.0034,
"step": 161500
},
{
"epoch": 7.399013312271785,
"grad_norm": 0.2842748165130615,
"learning_rate": 1.922700297801741e-05,
"loss": 0.0034,
"step": 161600
},
{
"epoch": 7.403591909618489,
"grad_norm": 0.050929997116327286,
"learning_rate": 1.9164056368572846e-05,
"loss": 0.003,
"step": 161700
},
{
"epoch": 7.408170506965191,
"grad_norm": 0.06748020648956299,
"learning_rate": 1.9101188530173687e-05,
"loss": 0.0032,
"step": 161800
},
{
"epoch": 7.412749104311894,
"grad_norm": 0.03134176880121231,
"learning_rate": 1.9038399623417063e-05,
"loss": 0.0023,
"step": 161900
},
{
"epoch": 7.417327701658597,
"grad_norm": 0.06679194420576096,
"learning_rate": 1.897568980869855e-05,
"loss": 0.0032,
"step": 162000
},
{
"epoch": 7.4219062990052995,
"grad_norm": 0.22911858558654785,
"learning_rate": 1.8913059246211612e-05,
"loss": 0.0033,
"step": 162100
},
{
"epoch": 7.426484896352003,
"grad_norm": 0.12825864553451538,
"learning_rate": 1.8850508095947332e-05,
"loss": 0.0029,
"step": 162200
},
{
"epoch": 7.431063493698705,
"grad_norm": 0.022259972989559174,
"learning_rate": 1.8788036517693858e-05,
"loss": 0.004,
"step": 162300
},
{
"epoch": 7.435642091045408,
"grad_norm": 0.09766406565904617,
"learning_rate": 1.8725644671036126e-05,
"loss": 0.0033,
"step": 162400
},
{
"epoch": 7.440220688392111,
"grad_norm": 0.6670352816581726,
"learning_rate": 1.8663332715355396e-05,
"loss": 0.0032,
"step": 162500
},
{
"epoch": 7.444799285738814,
"grad_norm": 0.009802890941500664,
"learning_rate": 1.8601100809828787e-05,
"loss": 0.0039,
"step": 162600
},
{
"epoch": 7.449377883085517,
"grad_norm": 0.08977996557950974,
"learning_rate": 1.853894911342901e-05,
"loss": 0.0029,
"step": 162700
},
{
"epoch": 7.45395648043222,
"grad_norm": 0.713555097579956,
"learning_rate": 1.847687778492382e-05,
"loss": 0.0027,
"step": 162800
},
{
"epoch": 7.458535077778922,
"grad_norm": 0.3743430972099304,
"learning_rate": 1.8414886982875664e-05,
"loss": 0.0034,
"step": 162900
},
{
"epoch": 7.4631136751256255,
"grad_norm": 0.0767466276884079,
"learning_rate": 1.8352976865641326e-05,
"loss": 0.0032,
"step": 163000
},
{
"epoch": 7.467692272472328,
"grad_norm": 0.28391310572624207,
"learning_rate": 1.8291147591371482e-05,
"loss": 0.0035,
"step": 163100
},
{
"epoch": 7.472270869819031,
"grad_norm": 0.25534164905548096,
"learning_rate": 1.822939931801024e-05,
"loss": 0.0028,
"step": 163200
},
{
"epoch": 7.476849467165734,
"grad_norm": 0.03635001927614212,
"learning_rate": 1.816773220329484e-05,
"loss": 0.0035,
"step": 163300
},
{
"epoch": 7.4814280645124365,
"grad_norm": 0.06547212600708008,
"learning_rate": 1.810614640475518e-05,
"loss": 0.004,
"step": 163400
},
{
"epoch": 7.48600666185914,
"grad_norm": 0.10231446474790573,
"learning_rate": 1.8044642079713408e-05,
"loss": 0.0026,
"step": 163500
},
{
"epoch": 7.490585259205842,
"grad_norm": 0.08887581527233124,
"learning_rate": 1.79832193852836e-05,
"loss": 0.002,
"step": 163600
},
{
"epoch": 7.495163856552545,
"grad_norm": 0.01825689524412155,
"learning_rate": 1.792187847837129e-05,
"loss": 0.0032,
"step": 163700
},
{
"epoch": 7.499742453899248,
"grad_norm": 0.0413985475897789,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.003,
"step": 163800
},
{
"epoch": 7.504321051245951,
"grad_norm": 0.11123603582382202,
"learning_rate": 1.779944265367614e-05,
"loss": 0.0031,
"step": 163900
},
{
"epoch": 7.508899648592654,
"grad_norm": 0.11079199612140656,
"learning_rate": 1.7738348048658127e-05,
"loss": 0.0029,
"step": 164000
},
{
"epoch": 7.513478245939356,
"grad_norm": 0.026996923610568047,
"learning_rate": 1.767733585668639e-05,
"loss": 0.0028,
"step": 164100
},
{
"epoch": 7.518056843286059,
"grad_norm": 0.2861877381801605,
"learning_rate": 1.7616406233617832e-05,
"loss": 0.0033,
"step": 164200
},
{
"epoch": 7.5226354406327625,
"grad_norm": 0.013889641501009464,
"learning_rate": 1.7555559335098414e-05,
"loss": 0.0034,
"step": 164300
},
{
"epoch": 7.527214037979465,
"grad_norm": 0.5749355554580688,
"learning_rate": 1.749479531656279e-05,
"loss": 0.0034,
"step": 164400
},
{
"epoch": 7.531792635326168,
"grad_norm": 0.03499993681907654,
"learning_rate": 1.7434114333233852e-05,
"loss": 0.0029,
"step": 164500
},
{
"epoch": 7.53637123267287,
"grad_norm": 0.1424218863248825,
"learning_rate": 1.737351654012244e-05,
"loss": 0.0025,
"step": 164600
},
{
"epoch": 7.540949830019573,
"grad_norm": 0.009633993729948997,
"learning_rate": 1.7313002092026837e-05,
"loss": 0.0032,
"step": 164700
},
{
"epoch": 7.545528427366277,
"grad_norm": 0.02650436945259571,
"learning_rate": 1.725257114353241e-05,
"loss": 0.0044,
"step": 164800
},
{
"epoch": 7.550107024712979,
"grad_norm": 0.0338139683008194,
"learning_rate": 1.7192223849011258e-05,
"loss": 0.0029,
"step": 164900
},
{
"epoch": 7.554685622059682,
"grad_norm": 1.0118355751037598,
"learning_rate": 1.7131960362621796e-05,
"loss": 0.0041,
"step": 165000
},
{
"epoch": 7.559264219406385,
"grad_norm": 0.014256274327635765,
"learning_rate": 1.7071780838308288e-05,
"loss": 0.0027,
"step": 165100
},
{
"epoch": 7.563842816753088,
"grad_norm": 0.05664459615945816,
"learning_rate": 1.7011685429800595e-05,
"loss": 0.0026,
"step": 165200
},
{
"epoch": 7.568421414099791,
"grad_norm": 0.14832501113414764,
"learning_rate": 1.695167429061364e-05,
"loss": 0.0027,
"step": 165300
},
{
"epoch": 7.573000011446493,
"grad_norm": 0.19807232916355133,
"learning_rate": 1.6891747574047078e-05,
"loss": 0.0026,
"step": 165400
},
{
"epoch": 7.577578608793196,
"grad_norm": 0.09145753085613251,
"learning_rate": 1.6831905433184946e-05,
"loss": 0.0032,
"step": 165500
},
{
"epoch": 7.582157206139899,
"grad_norm": 0.021602990105748177,
"learning_rate": 1.6772148020895228e-05,
"loss": 0.0022,
"step": 165600
},
{
"epoch": 7.586735803486602,
"grad_norm": 0.2839347720146179,
"learning_rate": 1.671247548982941e-05,
"loss": 0.0034,
"step": 165700
},
{
"epoch": 7.591314400833305,
"grad_norm": 0.02294602431356907,
"learning_rate": 1.6652887992422235e-05,
"loss": 0.0023,
"step": 165800
},
{
"epoch": 7.595892998180007,
"grad_norm": 0.027606772258877754,
"learning_rate": 1.659338568089114e-05,
"loss": 0.0032,
"step": 165900
},
{
"epoch": 7.60047159552671,
"grad_norm": 0.01902574673295021,
"learning_rate": 1.653396870723599e-05,
"loss": 0.0036,
"step": 166000
},
{
"epoch": 7.6050501928734136,
"grad_norm": 0.06941546499729156,
"learning_rate": 1.6474637223238665e-05,
"loss": 0.0031,
"step": 166100
},
{
"epoch": 7.609628790220116,
"grad_norm": 0.06622402369976044,
"learning_rate": 1.641539138046264e-05,
"loss": 0.003,
"step": 166200
},
{
"epoch": 7.614207387566819,
"grad_norm": 0.0019321365980431437,
"learning_rate": 1.6356231330252657e-05,
"loss": 0.0031,
"step": 166300
},
{
"epoch": 7.618785984913522,
"grad_norm": 0.11348855495452881,
"learning_rate": 1.629715722373423e-05,
"loss": 0.0039,
"step": 166400
},
{
"epoch": 7.6233645822602245,
"grad_norm": 0.14493609964847565,
"learning_rate": 1.6238169211813387e-05,
"loss": 0.0019,
"step": 166500
},
{
"epoch": 7.627943179606928,
"grad_norm": 0.11578594148159027,
"learning_rate": 1.6179267445176206e-05,
"loss": 0.0031,
"step": 166600
},
{
"epoch": 7.63252177695363,
"grad_norm": 0.026161905378103256,
"learning_rate": 1.6120452074288416e-05,
"loss": 0.0031,
"step": 166700
},
{
"epoch": 7.637100374300333,
"grad_norm": 0.048572130501270294,
"learning_rate": 1.6061723249395104e-05,
"loss": 0.0027,
"step": 166800
},
{
"epoch": 7.6416789716470355,
"grad_norm": 0.08658236265182495,
"learning_rate": 1.600308112052027e-05,
"loss": 0.0048,
"step": 166900
},
{
"epoch": 7.646257568993739,
"grad_norm": 0.03995939716696739,
"learning_rate": 1.594452583746638e-05,
"loss": 0.0029,
"step": 167000
},
{
"epoch": 7.650836166340442,
"grad_norm": 0.5306475758552551,
"learning_rate": 1.588605754981413e-05,
"loss": 0.0032,
"step": 167100
},
{
"epoch": 7.655414763687144,
"grad_norm": 0.008948258124291897,
"learning_rate": 1.582767640692194e-05,
"loss": 0.0024,
"step": 167200
},
{
"epoch": 7.659993361033847,
"grad_norm": 0.09350460022687912,
"learning_rate": 1.576938255792561e-05,
"loss": 0.0032,
"step": 167300
},
{
"epoch": 7.6645719583805505,
"grad_norm": 0.34027963876724243,
"learning_rate": 1.5711176151737984e-05,
"loss": 0.0029,
"step": 167400
},
{
"epoch": 7.669150555727253,
"grad_norm": 0.012650508433580399,
"learning_rate": 1.5653057337048514e-05,
"loss": 0.0031,
"step": 167500
},
{
"epoch": 7.673729153073956,
"grad_norm": 0.07974658906459808,
"learning_rate": 1.5595026262322875e-05,
"loss": 0.0023,
"step": 167600
},
{
"epoch": 7.678307750420658,
"grad_norm": 0.06705432385206223,
"learning_rate": 1.553708307580265e-05,
"loss": 0.0032,
"step": 167700
},
{
"epoch": 7.6828863477673615,
"grad_norm": 0.027641797438263893,
"learning_rate": 1.547922792550488e-05,
"loss": 0.0036,
"step": 167800
},
{
"epoch": 7.687464945114065,
"grad_norm": 0.44552162289619446,
"learning_rate": 1.5421460959221707e-05,
"loss": 0.0036,
"step": 167900
},
{
"epoch": 7.692043542460767,
"grad_norm": 0.02241067960858345,
"learning_rate": 1.536378232452003e-05,
"loss": 0.0037,
"step": 168000
},
{
"epoch": 7.69662213980747,
"grad_norm": 0.2189732789993286,
"learning_rate": 1.5306192168741117e-05,
"loss": 0.0026,
"step": 168100
},
{
"epoch": 7.701200737154172,
"grad_norm": 0.046641841530799866,
"learning_rate": 1.5248690639000162e-05,
"loss": 0.0035,
"step": 168200
},
{
"epoch": 7.705779334500876,
"grad_norm": 0.02562684379518032,
"learning_rate": 1.5191277882186023e-05,
"loss": 0.003,
"step": 168300
},
{
"epoch": 7.710357931847579,
"grad_norm": 0.22241626679897308,
"learning_rate": 1.513395404496072e-05,
"loss": 0.0022,
"step": 168400
},
{
"epoch": 7.714936529194281,
"grad_norm": 0.2740160822868347,
"learning_rate": 1.5076719273759198e-05,
"loss": 0.0033,
"step": 168500
},
{
"epoch": 7.719515126540984,
"grad_norm": 0.02267398126423359,
"learning_rate": 1.5019573714788809e-05,
"loss": 0.002,
"step": 168600
},
{
"epoch": 7.7240937238876874,
"grad_norm": 0.008224272169172764,
"learning_rate": 1.4962517514029067e-05,
"loss": 0.0022,
"step": 168700
},
{
"epoch": 7.72867232123439,
"grad_norm": 0.11832094937562943,
"learning_rate": 1.4905550817231206e-05,
"loss": 0.0029,
"step": 168800
},
{
"epoch": 7.733250918581093,
"grad_norm": 0.3029548227787018,
"learning_rate": 1.4848673769917787e-05,
"loss": 0.0042,
"step": 168900
},
{
"epoch": 7.737829515927795,
"grad_norm": 0.026391340419650078,
"learning_rate": 1.4791886517382413e-05,
"loss": 0.0031,
"step": 169000
},
{
"epoch": 7.742408113274498,
"grad_norm": 0.4289281666278839,
"learning_rate": 1.473518920468926e-05,
"loss": 0.0033,
"step": 169100
},
{
"epoch": 7.746986710621201,
"grad_norm": 0.1801924854516983,
"learning_rate": 1.4678581976672751e-05,
"loss": 0.0028,
"step": 169200
},
{
"epoch": 7.751565307967904,
"grad_norm": 0.06808359920978546,
"learning_rate": 1.4622064977937222e-05,
"loss": 0.0037,
"step": 169300
},
{
"epoch": 7.756143905314607,
"grad_norm": 0.5008605122566223,
"learning_rate": 1.4565638352856503e-05,
"loss": 0.0032,
"step": 169400
},
{
"epoch": 7.760722502661309,
"grad_norm": 0.13920585811138153,
"learning_rate": 1.4509302245573536e-05,
"loss": 0.0032,
"step": 169500
},
{
"epoch": 7.7653011000080125,
"grad_norm": 0.002380757825449109,
"learning_rate": 1.4453056800000076e-05,
"loss": 0.0025,
"step": 169600
},
{
"epoch": 7.769879697354716,
"grad_norm": 0.03281938657164574,
"learning_rate": 1.4396902159816245e-05,
"loss": 0.0028,
"step": 169700
},
{
"epoch": 7.774458294701418,
"grad_norm": 0.2583022117614746,
"learning_rate": 1.4340838468470197e-05,
"loss": 0.0031,
"step": 169800
},
{
"epoch": 7.779036892048121,
"grad_norm": 0.0035414681769907475,
"learning_rate": 1.4284865869177789e-05,
"loss": 0.0031,
"step": 169900
},
{
"epoch": 7.783615489394824,
"grad_norm": 0.23097677528858185,
"learning_rate": 1.4228984504922178e-05,
"loss": 0.0034,
"step": 170000
},
{
"epoch": 7.788194086741527,
"grad_norm": 0.515470027923584,
"learning_rate": 1.4173194518453414e-05,
"loss": 0.004,
"step": 170100
},
{
"epoch": 7.79277268408823,
"grad_norm": 0.03734416887164116,
"learning_rate": 1.4117496052288193e-05,
"loss": 0.0025,
"step": 170200
},
{
"epoch": 7.797351281434932,
"grad_norm": 0.270358681678772,
"learning_rate": 1.4061889248709343e-05,
"loss": 0.0017,
"step": 170300
},
{
"epoch": 7.801929878781635,
"grad_norm": 0.027283625677227974,
"learning_rate": 1.4006374249765597e-05,
"loss": 0.0028,
"step": 170400
},
{
"epoch": 7.806508476128338,
"grad_norm": 0.06574155390262604,
"learning_rate": 1.3950951197271134e-05,
"loss": 0.0031,
"step": 170500
},
{
"epoch": 7.811087073475041,
"grad_norm": 0.05151946470141411,
"learning_rate": 1.3895620232805279e-05,
"loss": 0.0017,
"step": 170600
},
{
"epoch": 7.815665670821744,
"grad_norm": 0.012561053037643433,
"learning_rate": 1.3840381497712113e-05,
"loss": 0.0025,
"step": 170700
},
{
"epoch": 7.820244268168446,
"grad_norm": 0.005159000866115093,
"learning_rate": 1.3785235133100088e-05,
"loss": 0.0034,
"step": 170800
},
{
"epoch": 7.8248228655151495,
"grad_norm": 0.04550444707274437,
"learning_rate": 1.3730181279841748e-05,
"loss": 0.0024,
"step": 170900
},
{
"epoch": 7.829401462861853,
"grad_norm": 0.05944928154349327,
"learning_rate": 1.3675220078573253e-05,
"loss": 0.0022,
"step": 171000
},
{
"epoch": 7.833980060208555,
"grad_norm": 0.31237590312957764,
"learning_rate": 1.3620351669694103e-05,
"loss": 0.0023,
"step": 171100
},
{
"epoch": 7.838558657555258,
"grad_norm": 0.0012041196459904313,
"learning_rate": 1.356557619336678e-05,
"loss": 0.0027,
"step": 171200
},
{
"epoch": 7.8431372549019605,
"grad_norm": 0.1280195415019989,
"learning_rate": 1.3510893789516372e-05,
"loss": 0.0034,
"step": 171300
},
{
"epoch": 7.847715852248664,
"grad_norm": 0.2050485610961914,
"learning_rate": 1.345630459783015e-05,
"loss": 0.0028,
"step": 171400
},
{
"epoch": 7.852294449595367,
"grad_norm": 0.15840676426887512,
"learning_rate": 1.340180875775735e-05,
"loss": 0.002,
"step": 171500
},
{
"epoch": 7.856873046942069,
"grad_norm": 0.7529467344284058,
"learning_rate": 1.3347406408508695e-05,
"loss": 0.0022,
"step": 171600
},
{
"epoch": 7.861451644288772,
"grad_norm": 0.03594828397035599,
"learning_rate": 1.3293097689056078e-05,
"loss": 0.0025,
"step": 171700
},
{
"epoch": 7.866030241635475,
"grad_norm": 0.4587234854698181,
"learning_rate": 1.323888273813223e-05,
"loss": 0.0029,
"step": 171800
},
{
"epoch": 7.870608838982178,
"grad_norm": 0.05882592126727104,
"learning_rate": 1.3184761694230375e-05,
"loss": 0.0026,
"step": 171900
},
{
"epoch": 7.875187436328881,
"grad_norm": 0.07484336197376251,
"learning_rate": 1.3130734695603786e-05,
"loss": 0.0028,
"step": 172000
},
{
"epoch": 7.879766033675583,
"grad_norm": 0.008674757555127144,
"learning_rate": 1.3076801880265554e-05,
"loss": 0.0028,
"step": 172100
},
{
"epoch": 7.884344631022286,
"grad_norm": 0.41222670674324036,
"learning_rate": 1.3022963385988151e-05,
"loss": 0.0036,
"step": 172200
},
{
"epoch": 7.88892322836899,
"grad_norm": 0.10513575375080109,
"learning_rate": 1.296921935030308e-05,
"loss": 0.0029,
"step": 172300
},
{
"epoch": 7.893501825715692,
"grad_norm": 0.29091617465019226,
"learning_rate": 1.2915569910500591e-05,
"loss": 0.004,
"step": 172400
},
{
"epoch": 7.898080423062395,
"grad_norm": 0.09394501894712448,
"learning_rate": 1.2862015203629274e-05,
"loss": 0.0032,
"step": 172500
},
{
"epoch": 7.902659020409097,
"grad_norm": 0.0589442253112793,
"learning_rate": 1.2808555366495728e-05,
"loss": 0.0027,
"step": 172600
},
{
"epoch": 7.907237617755801,
"grad_norm": 0.02068307250738144,
"learning_rate": 1.2755190535664168e-05,
"loss": 0.0024,
"step": 172700
},
{
"epoch": 7.911816215102503,
"grad_norm": 0.08841919153928757,
"learning_rate": 1.2701920847456166e-05,
"loss": 0.0027,
"step": 172800
},
{
"epoch": 7.916394812449206,
"grad_norm": 0.22736288607120514,
"learning_rate": 1.264874643795021e-05,
"loss": 0.0034,
"step": 172900
},
{
"epoch": 7.920973409795909,
"grad_norm": 0.16831666231155396,
"learning_rate": 1.2595667442981401e-05,
"loss": 0.0023,
"step": 173000
},
{
"epoch": 7.9255520071426115,
"grad_norm": 0.04770100489258766,
"learning_rate": 1.2542683998141119e-05,
"loss": 0.0025,
"step": 173100
},
{
"epoch": 7.930130604489315,
"grad_norm": 0.6141162514686584,
"learning_rate": 1.2489796238776675e-05,
"loss": 0.004,
"step": 173200
},
{
"epoch": 7.934709201836018,
"grad_norm": 0.7967793345451355,
"learning_rate": 1.243700429999089e-05,
"loss": 0.0027,
"step": 173300
},
{
"epoch": 7.93928779918272,
"grad_norm": 0.015516542829573154,
"learning_rate": 1.2384308316641874e-05,
"loss": 0.0017,
"step": 173400
},
{
"epoch": 7.943866396529423,
"grad_norm": 0.0020021158270537853,
"learning_rate": 1.233170842334258e-05,
"loss": 0.0029,
"step": 173500
},
{
"epoch": 7.948444993876126,
"grad_norm": 0.014905404299497604,
"learning_rate": 1.2279204754460493e-05,
"loss": 0.0026,
"step": 173600
},
{
"epoch": 7.953023591222829,
"grad_norm": 0.04339270293712616,
"learning_rate": 1.222679744411731e-05,
"loss": 0.0031,
"step": 173700
},
{
"epoch": 7.957602188569532,
"grad_norm": 0.10109388083219528,
"learning_rate": 1.2174486626188586e-05,
"loss": 0.0033,
"step": 173800
},
{
"epoch": 7.962180785916234,
"grad_norm": 0.018510516732931137,
"learning_rate": 1.2122272434303344e-05,
"loss": 0.0026,
"step": 173900
},
{
"epoch": 7.9667593832629375,
"grad_norm": 0.014604040421545506,
"learning_rate": 1.2070155001843835e-05,
"loss": 0.0024,
"step": 174000
},
{
"epoch": 7.97133798060964,
"grad_norm": 0.20794948935508728,
"learning_rate": 1.2018134461945075e-05,
"loss": 0.0033,
"step": 174100
},
{
"epoch": 7.975916577956343,
"grad_norm": 0.06476528197526932,
"learning_rate": 1.1966210947494583e-05,
"loss": 0.0024,
"step": 174200
},
{
"epoch": 7.980495175303046,
"grad_norm": 0.0063975718803703785,
"learning_rate": 1.1914384591132044e-05,
"loss": 0.0022,
"step": 174300
},
{
"epoch": 7.9850737726497485,
"grad_norm": 0.03397635370492935,
"learning_rate": 1.1862655525248945e-05,
"loss": 0.0025,
"step": 174400
},
{
"epoch": 7.989652369996452,
"grad_norm": 0.030696725472807884,
"learning_rate": 1.1811023881988248e-05,
"loss": 0.0021,
"step": 174500
},
{
"epoch": 7.994230967343155,
"grad_norm": 0.08137042820453644,
"learning_rate": 1.1759489793244022e-05,
"loss": 0.0025,
"step": 174600
},
{
"epoch": 7.998809564689857,
"grad_norm": 0.0656815618276596,
"learning_rate": 1.1708053390661128e-05,
"loss": 0.0026,
"step": 174700
},
{
"epoch": 8.0,
"eval_loss": 0.17588233947753906,
"eval_runtime": 260.0784,
"eval_samples_per_second": 21.147,
"eval_steps_per_second": 21.147,
"step": 174726
},
{
"epoch": 8.00338816203656,
"grad_norm": 0.10640919208526611,
"learning_rate": 1.1656714805634938e-05,
"loss": 0.0018,
"step": 174800
},
{
"epoch": 8.007966759383264,
"grad_norm": 0.0020934424828737974,
"learning_rate": 1.1605474169310881e-05,
"loss": 0.002,
"step": 174900
},
{
"epoch": 8.012545356729966,
"grad_norm": 0.09055866301059723,
"learning_rate": 1.1554331612584218e-05,
"loss": 0.0017,
"step": 175000
},
{
"epoch": 8.017123954076668,
"grad_norm": 0.49149322509765625,
"learning_rate": 1.1503287266099666e-05,
"loss": 0.0025,
"step": 175100
},
{
"epoch": 8.021702551423372,
"grad_norm": 0.01625397428870201,
"learning_rate": 1.145234126025102e-05,
"loss": 0.0021,
"step": 175200
},
{
"epoch": 8.026281148770074,
"grad_norm": 0.8061564564704895,
"learning_rate": 1.1401493725180912e-05,
"loss": 0.0015,
"step": 175300
},
{
"epoch": 8.030859746116777,
"grad_norm": 0.6298221349716187,
"learning_rate": 1.1350744790780388e-05,
"loss": 0.0018,
"step": 175400
},
{
"epoch": 8.035438343463479,
"grad_norm": 0.051574669778347015,
"learning_rate": 1.130009458668863e-05,
"loss": 0.0019,
"step": 175500
},
{
"epoch": 8.040016940810183,
"grad_norm": 0.034144267439842224,
"learning_rate": 1.1249543242292627e-05,
"loss": 0.0019,
"step": 175600
},
{
"epoch": 8.044595538156885,
"grad_norm": 0.05505882203578949,
"learning_rate": 1.119909088672682e-05,
"loss": 0.0019,
"step": 175700
},
{
"epoch": 8.049174135503588,
"grad_norm": 0.01235408615320921,
"learning_rate": 1.1148737648872759e-05,
"loss": 0.0019,
"step": 175800
},
{
"epoch": 8.053752732850292,
"grad_norm": 0.07047531008720398,
"learning_rate": 1.1098483657358844e-05,
"loss": 0.0017,
"step": 175900
},
{
"epoch": 8.058331330196994,
"grad_norm": 0.048473093658685684,
"learning_rate": 1.1048329040559896e-05,
"loss": 0.0019,
"step": 176000
},
{
"epoch": 8.062909927543696,
"grad_norm": 0.019425269216299057,
"learning_rate": 1.0998273926596897e-05,
"loss": 0.0015,
"step": 176100
},
{
"epoch": 8.0674885248904,
"grad_norm": 0.0072584389708936214,
"learning_rate": 1.094831844333667e-05,
"loss": 0.0024,
"step": 176200
},
{
"epoch": 8.072067122237103,
"grad_norm": 0.0020360236521810293,
"learning_rate": 1.0898462718391523e-05,
"loss": 0.0014,
"step": 176300
},
{
"epoch": 8.076645719583805,
"grad_norm": 0.5871603488922119,
"learning_rate": 1.0848706879118892e-05,
"loss": 0.0019,
"step": 176400
},
{
"epoch": 8.08122431693051,
"grad_norm": 0.13031832873821259,
"learning_rate": 1.0799051052621106e-05,
"loss": 0.0017,
"step": 176500
},
{
"epoch": 8.085802914277211,
"grad_norm": 0.008929682895541191,
"learning_rate": 1.074949536574496e-05,
"loss": 0.0016,
"step": 176600
},
{
"epoch": 8.090381511623914,
"grad_norm": 0.003812073729932308,
"learning_rate": 1.0700039945081498e-05,
"loss": 0.0017,
"step": 176700
},
{
"epoch": 8.094960108970616,
"grad_norm": 0.011707616969943047,
"learning_rate": 1.0650684916965559e-05,
"loss": 0.0016,
"step": 176800
},
{
"epoch": 8.09953870631732,
"grad_norm": 0.037662629038095474,
"learning_rate": 1.0601430407475582e-05,
"loss": 0.002,
"step": 176900
},
{
"epoch": 8.104117303664022,
"grad_norm": 0.012711996212601662,
"learning_rate": 1.0552276542433237e-05,
"loss": 0.0015,
"step": 177000
},
{
"epoch": 8.108695901010725,
"grad_norm": 0.19156889617443085,
"learning_rate": 1.0503223447403032e-05,
"loss": 0.0011,
"step": 177100
},
{
"epoch": 8.113274498357429,
"grad_norm": 0.010708093643188477,
"learning_rate": 1.0454271247692137e-05,
"loss": 0.0013,
"step": 177200
},
{
"epoch": 8.117853095704131,
"grad_norm": 0.02960583008825779,
"learning_rate": 1.040542006834992e-05,
"loss": 0.0024,
"step": 177300
},
{
"epoch": 8.122431693050833,
"grad_norm": 0.1249750480055809,
"learning_rate": 1.0356670034167698e-05,
"loss": 0.0015,
"step": 177400
},
{
"epoch": 8.127010290397537,
"grad_norm": 0.0189303457736969,
"learning_rate": 1.0308021269678442e-05,
"loss": 0.0021,
"step": 177500
},
{
"epoch": 8.13158888774424,
"grad_norm": 0.004004355985671282,
"learning_rate": 1.025947389915643e-05,
"loss": 0.0025,
"step": 177600
},
{
"epoch": 8.136167485090942,
"grad_norm": 0.021703239530324936,
"learning_rate": 1.0211028046616866e-05,
"loss": 0.0012,
"step": 177700
},
{
"epoch": 8.140746082437646,
"grad_norm": 0.41312482953071594,
"learning_rate": 1.0162683835815705e-05,
"loss": 0.0013,
"step": 177800
},
{
"epoch": 8.145324679784348,
"grad_norm": 0.021725183352828026,
"learning_rate": 1.0114441390249202e-05,
"loss": 0.0012,
"step": 177900
},
{
"epoch": 8.14990327713105,
"grad_norm": 0.33544811606407166,
"learning_rate": 1.0066300833153647e-05,
"loss": 0.002,
"step": 178000
},
{
"epoch": 8.154481874477753,
"grad_norm": 0.024289660155773163,
"learning_rate": 1.0018262287505086e-05,
"loss": 0.0023,
"step": 178100
},
{
"epoch": 8.159060471824457,
"grad_norm": 0.49725693464279175,
"learning_rate": 9.970325876018982e-06,
"loss": 0.002,
"step": 178200
},
{
"epoch": 8.16363906917116,
"grad_norm": 0.018485499545931816,
"learning_rate": 9.922491721149845e-06,
"loss": 0.0019,
"step": 178300
},
{
"epoch": 8.168217666517862,
"grad_norm": 0.009344914928078651,
"learning_rate": 9.874759945091016e-06,
"loss": 0.0016,
"step": 178400
},
{
"epoch": 8.172796263864566,
"grad_norm": 0.019952520728111267,
"learning_rate": 9.82713066977427e-06,
"loss": 0.0012,
"step": 178500
},
{
"epoch": 8.177374861211268,
"grad_norm": 0.5553386211395264,
"learning_rate": 9.77960401686958e-06,
"loss": 0.0019,
"step": 178600
},
{
"epoch": 8.18195345855797,
"grad_norm": 0.009466302581131458,
"learning_rate": 9.732180107784727e-06,
"loss": 0.0022,
"step": 178700
},
{
"epoch": 8.186532055904674,
"grad_norm": 0.5055824518203735,
"learning_rate": 9.684859063665059e-06,
"loss": 0.0017,
"step": 178800
},
{
"epoch": 8.191110653251377,
"grad_norm": 0.38719162344932556,
"learning_rate": 9.637641005393167e-06,
"loss": 0.002,
"step": 178900
},
{
"epoch": 8.195689250598079,
"grad_norm": 0.0033107008785009384,
"learning_rate": 9.590526053588505e-06,
"loss": 0.0013,
"step": 179000
},
{
"epoch": 8.200267847944781,
"grad_norm": 0.015472437255084515,
"learning_rate": 9.543514328607212e-06,
"loss": 0.0019,
"step": 179100
},
{
"epoch": 8.204846445291485,
"grad_norm": 0.004773670807480812,
"learning_rate": 9.496605950541676e-06,
"loss": 0.002,
"step": 179200
},
{
"epoch": 8.209425042638188,
"grad_norm": 0.0060659064911305904,
"learning_rate": 9.44980103922029e-06,
"loss": 0.0018,
"step": 179300
},
{
"epoch": 8.21400363998489,
"grad_norm": 0.004397235810756683,
"learning_rate": 9.403099714207175e-06,
"loss": 0.0017,
"step": 179400
},
{
"epoch": 8.218582237331594,
"grad_norm": 0.004803112708032131,
"learning_rate": 9.356502094801816e-06,
"loss": 0.0015,
"step": 179500
},
{
"epoch": 8.223160834678296,
"grad_norm": 0.0035059794317930937,
"learning_rate": 9.310008300038758e-06,
"loss": 0.0018,
"step": 179600
},
{
"epoch": 8.227739432024999,
"grad_norm": 0.025477442890405655,
"learning_rate": 9.263618448687377e-06,
"loss": 0.002,
"step": 179700
},
{
"epoch": 8.232318029371703,
"grad_norm": 0.3329303562641144,
"learning_rate": 9.217332659251477e-06,
"loss": 0.0018,
"step": 179800
},
{
"epoch": 8.236896626718405,
"grad_norm": 0.2675701379776001,
"learning_rate": 9.171151049969029e-06,
"loss": 0.0012,
"step": 179900
},
{
"epoch": 8.241475224065107,
"grad_norm": 1.2457773685455322,
"learning_rate": 9.125073738811918e-06,
"loss": 0.0019,
"step": 180000
},
{
"epoch": 8.246053821411811,
"grad_norm": 0.1400783210992813,
"learning_rate": 9.079100843485578e-06,
"loss": 0.0021,
"step": 180100
},
{
"epoch": 8.250632418758514,
"grad_norm": 0.025368591770529747,
"learning_rate": 9.033232481428678e-06,
"loss": 0.0018,
"step": 180200
},
{
"epoch": 8.255211016105216,
"grad_norm": 0.0014903460396453738,
"learning_rate": 8.987468769812912e-06,
"loss": 0.0014,
"step": 180300
},
{
"epoch": 8.259789613451918,
"grad_norm": 0.22623829543590546,
"learning_rate": 8.941809825542596e-06,
"loss": 0.0025,
"step": 180400
},
{
"epoch": 8.264368210798622,
"grad_norm": 0.017613211646676064,
"learning_rate": 8.896255765254424e-06,
"loss": 0.0012,
"step": 180500
},
{
"epoch": 8.268946808145325,
"grad_norm": 0.005598566494882107,
"learning_rate": 8.850806705317183e-06,
"loss": 0.001,
"step": 180600
},
{
"epoch": 8.273525405492027,
"grad_norm": 0.17524650692939758,
"learning_rate": 8.805462761831418e-06,
"loss": 0.001,
"step": 180700
},
{
"epoch": 8.278104002838731,
"grad_norm": 0.03338591754436493,
"learning_rate": 8.760224050629162e-06,
"loss": 0.0014,
"step": 180800
},
{
"epoch": 8.282682600185433,
"grad_norm": 0.017168212682008743,
"learning_rate": 8.715090687273614e-06,
"loss": 0.001,
"step": 180900
},
{
"epoch": 8.287261197532136,
"grad_norm": 0.09427805244922638,
"learning_rate": 8.67006278705888e-06,
"loss": 0.0013,
"step": 181000
},
{
"epoch": 8.29183979487884,
"grad_norm": 0.0094602657482028,
"learning_rate": 8.625140465009635e-06,
"loss": 0.0013,
"step": 181100
},
{
"epoch": 8.296418392225542,
"grad_norm": 0.06793930381536484,
"learning_rate": 8.58032383588086e-06,
"loss": 0.0018,
"step": 181200
},
{
"epoch": 8.300996989572244,
"grad_norm": 0.08039774000644684,
"learning_rate": 8.535613014157557e-06,
"loss": 0.0019,
"step": 181300
},
{
"epoch": 8.305575586918948,
"grad_norm": 0.03726482763886452,
"learning_rate": 8.491008114054439e-06,
"loss": 0.0021,
"step": 181400
},
{
"epoch": 8.31015418426565,
"grad_norm": 0.10031867027282715,
"learning_rate": 8.446509249515605e-06,
"loss": 0.0021,
"step": 181500
},
{
"epoch": 8.314732781612353,
"grad_norm": 0.38206222653388977,
"learning_rate": 8.402116534214338e-06,
"loss": 0.0021,
"step": 181600
},
{
"epoch": 8.319311378959055,
"grad_norm": 0.05652381107211113,
"learning_rate": 8.35783008155272e-06,
"loss": 0.0009,
"step": 181700
},
{
"epoch": 8.32388997630576,
"grad_norm": 0.0731114000082016,
"learning_rate": 8.313650004661383e-06,
"loss": 0.0016,
"step": 181800
},
{
"epoch": 8.328468573652462,
"grad_norm": 0.43218135833740234,
"learning_rate": 8.26957641639924e-06,
"loss": 0.0024,
"step": 181900
},
{
"epoch": 8.333047170999164,
"grad_norm": 0.08536510914564133,
"learning_rate": 8.225609429353187e-06,
"loss": 0.0021,
"step": 182000
},
{
"epoch": 8.337625768345868,
"grad_norm": 0.011019705794751644,
"learning_rate": 8.181749155837754e-06,
"loss": 0.0016,
"step": 182100
},
{
"epoch": 8.34220436569257,
"grad_norm": 0.040587395429611206,
"learning_rate": 8.137995707894942e-06,
"loss": 0.0018,
"step": 182200
},
{
"epoch": 8.346782963039272,
"grad_norm": 0.0023947455920279026,
"learning_rate": 8.094349197293793e-06,
"loss": 0.0015,
"step": 182300
},
{
"epoch": 8.351361560385977,
"grad_norm": 0.007556082680821419,
"learning_rate": 8.050809735530207e-06,
"loss": 0.0016,
"step": 182400
},
{
"epoch": 8.355940157732679,
"grad_norm": 0.11117005348205566,
"learning_rate": 8.007377433826634e-06,
"loss": 0.0016,
"step": 182500
},
{
"epoch": 8.360518755079381,
"grad_norm": 0.0016330329235643148,
"learning_rate": 7.964052403131773e-06,
"loss": 0.0013,
"step": 182600
},
{
"epoch": 8.365097352426083,
"grad_norm": 0.4123118221759796,
"learning_rate": 7.920834754120304e-06,
"loss": 0.0021,
"step": 182700
},
{
"epoch": 8.369675949772788,
"grad_norm": 0.014765871688723564,
"learning_rate": 7.877724597192582e-06,
"loss": 0.0022,
"step": 182800
},
{
"epoch": 8.37425454711949,
"grad_norm": 0.004433237481862307,
"learning_rate": 7.834722042474374e-06,
"loss": 0.0012,
"step": 182900
},
{
"epoch": 8.378833144466192,
"grad_norm": 0.0037168385460972786,
"learning_rate": 7.791827199816593e-06,
"loss": 0.0016,
"step": 183000
},
{
"epoch": 8.383411741812896,
"grad_norm": 0.04149395972490311,
"learning_rate": 7.74904017879497e-06,
"loss": 0.0029,
"step": 183100
},
{
"epoch": 8.387990339159598,
"grad_norm": 0.011970234103500843,
"learning_rate": 7.70636108870983e-06,
"loss": 0.0022,
"step": 183200
},
{
"epoch": 8.3925689365063,
"grad_norm": 0.049423061311244965,
"learning_rate": 7.663790038585793e-06,
"loss": 0.0021,
"step": 183300
},
{
"epoch": 8.397147533853005,
"grad_norm": 0.029166920110583305,
"learning_rate": 7.621327137171447e-06,
"loss": 0.0015,
"step": 183400
},
{
"epoch": 8.401726131199707,
"grad_norm": 0.029471127316355705,
"learning_rate": 7.5789724929391625e-06,
"loss": 0.0019,
"step": 183500
},
{
"epoch": 8.40630472854641,
"grad_norm": 0.039268478751182556,
"learning_rate": 7.536726214084722e-06,
"loss": 0.0019,
"step": 183600
},
{
"epoch": 8.410883325893113,
"grad_norm": 0.4737110137939453,
"learning_rate": 7.494588408527103e-06,
"loss": 0.0018,
"step": 183700
},
{
"epoch": 8.415461923239816,
"grad_norm": 0.03173527121543884,
"learning_rate": 7.4525591839081865e-06,
"loss": 0.0019,
"step": 183800
},
{
"epoch": 8.420040520586518,
"grad_norm": 0.013487137854099274,
"learning_rate": 7.4106386475925046e-06,
"loss": 0.0013,
"step": 183900
},
{
"epoch": 8.42461911793322,
"grad_norm": 0.0010746048064902425,
"learning_rate": 7.368826906666887e-06,
"loss": 0.0019,
"step": 184000
},
{
"epoch": 8.429197715279924,
"grad_norm": 0.01748150959610939,
"learning_rate": 7.327124067940311e-06,
"loss": 0.0025,
"step": 184100
},
{
"epoch": 8.433776312626627,
"grad_norm": 0.0159548781812191,
"learning_rate": 7.285530237943505e-06,
"loss": 0.0022,
"step": 184200
},
{
"epoch": 8.438354909973329,
"grad_norm": 0.0013540086802095175,
"learning_rate": 7.24404552292875e-06,
"loss": 0.0012,
"step": 184300
},
{
"epoch": 8.442933507320033,
"grad_norm": 0.12859028577804565,
"learning_rate": 7.202670028869601e-06,
"loss": 0.002,
"step": 184400
},
{
"epoch": 8.447512104666735,
"grad_norm": 0.006918368861079216,
"learning_rate": 7.161403861460614e-06,
"loss": 0.0014,
"step": 184500
},
{
"epoch": 8.452090702013438,
"grad_norm": 0.014983629807829857,
"learning_rate": 7.1202471261170245e-06,
"loss": 0.0016,
"step": 184600
},
{
"epoch": 8.456669299360142,
"grad_norm": 0.03064214624464512,
"learning_rate": 7.079199927974584e-06,
"loss": 0.0021,
"step": 184700
},
{
"epoch": 8.461247896706844,
"grad_norm": 0.030842667445540428,
"learning_rate": 7.038262371889159e-06,
"loss": 0.0012,
"step": 184800
},
{
"epoch": 8.465826494053546,
"grad_norm": 0.0024680488277226686,
"learning_rate": 6.997434562436606e-06,
"loss": 0.002,
"step": 184900
},
{
"epoch": 8.470405091400249,
"grad_norm": 0.06068078801035881,
"learning_rate": 6.956716603912361e-06,
"loss": 0.0021,
"step": 185000
},
{
"epoch": 8.474983688746953,
"grad_norm": 0.5528777241706848,
"learning_rate": 6.9161086003312945e-06,
"loss": 0.0015,
"step": 185100
},
{
"epoch": 8.479562286093655,
"grad_norm": 0.05493824928998947,
"learning_rate": 6.875610655427389e-06,
"loss": 0.0017,
"step": 185200
},
{
"epoch": 8.484140883440357,
"grad_norm": 0.07727139443159103,
"learning_rate": 6.83522287265344e-06,
"loss": 0.0017,
"step": 185300
},
{
"epoch": 8.488719480787061,
"grad_norm": 0.5921161770820618,
"learning_rate": 6.794945355180893e-06,
"loss": 0.0019,
"step": 185400
},
{
"epoch": 8.493298078133764,
"grad_norm": 0.1638752669095993,
"learning_rate": 6.754778205899465e-06,
"loss": 0.0011,
"step": 185500
},
{
"epoch": 8.497876675480466,
"grad_norm": 0.0014271615073084831,
"learning_rate": 6.714721527416956e-06,
"loss": 0.0017,
"step": 185600
},
{
"epoch": 8.50245527282717,
"grad_norm": 0.010398001410067081,
"learning_rate": 6.674775422058965e-06,
"loss": 0.0024,
"step": 185700
},
{
"epoch": 8.507033870173872,
"grad_norm": 0.019598359242081642,
"learning_rate": 6.63493999186865e-06,
"loss": 0.0024,
"step": 185800
},
{
"epoch": 8.511612467520575,
"grad_norm": 0.00348674226552248,
"learning_rate": 6.595215338606397e-06,
"loss": 0.0012,
"step": 185900
},
{
"epoch": 8.516191064867279,
"grad_norm": 0.0019242248963564634,
"learning_rate": 6.555601563749675e-06,
"loss": 0.0012,
"step": 186000
},
{
"epoch": 8.520769662213981,
"grad_norm": 0.008147502318024635,
"learning_rate": 6.516098768492662e-06,
"loss": 0.0015,
"step": 186100
},
{
"epoch": 8.525348259560683,
"grad_norm": 0.04510408639907837,
"learning_rate": 6.47670705374604e-06,
"loss": 0.001,
"step": 186200
},
{
"epoch": 8.529926856907386,
"grad_norm": 0.011768829077482224,
"learning_rate": 6.437426520136758e-06,
"loss": 0.0019,
"step": 186300
},
{
"epoch": 8.53450545425409,
"grad_norm": 0.049900226294994354,
"learning_rate": 6.398257268007746e-06,
"loss": 0.001,
"step": 186400
},
{
"epoch": 8.539084051600792,
"grad_norm": 0.030545897781848907,
"learning_rate": 6.359199397417637e-06,
"loss": 0.0019,
"step": 186500
},
{
"epoch": 8.543662648947494,
"grad_norm": 0.007900966331362724,
"learning_rate": 6.320253008140575e-06,
"loss": 0.0018,
"step": 186600
},
{
"epoch": 8.548241246294198,
"grad_norm": 0.002196391811594367,
"learning_rate": 6.281418199665884e-06,
"loss": 0.002,
"step": 186700
},
{
"epoch": 8.5528198436409,
"grad_norm": 0.08688097447156906,
"learning_rate": 6.242695071197896e-06,
"loss": 0.0014,
"step": 186800
},
{
"epoch": 8.557398440987603,
"grad_norm": 0.06626435369253159,
"learning_rate": 6.204083721655607e-06,
"loss": 0.0017,
"step": 186900
},
{
"epoch": 8.561977038334307,
"grad_norm": 0.08788962662220001,
"learning_rate": 6.165584249672507e-06,
"loss": 0.0016,
"step": 187000
},
{
"epoch": 8.56655563568101,
"grad_norm": 0.021715328097343445,
"learning_rate": 6.127196753596287e-06,
"loss": 0.0017,
"step": 187100
},
{
"epoch": 8.571134233027712,
"grad_norm": 0.0046193236485123634,
"learning_rate": 6.088921331488568e-06,
"loss": 0.001,
"step": 187200
},
{
"epoch": 8.575712830374414,
"grad_norm": 0.03280609846115112,
"learning_rate": 6.050758081124719e-06,
"loss": 0.0021,
"step": 187300
},
{
"epoch": 8.580291427721118,
"grad_norm": 0.01722414791584015,
"learning_rate": 6.012707099993525e-06,
"loss": 0.0015,
"step": 187400
},
{
"epoch": 8.58487002506782,
"grad_norm": 0.003511949675157666,
"learning_rate": 5.974768485296977e-06,
"loss": 0.0019,
"step": 187500
},
{
"epoch": 8.589448622414523,
"grad_norm": 0.1540764719247818,
"learning_rate": 5.936942333950063e-06,
"loss": 0.0022,
"step": 187600
},
{
"epoch": 8.594027219761227,
"grad_norm": 0.013510748744010925,
"learning_rate": 5.8992287425804485e-06,
"loss": 0.0012,
"step": 187700
},
{
"epoch": 8.598605817107929,
"grad_norm": 0.07007778435945511,
"learning_rate": 5.861627807528264e-06,
"loss": 0.001,
"step": 187800
},
{
"epoch": 8.603184414454631,
"grad_norm": 0.010667093098163605,
"learning_rate": 5.82413962484587e-06,
"loss": 0.0015,
"step": 187900
},
{
"epoch": 8.607763011801335,
"grad_norm": 0.024145985022187233,
"learning_rate": 5.7867642902975975e-06,
"loss": 0.0025,
"step": 188000
},
{
"epoch": 8.612341609148038,
"grad_norm": 0.12270953506231308,
"learning_rate": 5.749501899359477e-06,
"loss": 0.0019,
"step": 188100
},
{
"epoch": 8.61692020649474,
"grad_norm": 0.36917853355407715,
"learning_rate": 5.712352547219058e-06,
"loss": 0.0018,
"step": 188200
},
{
"epoch": 8.621498803841444,
"grad_norm": 0.8480884432792664,
"learning_rate": 5.675316328775126e-06,
"loss": 0.0023,
"step": 188300
},
{
"epoch": 8.626077401188146,
"grad_norm": 0.009566806256771088,
"learning_rate": 5.638393338637432e-06,
"loss": 0.0018,
"step": 188400
},
{
"epoch": 8.630655998534849,
"grad_norm": 0.15766866505146027,
"learning_rate": 5.601583671126531e-06,
"loss": 0.0015,
"step": 188500
},
{
"epoch": 8.635234595881553,
"grad_norm": 0.003668803023174405,
"learning_rate": 5.5648874202734565e-06,
"loss": 0.0014,
"step": 188600
},
{
"epoch": 8.639813193228255,
"grad_norm": 0.01124663557857275,
"learning_rate": 5.528304679819513e-06,
"loss": 0.0012,
"step": 188700
},
{
"epoch": 8.644391790574957,
"grad_norm": 0.06179165840148926,
"learning_rate": 5.4918355432160726e-06,
"loss": 0.0013,
"step": 188800
},
{
"epoch": 8.64897038792166,
"grad_norm": 0.1397646963596344,
"learning_rate": 5.455480103624283e-06,
"loss": 0.0018,
"step": 188900
},
{
"epoch": 8.653548985268364,
"grad_norm": 0.0030619765166193247,
"learning_rate": 5.41923845391486e-06,
"loss": 0.002,
"step": 189000
},
{
"epoch": 8.658127582615066,
"grad_norm": 0.015350698493421078,
"learning_rate": 5.383110686667831e-06,
"loss": 0.0018,
"step": 189100
},
{
"epoch": 8.662706179961768,
"grad_norm": 0.6472819447517395,
"learning_rate": 5.347096894172304e-06,
"loss": 0.0014,
"step": 189200
},
{
"epoch": 8.667284777308472,
"grad_norm": 1.0994236469268799,
"learning_rate": 5.3111971684262574e-06,
"loss": 0.0017,
"step": 189300
},
{
"epoch": 8.671863374655175,
"grad_norm": 0.010606258176267147,
"learning_rate": 5.275411601136254e-06,
"loss": 0.0016,
"step": 189400
},
{
"epoch": 8.676441972001877,
"grad_norm": 0.0032367429230362177,
"learning_rate": 5.239740283717265e-06,
"loss": 0.002,
"step": 189500
},
{
"epoch": 8.68102056934858,
"grad_norm": 0.0026265005581080914,
"learning_rate": 5.20418330729241e-06,
"loss": 0.001,
"step": 189600
},
{
"epoch": 8.685599166695283,
"grad_norm": 0.05965089425444603,
"learning_rate": 5.168740762692681e-06,
"loss": 0.0016,
"step": 189700
},
{
"epoch": 8.690177764041985,
"grad_norm": 0.040079813450574875,
"learning_rate": 5.133412740456806e-06,
"loss": 0.0022,
"step": 189800
},
{
"epoch": 8.694756361388688,
"grad_norm": 0.06221432238817215,
"learning_rate": 5.098199330830922e-06,
"loss": 0.002,
"step": 189900
},
{
"epoch": 8.699334958735392,
"grad_norm": 0.013662228360772133,
"learning_rate": 5.063100623768391e-06,
"loss": 0.0013,
"step": 190000
},
{
"epoch": 8.703913556082094,
"grad_norm": 0.00042499735718593,
"learning_rate": 5.028116708929587e-06,
"loss": 0.0017,
"step": 190100
},
{
"epoch": 8.708492153428796,
"grad_norm": 0.6862035989761353,
"learning_rate": 4.993247675681639e-06,
"loss": 0.0019,
"step": 190200
},
{
"epoch": 8.7130707507755,
"grad_norm": 0.10454216599464417,
"learning_rate": 4.958493613098186e-06,
"loss": 0.0017,
"step": 190300
},
{
"epoch": 8.717649348122203,
"grad_norm": 0.26306042075157166,
"learning_rate": 4.9238546099592e-06,
"loss": 0.0013,
"step": 190400
},
{
"epoch": 8.722227945468905,
"grad_norm": 0.026483699679374695,
"learning_rate": 4.8893307547507205e-06,
"loss": 0.0016,
"step": 190500
},
{
"epoch": 8.72680654281561,
"grad_norm": 0.033151958137750626,
"learning_rate": 4.854922135664619e-06,
"loss": 0.002,
"step": 190600
},
{
"epoch": 8.731385140162311,
"grad_norm": 0.03364422544836998,
"learning_rate": 4.820628840598423e-06,
"loss": 0.0018,
"step": 190700
},
{
"epoch": 8.735963737509014,
"grad_norm": 0.004185411147773266,
"learning_rate": 4.786450957155064e-06,
"loss": 0.0021,
"step": 190800
},
{
"epoch": 8.740542334855718,
"grad_norm": 0.007191179320216179,
"learning_rate": 4.7523885726426355e-06,
"loss": 0.0017,
"step": 190900
},
{
"epoch": 8.74512093220242,
"grad_norm": 0.0175678301602602,
"learning_rate": 4.71844177407419e-06,
"loss": 0.002,
"step": 191000
},
{
"epoch": 8.749699529549122,
"grad_norm": 0.20129109919071198,
"learning_rate": 4.684610648167503e-06,
"loss": 0.0017,
"step": 191100
},
{
"epoch": 8.754278126895825,
"grad_norm": 0.22425204515457153,
"learning_rate": 4.6508952813448965e-06,
"loss": 0.0015,
"step": 191200
},
{
"epoch": 8.758856724242529,
"grad_norm": 0.011632180772721767,
"learning_rate": 4.617295759732937e-06,
"loss": 0.0019,
"step": 191300
},
{
"epoch": 8.763435321589231,
"grad_norm": 0.00452096201479435,
"learning_rate": 4.5838121691623e-06,
"loss": 0.0012,
"step": 191400
},
{
"epoch": 8.768013918935933,
"grad_norm": 0.004986160434782505,
"learning_rate": 4.550444595167502e-06,
"loss": 0.0014,
"step": 191500
},
{
"epoch": 8.772592516282637,
"grad_norm": 0.0067661721259355545,
"learning_rate": 4.517193122986679e-06,
"loss": 0.0013,
"step": 191600
},
{
"epoch": 8.77717111362934,
"grad_norm": 0.06658513098955154,
"learning_rate": 4.484057837561406e-06,
"loss": 0.003,
"step": 191700
},
{
"epoch": 8.781749710976042,
"grad_norm": 0.003857834730297327,
"learning_rate": 4.4510388235364405e-06,
"loss": 0.0015,
"step": 191800
},
{
"epoch": 8.786328308322746,
"grad_norm": 0.00674202898517251,
"learning_rate": 4.418136165259512e-06,
"loss": 0.001,
"step": 191900
},
{
"epoch": 8.790906905669448,
"grad_norm": 0.004305652808398008,
"learning_rate": 4.385349946781136e-06,
"loss": 0.0008,
"step": 192000
},
{
"epoch": 8.79548550301615,
"grad_norm": 0.011842915788292885,
"learning_rate": 4.352680251854391e-06,
"loss": 0.0015,
"step": 192100
},
{
"epoch": 8.800064100362853,
"grad_norm": 0.030167168006300926,
"learning_rate": 4.320127163934657e-06,
"loss": 0.0015,
"step": 192200
},
{
"epoch": 8.804642697709557,
"grad_norm": 0.006344472989439964,
"learning_rate": 4.2876907661794755e-06,
"loss": 0.0016,
"step": 192300
},
{
"epoch": 8.80922129505626,
"grad_norm": 0.8136438131332397,
"learning_rate": 4.255371141448272e-06,
"loss": 0.0015,
"step": 192400
},
{
"epoch": 8.813799892402962,
"grad_norm": 0.03604700043797493,
"learning_rate": 4.223168372302189e-06,
"loss": 0.0019,
"step": 192500
},
{
"epoch": 8.818378489749666,
"grad_norm": 0.023059792816638947,
"learning_rate": 4.191082541003849e-06,
"loss": 0.0009,
"step": 192600
},
{
"epoch": 8.822957087096368,
"grad_norm": 0.04644302278757095,
"learning_rate": 4.159113729517184e-06,
"loss": 0.0023,
"step": 192700
},
{
"epoch": 8.82753568444307,
"grad_norm": 0.038498032838106155,
"learning_rate": 4.127262019507145e-06,
"loss": 0.0017,
"step": 192800
},
{
"epoch": 8.832114281789774,
"grad_norm": 0.010661243461072445,
"learning_rate": 4.095527492339596e-06,
"loss": 0.0017,
"step": 192900
},
{
"epoch": 8.836692879136477,
"grad_norm": 0.03207453712821007,
"learning_rate": 4.0639102290810135e-06,
"loss": 0.0024,
"step": 193000
},
{
"epoch": 8.841271476483179,
"grad_norm": 0.1272786557674408,
"learning_rate": 4.032410310498358e-06,
"loss": 0.0015,
"step": 193100
},
{
"epoch": 8.845850073829883,
"grad_norm": 0.004953332711011171,
"learning_rate": 4.001027817058789e-06,
"loss": 0.0015,
"step": 193200
},
{
"epoch": 8.850428671176585,
"grad_norm": 0.08756324648857117,
"learning_rate": 3.969762828929547e-06,
"loss": 0.0006,
"step": 193300
},
{
"epoch": 8.855007268523288,
"grad_norm": 0.5247501134872437,
"learning_rate": 3.938615425977676e-06,
"loss": 0.0018,
"step": 193400
},
{
"epoch": 8.85958586586999,
"grad_norm": 0.0369555726647377,
"learning_rate": 3.907585687769838e-06,
"loss": 0.0012,
"step": 193500
},
{
"epoch": 8.864164463216694,
"grad_norm": 0.13189056515693665,
"learning_rate": 3.876673693572147e-06,
"loss": 0.0009,
"step": 193600
},
{
"epoch": 8.868743060563396,
"grad_norm": 0.022357501089572906,
"learning_rate": 3.84587952234991e-06,
"loss": 0.001,
"step": 193700
},
{
"epoch": 8.873321657910099,
"grad_norm": 0.0008908796007744968,
"learning_rate": 3.815203252767463e-06,
"loss": 0.001,
"step": 193800
},
{
"epoch": 8.877900255256803,
"grad_norm": 0.003647018224000931,
"learning_rate": 3.7846449631879667e-06,
"loss": 0.0017,
"step": 193900
},
{
"epoch": 8.882478852603505,
"grad_norm": 0.427442729473114,
"learning_rate": 3.754204731673194e-06,
"loss": 0.0018,
"step": 194000
},
{
"epoch": 8.887057449950207,
"grad_norm": 0.07006958872079849,
"learning_rate": 3.723882635983328e-06,
"loss": 0.0018,
"step": 194100
},
{
"epoch": 8.891636047296911,
"grad_norm": 0.18033552169799805,
"learning_rate": 3.6936787535767903e-06,
"loss": 0.002,
"step": 194200
},
{
"epoch": 8.896214644643614,
"grad_norm": 0.6087344288825989,
"learning_rate": 3.6635931616100073e-06,
"loss": 0.0016,
"step": 194300
},
{
"epoch": 8.900793241990316,
"grad_norm": 0.14380215108394623,
"learning_rate": 3.6336259369372296e-06,
"loss": 0.0019,
"step": 194400
},
{
"epoch": 8.905371839337018,
"grad_norm": 0.09099259227514267,
"learning_rate": 3.6037771561103496e-06,
"loss": 0.0007,
"step": 194500
},
{
"epoch": 8.909950436683722,
"grad_norm": 0.3938591480255127,
"learning_rate": 3.5740468953786855e-06,
"loss": 0.002,
"step": 194600
},
{
"epoch": 8.914529034030425,
"grad_norm": 0.024904364719986916,
"learning_rate": 3.544435230688792e-06,
"loss": 0.0007,
"step": 194700
},
{
"epoch": 8.919107631377127,
"grad_norm": 0.0034130678977817297,
"learning_rate": 3.514942237684271e-06,
"loss": 0.0015,
"step": 194800
},
{
"epoch": 8.923686228723831,
"grad_norm": 0.007133205886930227,
"learning_rate": 3.485567991705563e-06,
"loss": 0.0012,
"step": 194900
},
{
"epoch": 8.928264826070533,
"grad_norm": 0.0010082671651616693,
"learning_rate": 3.4563125677897932e-06,
"loss": 0.0016,
"step": 195000
},
{
"epoch": 8.932843423417236,
"grad_norm": 0.011788592673838139,
"learning_rate": 3.427176040670521e-06,
"loss": 0.0023,
"step": 195100
},
{
"epoch": 8.93742202076394,
"grad_norm": 0.11746617406606674,
"learning_rate": 3.3981584847776026e-06,
"loss": 0.0014,
"step": 195200
},
{
"epoch": 8.942000618110642,
"grad_norm": 0.06418687850236893,
"learning_rate": 3.369259974236988e-06,
"loss": 0.0018,
"step": 195300
},
{
"epoch": 8.946579215457344,
"grad_norm": 0.012977411039173603,
"learning_rate": 3.340480582870503e-06,
"loss": 0.0014,
"step": 195400
},
{
"epoch": 8.951157812804048,
"grad_norm": 0.0007548317080363631,
"learning_rate": 3.311820384195674e-06,
"loss": 0.0013,
"step": 195500
},
{
"epoch": 8.95573641015075,
"grad_norm": 0.004369072150439024,
"learning_rate": 3.2832794514255803e-06,
"loss": 0.0011,
"step": 195600
},
{
"epoch": 8.960315007497453,
"grad_norm": 0.14467285573482513,
"learning_rate": 3.2548578574686018e-06,
"loss": 0.0016,
"step": 195700
},
{
"epoch": 8.964893604844155,
"grad_norm": 0.018095914274454117,
"learning_rate": 3.2265556749282834e-06,
"loss": 0.0013,
"step": 195800
},
{
"epoch": 8.96947220219086,
"grad_norm": 0.031725652515888214,
"learning_rate": 3.198372976103137e-06,
"loss": 0.0013,
"step": 195900
},
{
"epoch": 8.974050799537562,
"grad_norm": 0.12283707410097122,
"learning_rate": 3.1703098329864233e-06,
"loss": 0.0019,
"step": 196000
},
{
"epoch": 8.978629396884264,
"grad_norm": 0.0016571872401982546,
"learning_rate": 3.1423663172660267e-06,
"loss": 0.002,
"step": 196100
},
{
"epoch": 8.983207994230968,
"grad_norm": 0.005055413115769625,
"learning_rate": 3.114542500324219e-06,
"loss": 0.001,
"step": 196200
},
{
"epoch": 8.98778659157767,
"grad_norm": 0.008997324854135513,
"learning_rate": 3.086838453237506e-06,
"loss": 0.0007,
"step": 196300
},
{
"epoch": 8.992365188924373,
"grad_norm": 0.42291346192359924,
"learning_rate": 3.059254246776433e-06,
"loss": 0.0017,
"step": 196400
},
{
"epoch": 8.996943786271077,
"grad_norm": 0.23169781267642975,
"learning_rate": 3.0317899514054336e-06,
"loss": 0.0015,
"step": 196500
},
{
"epoch": 8.9999656605199,
"eval_loss": 0.2418144792318344,
"eval_runtime": 261.8983,
"eval_samples_per_second": 21.001,
"eval_steps_per_second": 21.001,
"step": 196566
},
{
"epoch": 9.001522383617779,
"grad_norm": 0.08088653534650803,
"learning_rate": 3.0044456372825992e-06,
"loss": 0.0019,
"step": 196600
},
{
"epoch": 9.006100980964481,
"grad_norm": 0.13099326193332672,
"learning_rate": 2.9772213742595367e-06,
"loss": 0.001,
"step": 196700
},
{
"epoch": 9.010679578311183,
"grad_norm": 0.008611609227955341,
"learning_rate": 2.950117231881183e-06,
"loss": 0.0008,
"step": 196800
},
{
"epoch": 9.015258175657888,
"grad_norm": 0.6876717209815979,
"learning_rate": 2.923133279385615e-06,
"loss": 0.0006,
"step": 196900
},
{
"epoch": 9.01983677300459,
"grad_norm": 0.2837451100349426,
"learning_rate": 2.8962695857038922e-06,
"loss": 0.0009,
"step": 197000
},
{
"epoch": 9.024415370351292,
"grad_norm": 0.012697268277406693,
"learning_rate": 2.8695262194598615e-06,
"loss": 0.0011,
"step": 197100
},
{
"epoch": 9.028993967697996,
"grad_norm": 0.002202101983129978,
"learning_rate": 2.8429032489700135e-06,
"loss": 0.0007,
"step": 197200
},
{
"epoch": 9.033572565044699,
"grad_norm": 0.00148550805170089,
"learning_rate": 2.8164007422432583e-06,
"loss": 0.001,
"step": 197300
},
{
"epoch": 9.0381511623914,
"grad_norm": 0.002804514952003956,
"learning_rate": 2.790018766980773e-06,
"loss": 0.0008,
"step": 197400
},
{
"epoch": 9.042729759738105,
"grad_norm": 0.004013043362647295,
"learning_rate": 2.763757390575872e-06,
"loss": 0.0013,
"step": 197500
},
{
"epoch": 9.047308357084807,
"grad_norm": 0.3350330591201782,
"learning_rate": 2.737616680113758e-06,
"loss": 0.0013,
"step": 197600
},
{
"epoch": 9.05188695443151,
"grad_norm": 0.2957717478275299,
"learning_rate": 2.7115967023714215e-06,
"loss": 0.0009,
"step": 197700
},
{
"epoch": 9.056465551778214,
"grad_norm": 0.13251306116580963,
"learning_rate": 2.6856975238174266e-06,
"loss": 0.0008,
"step": 197800
},
{
"epoch": 9.061044149124916,
"grad_norm": 0.0039755236357450485,
"learning_rate": 2.6599192106117333e-06,
"loss": 0.0021,
"step": 197900
},
{
"epoch": 9.065622746471618,
"grad_norm": 0.040325064212083817,
"learning_rate": 2.634261828605594e-06,
"loss": 0.001,
"step": 198000
},
{
"epoch": 9.07020134381832,
"grad_norm": 0.001606648089364171,
"learning_rate": 2.608725443341292e-06,
"loss": 0.0014,
"step": 198100
},
{
"epoch": 9.074779941165025,
"grad_norm": 0.0010452588321641088,
"learning_rate": 2.583310120052046e-06,
"loss": 0.0012,
"step": 198200
},
{
"epoch": 9.079358538511727,
"grad_norm": 0.06777454912662506,
"learning_rate": 2.5580159236618162e-06,
"loss": 0.0006,
"step": 198300
},
{
"epoch": 9.083937135858429,
"grad_norm": 0.014749701134860516,
"learning_rate": 2.5328429187851552e-06,
"loss": 0.0012,
"step": 198400
},
{
"epoch": 9.088515733205133,
"grad_norm": 0.0003846607287414372,
"learning_rate": 2.507791169727003e-06,
"loss": 0.0008,
"step": 198500
},
{
"epoch": 9.093094330551835,
"grad_norm": 0.004392546135932207,
"learning_rate": 2.4828607404825677e-06,
"loss": 0.0006,
"step": 198600
},
{
"epoch": 9.097672927898538,
"grad_norm": 0.006986264605075121,
"learning_rate": 2.4580516947371348e-06,
"loss": 0.001,
"step": 198700
},
{
"epoch": 9.102251525245242,
"grad_norm": 0.009725336916744709,
"learning_rate": 2.4333640958659143e-06,
"loss": 0.0007,
"step": 198800
},
{
"epoch": 9.106830122591944,
"grad_norm": 0.02167440392076969,
"learning_rate": 2.408798006933882e-06,
"loss": 0.001,
"step": 198900
},
{
"epoch": 9.111408719938646,
"grad_norm": 0.054156869649887085,
"learning_rate": 2.3843534906956123e-06,
"loss": 0.0013,
"step": 199000
},
{
"epoch": 9.11598731728535,
"grad_norm": 0.011062448844313622,
"learning_rate": 2.3600306095951264e-06,
"loss": 0.0013,
"step": 199100
},
{
"epoch": 9.120565914632053,
"grad_norm": 0.0029075967613607645,
"learning_rate": 2.335829425765712e-06,
"loss": 0.0015,
"step": 199200
},
{
"epoch": 9.125144511978755,
"grad_norm": 0.013693880289793015,
"learning_rate": 2.311750001029783e-06,
"loss": 0.0014,
"step": 199300
},
{
"epoch": 9.129723109325457,
"grad_norm": 0.011990150436758995,
"learning_rate": 2.2877923968987247e-06,
"loss": 0.0011,
"step": 199400
},
{
"epoch": 9.134301706672161,
"grad_norm": 0.01806030236184597,
"learning_rate": 2.2639566745727205e-06,
"loss": 0.0007,
"step": 199500
},
{
"epoch": 9.138880304018864,
"grad_norm": 0.005009980872273445,
"learning_rate": 2.2402428949406086e-06,
"loss": 0.0007,
"step": 199600
},
{
"epoch": 9.143458901365566,
"grad_norm": 0.031974907964468,
"learning_rate": 2.216651118579727e-06,
"loss": 0.0013,
"step": 199700
},
{
"epoch": 9.14803749871227,
"grad_norm": 0.0008488456369377673,
"learning_rate": 2.19318140575574e-06,
"loss": 0.0009,
"step": 199800
},
{
"epoch": 9.152616096058972,
"grad_norm": 0.12342657893896103,
"learning_rate": 2.169833816422517e-06,
"loss": 0.001,
"step": 199900
},
{
"epoch": 9.157194693405675,
"grad_norm": 0.0888073593378067,
"learning_rate": 2.1466084102219452e-06,
"loss": 0.0011,
"step": 200000
},
{
"epoch": 9.161773290752379,
"grad_norm": 0.0031123904045671225,
"learning_rate": 2.123505246483787e-06,
"loss": 0.0012,
"step": 200100
},
{
"epoch": 9.166351888099081,
"grad_norm": 0.021208738908171654,
"learning_rate": 2.100524384225555e-06,
"loss": 0.001,
"step": 200200
},
{
"epoch": 9.170930485445783,
"grad_norm": 0.025913584977388382,
"learning_rate": 2.077665882152335e-06,
"loss": 0.0012,
"step": 200300
},
{
"epoch": 9.175509082792486,
"grad_norm": 0.16090908646583557,
"learning_rate": 2.0549297986566186e-06,
"loss": 0.0014,
"step": 200400
},
{
"epoch": 9.18008768013919,
"grad_norm": 0.08480704575777054,
"learning_rate": 2.032316191818212e-06,
"loss": 0.0018,
"step": 200500
},
{
"epoch": 9.184666277485892,
"grad_norm": 0.0889834314584732,
"learning_rate": 2.009825119404024e-06,
"loss": 0.0012,
"step": 200600
},
{
"epoch": 9.189244874832594,
"grad_norm": 0.10864217579364777,
"learning_rate": 1.9874566388679518e-06,
"loss": 0.001,
"step": 200700
},
{
"epoch": 9.193823472179298,
"grad_norm": 0.0008274565334431827,
"learning_rate": 1.9652108073507425e-06,
"loss": 0.0011,
"step": 200800
},
{
"epoch": 9.198402069526,
"grad_norm": 0.002116286661475897,
"learning_rate": 1.943087681679823e-06,
"loss": 0.001,
"step": 200900
},
{
"epoch": 9.202980666872703,
"grad_norm": 0.029289819300174713,
"learning_rate": 1.9210873183691692e-06,
"loss": 0.0007,
"step": 201000
},
{
"epoch": 9.207559264219407,
"grad_norm": 0.006239714100956917,
"learning_rate": 1.899209773619154e-06,
"loss": 0.0011,
"step": 201100
},
{
"epoch": 9.21213786156611,
"grad_norm": 0.002388751832768321,
"learning_rate": 1.8774551033164112e-06,
"loss": 0.0009,
"step": 201200
},
{
"epoch": 9.216716458912812,
"grad_norm": 0.15311861038208008,
"learning_rate": 1.8558233630336929e-06,
"loss": 0.0011,
"step": 201300
},
{
"epoch": 9.221295056259516,
"grad_norm": 0.20378436148166656,
"learning_rate": 1.8343146080297135e-06,
"loss": 0.0007,
"step": 201400
},
{
"epoch": 9.225873653606218,
"grad_norm": 0.01194208487868309,
"learning_rate": 1.8129288932490274e-06,
"loss": 0.0008,
"step": 201500
},
{
"epoch": 9.23045225095292,
"grad_norm": 0.002687977161258459,
"learning_rate": 1.7916662733218847e-06,
"loss": 0.001,
"step": 201600
},
{
"epoch": 9.235030848299623,
"grad_norm": 0.0027624531649053097,
"learning_rate": 1.7705268025640709e-06,
"loss": 0.0005,
"step": 201700
},
{
"epoch": 9.239609445646327,
"grad_norm": 0.04479651898145676,
"learning_rate": 1.7495105349767948e-06,
"loss": 0.0012,
"step": 201800
},
{
"epoch": 9.244188042993029,
"grad_norm": 0.21192124485969543,
"learning_rate": 1.7286175242465509e-06,
"loss": 0.0012,
"step": 201900
},
{
"epoch": 9.248766640339731,
"grad_norm": 0.0029038949869573116,
"learning_rate": 1.7078478237449402e-06,
"loss": 0.0008,
"step": 202000
},
{
"epoch": 9.253345237686435,
"grad_norm": 0.009675376117229462,
"learning_rate": 1.6872014865286057e-06,
"loss": 0.0013,
"step": 202100
},
{
"epoch": 9.257923835033138,
"grad_norm": 0.007381136529147625,
"learning_rate": 1.6666785653390249e-06,
"loss": 0.001,
"step": 202200
},
{
"epoch": 9.26250243237984,
"grad_norm": 0.001989328535273671,
"learning_rate": 1.6462791126024169e-06,
"loss": 0.0007,
"step": 202300
},
{
"epoch": 9.267081029726544,
"grad_norm": 0.0015792534686625004,
"learning_rate": 1.6260031804296084e-06,
"loss": 0.0008,
"step": 202400
},
{
"epoch": 9.271659627073246,
"grad_norm": 0.0014649959048256278,
"learning_rate": 1.6058508206158728e-06,
"loss": 0.0008,
"step": 202500
},
{
"epoch": 9.276238224419949,
"grad_norm": 0.017108794301748276,
"learning_rate": 1.58582208464082e-06,
"loss": 0.0009,
"step": 202600
},
{
"epoch": 9.280816821766653,
"grad_norm": 0.0022038191091269255,
"learning_rate": 1.5659170236682674e-06,
"loss": 0.0007,
"step": 202700
},
{
"epoch": 9.285395419113355,
"grad_norm": 0.002022168133407831,
"learning_rate": 1.5461356885461075e-06,
"loss": 0.0014,
"step": 202800
},
{
"epoch": 9.289974016460057,
"grad_norm": 0.003931309096515179,
"learning_rate": 1.5264781298061415e-06,
"loss": 0.0013,
"step": 202900
},
{
"epoch": 9.29455261380676,
"grad_norm": 0.13134440779685974,
"learning_rate": 1.5069443976640284e-06,
"loss": 0.0009,
"step": 203000
},
{
"epoch": 9.299131211153464,
"grad_norm": 0.004310674965381622,
"learning_rate": 1.4875345420190645e-06,
"loss": 0.0012,
"step": 203100
},
{
"epoch": 9.303709808500166,
"grad_norm": 0.00843301322311163,
"learning_rate": 1.4682486124541373e-06,
"loss": 0.0011,
"step": 203200
},
{
"epoch": 9.308288405846868,
"grad_norm": 0.01970786415040493,
"learning_rate": 1.4490866582355267e-06,
"loss": 0.0015,
"step": 203300
},
{
"epoch": 9.312867003193572,
"grad_norm": 0.012120225466787815,
"learning_rate": 1.4300487283128495e-06,
"loss": 0.0011,
"step": 203400
},
{
"epoch": 9.317445600540275,
"grad_norm": 0.0033403183333575726,
"learning_rate": 1.4111348713188866e-06,
"loss": 0.0007,
"step": 203500
},
{
"epoch": 9.322024197886977,
"grad_norm": 0.07732047885656357,
"learning_rate": 1.3923451355694617e-06,
"loss": 0.0008,
"step": 203600
},
{
"epoch": 9.326602795233681,
"grad_norm": 0.004604123532772064,
"learning_rate": 1.3736795690633354e-06,
"loss": 0.0011,
"step": 203700
},
{
"epoch": 9.331181392580383,
"grad_norm": 0.003377101384103298,
"learning_rate": 1.3551382194820884e-06,
"loss": 0.0008,
"step": 203800
},
{
"epoch": 9.335759989927086,
"grad_norm": 0.0024128479417413473,
"learning_rate": 1.3367211341899667e-06,
"loss": 0.0009,
"step": 203900
},
{
"epoch": 9.340338587273788,
"grad_norm": 0.01782609149813652,
"learning_rate": 1.3184283602337865e-06,
"loss": 0.001,
"step": 204000
},
{
"epoch": 9.344917184620492,
"grad_norm": 0.0036396984942257404,
"learning_rate": 1.3002599443428243e-06,
"loss": 0.0009,
"step": 204100
},
{
"epoch": 9.349495781967194,
"grad_norm": 0.08581870794296265,
"learning_rate": 1.2822159329286598e-06,
"loss": 0.0009,
"step": 204200
},
{
"epoch": 9.354074379313897,
"grad_norm": 0.044847775250673294,
"learning_rate": 1.264296372085083e-06,
"loss": 0.0011,
"step": 204300
},
{
"epoch": 9.3586529766606,
"grad_norm": 0.001584995654411614,
"learning_rate": 1.2465013075879883e-06,
"loss": 0.0018,
"step": 204400
},
{
"epoch": 9.363231574007303,
"grad_norm": 0.009455603547394276,
"learning_rate": 1.2288307848952186e-06,
"loss": 0.0007,
"step": 204500
},
{
"epoch": 9.367810171354005,
"grad_norm": 0.08216769993305206,
"learning_rate": 1.2112848491464824e-06,
"loss": 0.0012,
"step": 204600
},
{
"epoch": 9.37238876870071,
"grad_norm": 0.00180336635094136,
"learning_rate": 1.1938635451632429e-06,
"loss": 0.0013,
"step": 204700
},
{
"epoch": 9.376967366047412,
"grad_norm": 0.007049913518130779,
"learning_rate": 1.1765669174485684e-06,
"loss": 0.0008,
"step": 204800
},
{
"epoch": 9.381545963394114,
"grad_norm": 0.07437339425086975,
"learning_rate": 1.1593950101870422e-06,
"loss": 0.0006,
"step": 204900
},
{
"epoch": 9.386124560740818,
"grad_norm": 0.0391901396214962,
"learning_rate": 1.1423478672446586e-06,
"loss": 0.001,
"step": 205000
},
{
"epoch": 9.39070315808752,
"grad_norm": 0.16931991279125214,
"learning_rate": 1.1254255321686836e-06,
"loss": 0.0012,
"step": 205100
},
{
"epoch": 9.395281755434223,
"grad_norm": 0.0040185777470469475,
"learning_rate": 1.1086280481875654e-06,
"loss": 0.0012,
"step": 205200
},
{
"epoch": 9.399860352780925,
"grad_norm": 0.1352093517780304,
"learning_rate": 1.0919554582108249e-06,
"loss": 0.0005,
"step": 205300
},
{
"epoch": 9.404438950127629,
"grad_norm": 0.11504676938056946,
"learning_rate": 1.0754078048289374e-06,
"loss": 0.0006,
"step": 205400
},
{
"epoch": 9.409017547474331,
"grad_norm": 0.015873286873102188,
"learning_rate": 1.0589851303132114e-06,
"loss": 0.0017,
"step": 205500
},
{
"epoch": 9.413596144821033,
"grad_norm": 0.12407149374485016,
"learning_rate": 1.0426874766157003e-06,
"loss": 0.001,
"step": 205600
},
{
"epoch": 9.418174742167738,
"grad_norm": 0.002775526139885187,
"learning_rate": 1.0265148853691009e-06,
"loss": 0.0017,
"step": 205700
},
{
"epoch": 9.42275333951444,
"grad_norm": 0.15169379115104675,
"learning_rate": 1.0104673978866164e-06,
"loss": 0.0011,
"step": 205800
},
{
"epoch": 9.427331936861142,
"grad_norm": 0.007750590797513723,
"learning_rate": 9.945450551618884e-07,
"loss": 0.001,
"step": 205900
},
{
"epoch": 9.431910534207846,
"grad_norm": 0.0007150355377234519,
"learning_rate": 9.787478978688646e-07,
"loss": 0.001,
"step": 206000
},
{
"epoch": 9.436489131554548,
"grad_norm": 0.01111397985368967,
"learning_rate": 9.630759663616983e-07,
"loss": 0.0011,
"step": 206100
},
{
"epoch": 9.44106772890125,
"grad_norm": 0.027570601552724838,
"learning_rate": 9.475293006746711e-07,
"loss": 0.0007,
"step": 206200
},
{
"epoch": 9.445646326247953,
"grad_norm": 0.24770981073379517,
"learning_rate": 9.321079405220423e-07,
"loss": 0.0008,
"step": 206300
},
{
"epoch": 9.450224923594657,
"grad_norm": 0.003402331378310919,
"learning_rate": 9.168119252979946e-07,
"loss": 0.0005,
"step": 206400
},
{
"epoch": 9.45480352094136,
"grad_norm": 0.0008960131090134382,
"learning_rate": 9.016412940765106e-07,
"loss": 0.0007,
"step": 206500
},
{
"epoch": 9.459382118288062,
"grad_norm": 0.02515571191906929,
"learning_rate": 8.865960856112799e-07,
"loss": 0.0015,
"step": 206600
},
{
"epoch": 9.463960715634766,
"grad_norm": 0.1351311206817627,
"learning_rate": 8.716763383355864e-07,
"loss": 0.0013,
"step": 206700
},
{
"epoch": 9.468539312981468,
"grad_norm": 0.0024123205803334713,
"learning_rate": 8.568820903622376e-07,
"loss": 0.0012,
"step": 206800
},
{
"epoch": 9.47311791032817,
"grad_norm": 0.010249449871480465,
"learning_rate": 8.422133794834363e-07,
"loss": 0.001,
"step": 206900
},
{
"epoch": 9.477696507674874,
"grad_norm": 0.006125771440565586,
"learning_rate": 8.276702431706973e-07,
"loss": 0.0009,
"step": 207000
},
{
"epoch": 9.482275105021577,
"grad_norm": 0.0066106487065553665,
"learning_rate": 8.132527185747641e-07,
"loss": 0.0008,
"step": 207100
},
{
"epoch": 9.486853702368279,
"grad_norm": 0.010087539441883564,
"learning_rate": 7.989608425254924e-07,
"loss": 0.001,
"step": 207200
},
{
"epoch": 9.491432299714983,
"grad_norm": 0.06039687991142273,
"learning_rate": 7.847946515317839e-07,
"loss": 0.0011,
"step": 207300
},
{
"epoch": 9.496010897061685,
"grad_norm": 0.0053437924943864346,
"learning_rate": 7.707541817814468e-07,
"loss": 0.001,
"step": 207400
},
{
"epoch": 9.500589494408388,
"grad_norm": 0.0029611322097480297,
"learning_rate": 7.568394691411462e-07,
"loss": 0.0005,
"step": 207500
},
{
"epoch": 9.50516809175509,
"grad_norm": 0.007037085480988026,
"learning_rate": 7.4305054915631e-07,
"loss": 0.0007,
"step": 207600
},
{
"epoch": 9.509746689101794,
"grad_norm": 0.0024143033660948277,
"learning_rate": 7.293874570510062e-07,
"loss": 0.001,
"step": 207700
},
{
"epoch": 9.514325286448496,
"grad_norm": 0.0061892117373645306,
"learning_rate": 7.158502277278823e-07,
"loss": 0.0011,
"step": 207800
},
{
"epoch": 9.518903883795199,
"grad_norm": 0.006517268251627684,
"learning_rate": 7.024388957680705e-07,
"loss": 0.0013,
"step": 207900
},
{
"epoch": 9.523482481141903,
"grad_norm": 0.02947130799293518,
"learning_rate": 6.891534954310885e-07,
"loss": 0.0005,
"step": 208000
},
{
"epoch": 9.528061078488605,
"grad_norm": 0.009931573644280434,
"learning_rate": 6.75994060654761e-07,
"loss": 0.0009,
"step": 208100
},
{
"epoch": 9.532639675835307,
"grad_norm": 0.0006516918656416237,
"learning_rate": 6.629606250551368e-07,
"loss": 0.0006,
"step": 208200
},
{
"epoch": 9.537218273182011,
"grad_norm": 0.0033905524760484695,
"learning_rate": 6.500532219263833e-07,
"loss": 0.0008,
"step": 208300
},
{
"epoch": 9.541796870528714,
"grad_norm": 0.035781797021627426,
"learning_rate": 6.372718842407255e-07,
"loss": 0.001,
"step": 208400
},
{
"epoch": 9.546375467875416,
"grad_norm": 0.006109519395977259,
"learning_rate": 6.24616644648357e-07,
"loss": 0.0013,
"step": 208500
},
{
"epoch": 9.550954065222118,
"grad_norm": 0.10211105644702911,
"learning_rate": 6.120875354773459e-07,
"loss": 0.0008,
"step": 208600
},
{
"epoch": 9.555532662568822,
"grad_norm": 0.002378986682742834,
"learning_rate": 5.996845887335511e-07,
"loss": 0.0011,
"step": 208700
},
{
"epoch": 9.560111259915525,
"grad_norm": 0.007951854728162289,
"learning_rate": 5.874078361005564e-07,
"loss": 0.0012,
"step": 208800
},
{
"epoch": 9.564689857262227,
"grad_norm": 0.01049245335161686,
"learning_rate": 5.75257308939564e-07,
"loss": 0.0009,
"step": 208900
},
{
"epoch": 9.569268454608931,
"grad_norm": 0.00224009295925498,
"learning_rate": 5.632330382893569e-07,
"loss": 0.0005,
"step": 209000
},
{
"epoch": 9.573847051955633,
"grad_norm": 0.004827695898711681,
"learning_rate": 5.513350548661811e-07,
"loss": 0.0007,
"step": 209100
},
{
"epoch": 9.578425649302336,
"grad_norm": 0.021496234461665154,
"learning_rate": 5.395633890636631e-07,
"loss": 0.0007,
"step": 209200
},
{
"epoch": 9.58300424664904,
"grad_norm": 0.001609979895874858,
"learning_rate": 5.279180709527765e-07,
"loss": 0.0009,
"step": 209300
},
{
"epoch": 9.587582843995742,
"grad_norm": 0.006100552622228861,
"learning_rate": 5.163991302817139e-07,
"loss": 0.0012,
"step": 209400
},
{
"epoch": 9.592161441342444,
"grad_norm": 0.0038456227630376816,
"learning_rate": 5.050065964758488e-07,
"loss": 0.0009,
"step": 209500
},
{
"epoch": 9.596740038689148,
"grad_norm": 0.0012561274925246835,
"learning_rate": 4.937404986376348e-07,
"loss": 0.0011,
"step": 209600
},
{
"epoch": 9.60131863603585,
"grad_norm": 0.007517179474234581,
"learning_rate": 4.826008655465508e-07,
"loss": 0.0011,
"step": 209700
},
{
"epoch": 9.605897233382553,
"grad_norm": 0.16051574051380157,
"learning_rate": 4.7158772565902843e-07,
"loss": 0.0003,
"step": 209800
},
{
"epoch": 9.610475830729255,
"grad_norm": 0.004418348427861929,
"learning_rate": 4.6070110710834116e-07,
"loss": 0.0013,
"step": 209900
},
{
"epoch": 9.61505442807596,
"grad_norm": 0.003388006007298827,
"learning_rate": 4.4994103770457653e-07,
"loss": 0.0007,
"step": 210000
},
{
"epoch": 9.619633025422662,
"grad_norm": 0.1264602690935135,
"learning_rate": 4.3930754493456403e-07,
"loss": 0.0006,
"step": 210100
},
{
"epoch": 9.624211622769364,
"grad_norm": 0.0033033695071935654,
"learning_rate": 4.2880065596176967e-07,
"loss": 0.0009,
"step": 210200
},
{
"epoch": 9.628790220116068,
"grad_norm": 0.002471612999215722,
"learning_rate": 4.184203976262513e-07,
"loss": 0.0011,
"step": 210300
},
{
"epoch": 9.63336881746277,
"grad_norm": 0.02394956909120083,
"learning_rate": 4.081667964446034e-07,
"loss": 0.0009,
"step": 210400
},
{
"epoch": 9.637947414809473,
"grad_norm": 0.0018725660629570484,
"learning_rate": 3.980398786098405e-07,
"loss": 0.0013,
"step": 210500
},
{
"epoch": 9.642526012156177,
"grad_norm": 0.0007891812711022794,
"learning_rate": 3.8803966999139684e-07,
"loss": 0.0006,
"step": 210600
},
{
"epoch": 9.647104609502879,
"grad_norm": 0.01715545915067196,
"learning_rate": 3.7816619613499913e-07,
"loss": 0.0017,
"step": 210700
},
{
"epoch": 9.651683206849581,
"grad_norm": 0.004238943103700876,
"learning_rate": 3.6841948226263854e-07,
"loss": 0.0009,
"step": 210800
},
{
"epoch": 9.656261804196284,
"grad_norm": 0.07635319977998734,
"learning_rate": 3.587995532724986e-07,
"loss": 0.0013,
"step": 210900
},
{
"epoch": 9.660840401542988,
"grad_norm": 0.016986342146992683,
"learning_rate": 3.493064337388774e-07,
"loss": 0.0004,
"step": 211000
},
{
"epoch": 9.66541899888969,
"grad_norm": 0.004902221262454987,
"learning_rate": 3.399401479121489e-07,
"loss": 0.0009,
"step": 211100
},
{
"epoch": 9.669997596236392,
"grad_norm": 0.004907084163278341,
"learning_rate": 3.30700719718674e-07,
"loss": 0.0011,
"step": 211200
},
{
"epoch": 9.674576193583096,
"grad_norm": 0.0083727166056633,
"learning_rate": 3.215881727607617e-07,
"loss": 0.0015,
"step": 211300
},
{
"epoch": 9.679154790929799,
"grad_norm": 0.005267091561108828,
"learning_rate": 3.126025303166025e-07,
"loss": 0.0009,
"step": 211400
},
{
"epoch": 9.6837333882765,
"grad_norm": 0.0178202036768198,
"learning_rate": 3.0374381534019613e-07,
"loss": 0.0009,
"step": 211500
},
{
"epoch": 9.688311985623205,
"grad_norm": 0.05054371431469917,
"learning_rate": 2.9501205046131295e-07,
"loss": 0.0006,
"step": 211600
},
{
"epoch": 9.692890582969907,
"grad_norm": 0.01484039518982172,
"learning_rate": 2.8640725798543266e-07,
"loss": 0.0009,
"step": 211700
},
{
"epoch": 9.69746918031661,
"grad_norm": 0.1556658148765564,
"learning_rate": 2.7792945989366105e-07,
"loss": 0.0013,
"step": 211800
},
{
"epoch": 9.702047777663314,
"grad_norm": 0.0019681896083056927,
"learning_rate": 2.6957867784270787e-07,
"loss": 0.0008,
"step": 211900
},
{
"epoch": 9.706626375010016,
"grad_norm": 0.014196610078215599,
"learning_rate": 2.6135493316482017e-07,
"loss": 0.0008,
"step": 212000
},
{
"epoch": 9.711204972356718,
"grad_norm": 0.0016565111000090837,
"learning_rate": 2.532582468677214e-07,
"loss": 0.0012,
"step": 212100
},
{
"epoch": 9.715783569703422,
"grad_norm": 0.0729876384139061,
"learning_rate": 2.452886396345555e-07,
"loss": 0.0008,
"step": 212200
},
{
"epoch": 9.720362167050125,
"grad_norm": 0.05016588792204857,
"learning_rate": 2.3744613182384856e-07,
"loss": 0.0008,
"step": 212300
},
{
"epoch": 9.724940764396827,
"grad_norm": 0.0004378720186650753,
"learning_rate": 2.2973074346944734e-07,
"loss": 0.001,
"step": 212400
},
{
"epoch": 9.72951936174353,
"grad_norm": 0.005694561637938023,
"learning_rate": 2.2214249428046952e-07,
"loss": 0.001,
"step": 212500
},
{
"epoch": 9.734097959090233,
"grad_norm": 0.006911196745932102,
"learning_rate": 2.1468140364125367e-07,
"loss": 0.0015,
"step": 212600
},
{
"epoch": 9.738676556436936,
"grad_norm": 0.002870377618819475,
"learning_rate": 2.0734749061130377e-07,
"loss": 0.0012,
"step": 212700
},
{
"epoch": 9.743255153783638,
"grad_norm": 0.029163073748350143,
"learning_rate": 2.0014077392525031e-07,
"loss": 0.0007,
"step": 212800
},
{
"epoch": 9.747833751130342,
"grad_norm": 0.5740547180175781,
"learning_rate": 1.930612719927949e-07,
"loss": 0.0013,
"step": 212900
},
{
"epoch": 9.752412348477044,
"grad_norm": 0.017990708351135254,
"learning_rate": 1.8610900289867673e-07,
"loss": 0.0009,
"step": 213000
},
{
"epoch": 9.756990945823746,
"grad_norm": 0.0017326328670606017,
"learning_rate": 1.792839844026062e-07,
"loss": 0.001,
"step": 213100
},
{
"epoch": 9.76156954317045,
"grad_norm": 0.0013371937675401568,
"learning_rate": 1.725862339392259e-07,
"loss": 0.0008,
"step": 213200
},
{
"epoch": 9.766148140517153,
"grad_norm": 0.0008829734288156033,
"learning_rate": 1.66015768618083e-07,
"loss": 0.0017,
"step": 213300
},
{
"epoch": 9.770726737863855,
"grad_norm": 0.09034759551286697,
"learning_rate": 1.5957260522356243e-07,
"loss": 0.0009,
"step": 213400
},
{
"epoch": 9.775305335210557,
"grad_norm": 0.0011923068668693304,
"learning_rate": 1.5325676021484825e-07,
"loss": 0.001,
"step": 213500
},
{
"epoch": 9.779883932557262,
"grad_norm": 0.0013882212806493044,
"learning_rate": 1.4706824972591238e-07,
"loss": 0.001,
"step": 213600
},
{
"epoch": 9.784462529903964,
"grad_norm": 0.004688725806772709,
"learning_rate": 1.410070895654203e-07,
"loss": 0.0012,
"step": 213700
},
{
"epoch": 9.789041127250666,
"grad_norm": 0.0011394763132557273,
"learning_rate": 1.3507329521672552e-07,
"loss": 0.0005,
"step": 213800
},
{
"epoch": 9.79361972459737,
"grad_norm": 0.015895133838057518,
"learning_rate": 1.2926688183783066e-07,
"loss": 0.0006,
"step": 213900
},
{
"epoch": 9.798198321944072,
"grad_norm": 0.032990917563438416,
"learning_rate": 1.235878642613375e-07,
"loss": 0.0006,
"step": 214000
},
{
"epoch": 9.802776919290775,
"grad_norm": 0.012515276670455933,
"learning_rate": 1.1803625699440824e-07,
"loss": 0.001,
"step": 214100
},
{
"epoch": 9.807355516637479,
"grad_norm": 0.002379771787673235,
"learning_rate": 1.1261207421874309e-07,
"loss": 0.0007,
"step": 214200
},
{
"epoch": 9.811934113984181,
"grad_norm": 0.09647377580404282,
"learning_rate": 1.0731532979051939e-07,
"loss": 0.0007,
"step": 214300
},
{
"epoch": 9.816512711330883,
"grad_norm": 0.0067366501316428185,
"learning_rate": 1.021460372403915e-07,
"loss": 0.0012,
"step": 214400
},
{
"epoch": 9.821091308677588,
"grad_norm": 0.005677223205566406,
"learning_rate": 9.710420977340762e-08,
"loss": 0.0015,
"step": 214500
},
{
"epoch": 9.82566990602429,
"grad_norm": 0.004333006218075752,
"learning_rate": 9.218986026902632e-08,
"loss": 0.0014,
"step": 214600
},
{
"epoch": 9.830248503370992,
"grad_norm": 0.0027217718306928873,
"learning_rate": 8.740300128105005e-08,
"loss": 0.0003,
"step": 214700
},
{
"epoch": 9.834827100717694,
"grad_norm": 0.30776289105415344,
"learning_rate": 8.274364503760845e-08,
"loss": 0.0014,
"step": 214800
},
{
"epoch": 9.839405698064398,
"grad_norm": 0.005994019098579884,
"learning_rate": 7.8211803441125e-08,
"loss": 0.0009,
"step": 214900
},
{
"epoch": 9.8439842954111,
"grad_norm": 0.0010411434341222048,
"learning_rate": 7.380748806827819e-08,
"loss": 0.0011,
"step": 215000
},
{
"epoch": 9.848562892757803,
"grad_norm": 0.04015972465276718,
"learning_rate": 6.953071016998491e-08,
"loss": 0.0005,
"step": 215100
},
{
"epoch": 9.853141490104507,
"grad_norm": 0.0010900140041485429,
"learning_rate": 6.538148067135596e-08,
"loss": 0.001,
"step": 215200
},
{
"epoch": 9.85772008745121,
"grad_norm": 0.5897096991539001,
"learning_rate": 6.135981017167947e-08,
"loss": 0.0007,
"step": 215300
},
{
"epoch": 9.862298684797912,
"grad_norm": 0.03506353124976158,
"learning_rate": 5.7465708944404175e-08,
"loss": 0.0006,
"step": 215400
},
{
"epoch": 9.866877282144616,
"grad_norm": 0.042582686990499496,
"learning_rate": 5.3699186937089526e-08,
"loss": 0.001,
"step": 215500
},
{
"epoch": 9.871455879491318,
"grad_norm": 0.008183780126273632,
"learning_rate": 5.006025377138901e-08,
"loss": 0.0009,
"step": 215600
},
{
"epoch": 9.87603447683802,
"grad_norm": 0.0004912464646622539,
"learning_rate": 4.6548918743033464e-08,
"loss": 0.0013,
"step": 215700
},
{
"epoch": 9.880613074184723,
"grad_norm": 0.008000398054718971,
"learning_rate": 4.316519082179227e-08,
"loss": 0.0008,
"step": 215800
},
{
"epoch": 9.885191671531427,
"grad_norm": 0.0017890778835862875,
"learning_rate": 3.9909078651478856e-08,
"loss": 0.0008,
"step": 215900
},
{
"epoch": 9.889770268878129,
"grad_norm": 0.002216469496488571,
"learning_rate": 3.678059054988969e-08,
"loss": 0.0008,
"step": 216000
},
{
"epoch": 9.894348866224831,
"grad_norm": 0.07988656312227249,
"learning_rate": 3.377973450881533e-08,
"loss": 0.001,
"step": 216100
},
{
"epoch": 9.898927463571535,
"grad_norm": 0.0008704246138222516,
"learning_rate": 3.0906518194001586e-08,
"loss": 0.0006,
"step": 216200
},
{
"epoch": 9.903506060918238,
"grad_norm": 0.0025461604818701744,
"learning_rate": 2.8160948945138434e-08,
"loss": 0.0012,
"step": 216300
},
{
"epoch": 9.90808465826494,
"grad_norm": 0.00465493043884635,
"learning_rate": 2.554303377584333e-08,
"loss": 0.0006,
"step": 216400
},
{
"epoch": 9.912663255611644,
"grad_norm": 0.0038689495995640755,
"learning_rate": 2.305277937362238e-08,
"loss": 0.0011,
"step": 216500
},
{
"epoch": 9.917241852958346,
"grad_norm": 0.7484769225120544,
"learning_rate": 2.0690192099892535e-08,
"loss": 0.001,
"step": 216600
},
{
"epoch": 9.921820450305049,
"grad_norm": 0.01186487078666687,
"learning_rate": 1.845527798992608e-08,
"loss": 0.0008,
"step": 216700
},
{
"epoch": 9.926399047651753,
"grad_norm": 0.030242715030908585,
"learning_rate": 1.6348042752856173e-08,
"loss": 0.0005,
"step": 216800
},
{
"epoch": 9.930977644998455,
"grad_norm": 0.007519678212702274,
"learning_rate": 1.436849177166022e-08,
"loss": 0.0005,
"step": 216900
},
{
"epoch": 9.935556242345157,
"grad_norm": 0.0013559481594711542,
"learning_rate": 1.2516630103137638e-08,
"loss": 0.0007,
"step": 217000
},
{
"epoch": 9.94013483969186,
"grad_norm": 0.0019348141504451632,
"learning_rate": 1.0792462477909882e-08,
"loss": 0.0008,
"step": 217100
},
{
"epoch": 9.944713437038564,
"grad_norm": 0.004812970757484436,
"learning_rate": 9.195993300398221e-09,
"loss": 0.0015,
"step": 217200
},
{
"epoch": 9.949292034385266,
"grad_norm": 0.0022001820616424084,
"learning_rate": 7.727226648818198e-09,
"loss": 0.0014,
"step": 217300
},
{
"epoch": 9.953870631731968,
"grad_norm": 0.07122460752725601,
"learning_rate": 6.386166275157424e-09,
"loss": 0.0011,
"step": 217400
},
{
"epoch": 9.958449229078672,
"grad_norm": 0.10452170670032501,
"learning_rate": 5.172815605186676e-09,
"loss": 0.0016,
"step": 217500
},
{
"epoch": 9.963027826425375,
"grad_norm": 0.009487117640674114,
"learning_rate": 4.087177738432146e-09,
"loss": 0.001,
"step": 217600
},
{
"epoch": 9.967606423772077,
"grad_norm": 0.0035909826401621103,
"learning_rate": 3.12925544818099e-09,
"loss": 0.0016,
"step": 217700
},
{
"epoch": 9.972185021118781,
"grad_norm": 0.0012206127867102623,
"learning_rate": 2.299051181464673e-09,
"loss": 0.0007,
"step": 217800
},
{
"epoch": 9.976763618465483,
"grad_norm": 0.016972968354821205,
"learning_rate": 1.596567059053422e-09,
"loss": 0.001,
"step": 217900
},
{
"epoch": 9.981342215812186,
"grad_norm": 0.0021455176174640656,
"learning_rate": 1.0218048754617738e-09,
"loss": 0.0006,
"step": 218000
},
{
"epoch": 9.985920813158888,
"grad_norm": 0.0012913336977362633,
"learning_rate": 5.747660989263714e-10,
"loss": 0.001,
"step": 218100
},
{
"epoch": 9.990499410505592,
"grad_norm": 0.0040626926347613335,
"learning_rate": 2.554518714226184e-10,
"loss": 0.001,
"step": 218200
},
{
"epoch": 9.995078007852294,
"grad_norm": 0.0009735480998642743,
"learning_rate": 6.386300864247297e-11,
"loss": 0.0008,
"step": 218300
},
{
"epoch": 9.999656605198997,
"grad_norm": 0.01641685888171196,
"learning_rate": 0.0,
"loss": 0.0009,
"step": 218400
},
{
"epoch": 9.999656605198997,
"eval_loss": 0.29180198907852173,
"eval_runtime": 243.5776,
"eval_samples_per_second": 22.58,
"eval_steps_per_second": 22.58,
"step": 218400
}
],
"logging_steps": 100,
"max_steps": 218400,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.5885525416859468e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}