{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 3289,
  "global_step": 16443,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 24.78779624251123,
      "learning_rate": 6.079027355623101e-10,
      "loss": 2.6939,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 35.873109917628135,
      "learning_rate": 1.9452887537993922e-08,
      "loss": 3.4616,
      "step": 32
    },
    {
      "epoch": 0.01,
      "grad_norm": 54.36320326793996,
      "learning_rate": 3.8905775075987844e-08,
      "loss": 3.544,
      "step": 64
    },
    {
      "epoch": 0.02,
      "grad_norm": 26.680119386381374,
      "learning_rate": 5.8358662613981756e-08,
      "loss": 3.3751,
      "step": 96
    },
    {
      "epoch": 0.02,
      "grad_norm": 21.12845447174909,
      "learning_rate": 7.781155015197569e-08,
      "loss": 3.5534,
      "step": 128
    },
    {
      "epoch": 0.03,
      "grad_norm": 14.326825936063585,
      "learning_rate": 9.72644376899696e-08,
      "loss": 3.2669,
      "step": 160
    },
    {
      "epoch": 0.04,
      "grad_norm": 25.878315201522145,
      "learning_rate": 1.1671732522796351e-07,
      "loss": 2.9778,
      "step": 192
    },
    {
      "epoch": 0.04,
      "grad_norm": 36.45201708374618,
      "learning_rate": 1.3617021276595745e-07,
      "loss": 2.5704,
      "step": 224
    },
    {
      "epoch": 0.05,
      "grad_norm": 23.711566820577637,
      "learning_rate": 1.5562310030395138e-07,
      "loss": 1.3894,
      "step": 256
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.875938314628448,
      "learning_rate": 1.7507598784194527e-07,
      "loss": 0.6738,
      "step": 288
    },
    {
      "epoch": 0.06,
      "grad_norm": 6.241944245269492,
      "learning_rate": 1.945288753799392e-07,
      "loss": 0.5629,
      "step": 320
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.8778836099761524,
      "learning_rate": 2.1398176291793313e-07,
      "loss": 0.4872,
      "step": 352
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.5436867519527993,
      "learning_rate": 2.3343465045592702e-07,
      "loss": 0.4714,
      "step": 384
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.3096231574697645,
      "learning_rate": 2.52887537993921e-07,
      "loss": 0.4378,
      "step": 416
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.9666937909371047,
      "learning_rate": 2.723404255319149e-07,
      "loss": 0.4291,
      "step": 448
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.2115158499878724,
      "learning_rate": 2.917933130699088e-07,
      "loss": 0.3954,
      "step": 480
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.7998977018189959,
      "learning_rate": 3.1124620060790275e-07,
      "loss": 0.4086,
      "step": 512
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.8295831973352625,
      "learning_rate": 3.3069908814589665e-07,
      "loss": 0.3926,
      "step": 544
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.14598569285712,
      "learning_rate": 3.5015197568389055e-07,
      "loss": 0.4026,
      "step": 576
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.5109663276292344,
      "learning_rate": 3.696048632218845e-07,
      "loss": 0.3688,
      "step": 608
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.453154442640789,
      "learning_rate": 3.890577507598784e-07,
      "loss": 0.3919,
      "step": 640
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.6423681780897064,
      "learning_rate": 4.085106382978723e-07,
      "loss": 0.364,
      "step": 672
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.8702893489598191,
      "learning_rate": 4.2796352583586625e-07,
      "loss": 0.3543,
      "step": 704
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.8401881440614818,
      "learning_rate": 4.4741641337386015e-07,
      "loss": 0.3601,
      "step": 736
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.850566734809788,
      "learning_rate": 4.6686930091185405e-07,
      "loss": 0.3566,
      "step": 768
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.6994281914890326,
      "learning_rate": 4.86322188449848e-07,
      "loss": 0.3505,
      "step": 800
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.8680937366461803,
      "learning_rate": 5.05775075987842e-07,
      "loss": 0.3358,
      "step": 832
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.660364495604376,
      "learning_rate": 5.252279635258359e-07,
      "loss": 0.3312,
      "step": 864
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.562680158872079,
      "learning_rate": 5.446808510638298e-07,
      "loss": 0.3346,
      "step": 896
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.6948446357514406,
      "learning_rate": 5.641337386018237e-07,
      "loss": 0.3356,
      "step": 928
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.209363365641442,
      "learning_rate": 5.835866261398176e-07,
      "loss": 0.3279,
      "step": 960
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.6925812671610025,
      "learning_rate": 6.030395136778115e-07,
      "loss": 0.3367,
      "step": 992
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.5391435373855369,
      "learning_rate": 6.224924012158055e-07,
      "loss": 0.3356,
      "step": 1024
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.4744316930455266,
      "learning_rate": 6.419452887537994e-07,
      "loss": 0.3277,
      "step": 1056
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.7620056653926688,
      "learning_rate": 6.613981762917933e-07,
      "loss": 0.3307,
      "step": 1088
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.4025350626099162,
      "learning_rate": 6.808510638297872e-07,
      "loss": 0.3197,
      "step": 1120
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.3405530061214819,
      "learning_rate": 7.003039513677811e-07,
      "loss": 0.3218,
      "step": 1152
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.4535529320190042,
      "learning_rate": 7.197568389057751e-07,
      "loss": 0.3067,
      "step": 1184
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.445157014748904,
      "learning_rate": 7.39209726443769e-07,
      "loss": 0.3087,
      "step": 1216
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.5871727851863946,
      "learning_rate": 7.586626139817629e-07,
      "loss": 0.3252,
      "step": 1248
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.3848383016325827,
      "learning_rate": 7.781155015197568e-07,
      "loss": 0.3113,
      "step": 1280
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.7455453453163468,
      "learning_rate": 7.975683890577507e-07,
      "loss": 0.297,
      "step": 1312
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.3001668041428838,
      "learning_rate": 8.170212765957446e-07,
      "loss": 0.3223,
      "step": 1344
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.2132377152476534,
      "learning_rate": 8.364741641337386e-07,
      "loss": 0.315,
      "step": 1376
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.4384605625322182,
      "learning_rate": 8.559270516717325e-07,
      "loss": 0.3105,
      "step": 1408
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.1957556077083222,
      "learning_rate": 8.753799392097264e-07,
      "loss": 0.3007,
      "step": 1440
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.466257939308454,
      "learning_rate": 8.948328267477203e-07,
      "loss": 0.3034,
      "step": 1472
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.348037306034132,
      "learning_rate": 9.142857142857142e-07,
      "loss": 0.3044,
      "step": 1504
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.3312009756781122,
      "learning_rate": 9.337386018237081e-07,
      "loss": 0.3124,
      "step": 1536
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.5165497951579907,
      "learning_rate": 9.531914893617021e-07,
      "loss": 0.3064,
      "step": 1568
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.2308464212993413,
      "learning_rate": 9.72644376899696e-07,
      "loss": 0.3096,
      "step": 1600
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.4477927464029368,
      "learning_rate": 9.9209726443769e-07,
      "loss": 0.2967,
      "step": 1632
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.6161236994381232,
      "learning_rate": 9.999959323804955e-07,
      "loss": 0.307,
      "step": 1664
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.408597466262752,
      "learning_rate": 9.999706931043734e-07,
      "loss": 0.3045,
      "step": 1696
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.2262804987989535,
      "learning_rate": 9.999223791032621e-07,
      "loss": 0.2982,
      "step": 1728
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.4772857240369206,
      "learning_rate": 9.998509926069568e-07,
      "loss": 0.3085,
      "step": 1760
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.1705510022471537,
      "learning_rate": 9.997565369100982e-07,
      "loss": 0.3102,
      "step": 1792
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.354193914272892,
      "learning_rate": 9.996390163720203e-07,
      "loss": 0.3079,
      "step": 1824
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.6499740915478518,
      "learning_rate": 9.99498436416549e-07,
      "loss": 0.3058,
      "step": 1856
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.4176797632580183,
      "learning_rate": 9.993348035317523e-07,
      "loss": 0.2898,
      "step": 1888
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.221569587599335,
      "learning_rate": 9.991481252696405e-07,
      "loss": 0.2971,
      "step": 1920
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.6858523944684565,
      "learning_rate": 9.98938410245818e-07,
      "loss": 0.3015,
      "step": 1952
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.3071072285598695,
      "learning_rate": 9.987056681390846e-07,
      "loss": 0.3017,
      "step": 1984
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.3559736389856838,
      "learning_rate": 9.984499096909904e-07,
      "loss": 0.2996,
      "step": 2016
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.6577354687629797,
      "learning_rate": 9.981711467053391e-07,
      "loss": 0.2899,
      "step": 2048
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.3911817831183464,
      "learning_rate": 9.97869392047643e-07,
      "loss": 0.2916,
      "step": 2080
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.4068725629362402,
      "learning_rate": 9.9754465964453e-07,
      "loss": 0.3042,
      "step": 2112
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.2845003819775453,
      "learning_rate": 9.971969644831001e-07,
      "loss": 0.2953,
      "step": 2144
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.3034821415736415,
      "learning_rate": 9.968263226102348e-07,
      "loss": 0.2947,
      "step": 2176
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.297243001736353,
      "learning_rate": 9.96432751131855e-07,
      "loss": 0.2849,
      "step": 2208
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.411186926138334,
      "learning_rate": 9.960162682121326e-07,
      "loss": 0.2974,
      "step": 2240
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.2374941457414885,
      "learning_rate": 9.955768930726523e-07,
      "loss": 0.2824,
      "step": 2272
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.2627652146626807,
      "learning_rate": 9.95114645991523e-07,
      "loss": 0.2865,
      "step": 2304
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.3134029095546937,
      "learning_rate": 9.946295483024446e-07,
      "loss": 0.2867,
      "step": 2336
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.125719093465591,
      "learning_rate": 9.9412162239372e-07,
      "loss": 0.292,
      "step": 2368
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.4224011133495338,
      "learning_rate": 9.93590891707225e-07,
      "loss": 0.309,
      "step": 2400
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.558277250619359,
      "learning_rate": 9.930373807373245e-07,
      "loss": 0.289,
      "step": 2432
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.3208871305799181,
      "learning_rate": 9.924611150297428e-07,
      "loss": 0.2883,
      "step": 2464
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.3373963931205746,
      "learning_rate": 9.91862121180384e-07,
      "loss": 0.2808,
      "step": 2496
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.3910055395902359,
      "learning_rate": 9.912404268341051e-07,
      "loss": 0.2932,
      "step": 2528
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.4574028412197861,
      "learning_rate": 9.905960606834404e-07,
      "loss": 0.2947,
      "step": 2560
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.203878457595175,
      "learning_rate": 9.899290524672763e-07,
      "loss": 0.2867,
      "step": 2592
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.4673267738670204,
      "learning_rate": 9.892394329694793e-07,
      "loss": 0.2837,
      "step": 2624
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.6281520652734605,
      "learning_rate": 9.885272340174754e-07,
      "loss": 0.3032,
      "step": 2656
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.2522157670744074,
      "learning_rate": 9.877924884807814e-07,
      "loss": 0.2963,
      "step": 2688
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.4632043067121088,
      "learning_rate": 9.870352302694869e-07,
      "loss": 0.2898,
      "step": 2720
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.398722594166713,
      "learning_rate": 9.86255494332691e-07,
      "loss": 0.2896,
      "step": 2752
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.3916529397644415,
      "learning_rate": 9.854533166568867e-07,
      "loss": 0.2967,
      "step": 2784
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.3208718268032766,
      "learning_rate": 9.846287342643032e-07,
      "loss": 0.2838,
      "step": 2816
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0718990960806063,
      "learning_rate": 9.837817852111949e-07,
      "loss": 0.2942,
      "step": 2848
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.3984255388696125,
      "learning_rate": 9.829125085860858e-07,
      "loss": 0.2832,
      "step": 2880
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.258839102452435,
      "learning_rate": 9.820209445079655e-07,
      "loss": 0.2964,
      "step": 2912
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1619142104791713,
      "learning_rate": 9.811071341244379e-07,
      "loss": 0.2727,
      "step": 2944
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.3494984294429655,
      "learning_rate": 9.801711196098213e-07,
      "loss": 0.2848,
      "step": 2976
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.2501831577892621,
      "learning_rate": 9.792129441632027e-07,
      "loss": 0.2749,
      "step": 3008
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.2580708434458154,
      "learning_rate": 9.782326520064443e-07,
      "loss": 0.2868,
      "step": 3040
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.5601918050638766,
      "learning_rate": 9.772302883821418e-07,
      "loss": 0.292,
      "step": 3072
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.3545732270149806,
      "learning_rate": 9.762058995515362e-07,
      "loss": 0.2895,
      "step": 3104
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.4796071501521761,
      "learning_rate": 9.751595327923802e-07,
      "loss": 0.291,
      "step": 3136
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.3020677021992657,
      "learning_rate": 9.740912363967546e-07,
      "loss": 0.294,
      "step": 3168
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.2903648809439663,
      "learning_rate": 9.730010596688405e-07,
      "loss": 0.2971,
      "step": 3200
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.4322652904369149,
      "learning_rate": 9.718890529226432e-07,
      "loss": 0.2842,
      "step": 3232
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.4918014229559349,
      "learning_rate": 9.707552674796704e-07,
      "loss": 0.2855,
      "step": 3264
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.495160093783148,
      "learning_rate": 9.695997556665642e-07,
      "loss": 0.2961,
      "step": 3296
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.2299397480418435,
      "learning_rate": 9.68422570812685e-07,
      "loss": 0.275,
      "step": 3328
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.4595570985088122,
      "learning_rate": 9.672237672476504e-07,
      "loss": 0.2816,
      "step": 3360
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.2523764393563719,
      "learning_rate": 9.660034002988288e-07,
      "loss": 0.2826,
      "step": 3392
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.1556114989117734,
      "learning_rate": 9.64761526288785e-07,
      "loss": 0.2979,
      "step": 3424
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.2760487310976472,
      "learning_rate": 9.634982025326808e-07,
      "loss": 0.2703,
      "step": 3456
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.1268494754892229,
      "learning_rate": 9.622134873356302e-07,
      "loss": 0.2888,
      "step": 3488
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.5177691734350403,
      "learning_rate": 9.60907439990008e-07,
      "loss": 0.2854,
      "step": 3520
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.4479572539864909,
      "learning_rate": 9.595801207727145e-07,
      "loss": 0.2871,
      "step": 3552
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.3320405553461194,
      "learning_rate": 9.58231590942392e-07,
      "loss": 0.2722,
      "step": 3584
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.2684704306064236,
      "learning_rate": 9.568619127365979e-07,
      "loss": 0.2918,
      "step": 3616
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.153157224662708,
      "learning_rate": 9.554711493689337e-07,
      "loss": 0.2727,
      "step": 3648
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.2053424871803355,
      "learning_rate": 9.540593650261259e-07,
      "loss": 0.2727,
      "step": 3680
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.5070136801134912,
      "learning_rate": 9.526266248650647e-07,
      "loss": 0.2829,
      "step": 3712
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.3626256868915094,
      "learning_rate": 9.511729950097962e-07,
      "loss": 0.2948,
      "step": 3744
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.3413698467022634,
      "learning_rate": 9.496985425484708e-07,
      "loss": 0.2804,
      "step": 3776
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.1666360657022152,
      "learning_rate": 9.482033355302474e-07,
      "loss": 0.2779,
      "step": 3808
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.2681212059953142,
      "learning_rate": 9.466874429621523e-07,
      "loss": 0.2766,
      "step": 3840
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.570024716133246,
      "learning_rate": 9.451509348058943e-07,
      "loss": 0.278,
      "step": 3872
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.348864102054253,
      "learning_rate": 9.435938819746363e-07,
      "loss": 0.2814,
      "step": 3904
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.479630195353394,
      "learning_rate": 9.420163563297221e-07,
      "loss": 0.2768,
      "step": 3936
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.3437829697550214,
      "learning_rate": 9.4041843067736e-07,
      "loss": 0.2796,
      "step": 3968
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.7505808749964085,
      "learning_rate": 9.388001787652626e-07,
      "loss": 0.2804,
      "step": 4000
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.2804732688993112,
      "learning_rate": 9.37161675279243e-07,
      "loss": 0.2671,
      "step": 4032
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.5599945996293794,
      "learning_rate": 9.355029958397686e-07,
      "loss": 0.2778,
      "step": 4064
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.140562030900848,
      "learning_rate": 9.338242169984701e-07,
      "loss": 0.2779,
      "step": 4096
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.1891602382042474,
      "learning_rate": 9.321254162346089e-07,
      "loss": 0.2779,
      "step": 4128
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.6066027745264384,
      "learning_rate": 9.304066719515013e-07,
      "loss": 0.2831,
      "step": 4160
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.3535261095585958,
      "learning_rate": 9.286680634729005e-07,
      "loss": 0.2869,
      "step": 4192
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.449372671449822,
      "learning_rate": 9.269096710393343e-07,
      "loss": 0.2859,
      "step": 4224
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.2209167474794183,
      "learning_rate": 9.251315758044032e-07,
      "loss": 0.2908,
      "step": 4256
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.4925119088076193,
      "learning_rate": 9.233338598310343e-07,
      "loss": 0.2821,
      "step": 4288
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.2663637068625269,
      "learning_rate": 9.215166060876943e-07,
      "loss": 0.2839,
      "step": 4320
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.335605475288019,
      "learning_rate": 9.196798984445595e-07,
      "loss": 0.2767,
      "step": 4352
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.1999064453898192,
      "learning_rate": 9.178238216696463e-07,
      "loss": 0.2813,
      "step": 4384
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.2005680882302527,
      "learning_rate": 9.159484614248978e-07,
      "loss": 0.2776,
      "step": 4416
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.2022881962586156,
      "learning_rate": 9.140539042622311e-07,
      "loss": 0.2812,
      "step": 4448
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.4918926856382582,
      "learning_rate": 9.121402376195421e-07,
      "loss": 0.2763,
      "step": 4480
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.452212648334464,
      "learning_rate": 9.102075498166705e-07,
      "loss": 0.2756,
      "step": 4512
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.4610868927687688,
      "learning_rate": 9.082559300513237e-07,
      "loss": 0.2755,
      "step": 4544
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.2140564103776683,
      "learning_rate": 9.062854683949602e-07,
      "loss": 0.266,
      "step": 4576
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.23274371490023,
      "learning_rate": 9.042962557886313e-07,
      "loss": 0.2856,
      "step": 4608
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.3898063562012308,
      "learning_rate": 9.022883840387865e-07,
      "loss": 0.2815,
      "step": 4640
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.330139097581169,
      "learning_rate": 9.002619458130339e-07,
      "loss": 0.28,
      "step": 4672
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.2874922030319198,
      "learning_rate": 8.982170346358651e-07,
      "loss": 0.2669,
      "step": 4704
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.107586259057668,
      "learning_rate": 8.961537448843377e-07,
      "loss": 0.2853,
      "step": 4736
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.448957134616539,
      "learning_rate": 8.940721717837205e-07,
      "loss": 0.2652,
      "step": 4768
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.210724298652212,
      "learning_rate": 8.919724114030984e-07,
      "loss": 0.2795,
      "step": 4800
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.3698632505550181,
      "learning_rate": 8.898545606509378e-07,
      "loss": 0.284,
      "step": 4832
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.1911981203106947,
      "learning_rate": 8.877187172706155e-07,
      "loss": 0.2746,
      "step": 4864
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.3333516374957606,
      "learning_rate": 8.855649798359064e-07,
      "loss": 0.2888,
      "step": 4896
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.2351888608354114,
      "learning_rate": 8.833934477464347e-07,
      "loss": 0.2831,
      "step": 4928
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.3038819344536112,
      "learning_rate": 8.812042212230862e-07,
      "loss": 0.2753,
      "step": 4960
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.5707271650867898,
      "learning_rate": 8.789974013033831e-07,
      "loss": 0.2759,
      "step": 4992
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.2820047239265864,
      "learning_rate": 8.767730898368208e-07,
      "loss": 0.2746,
      "step": 5024
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.1744722963290781,
      "learning_rate": 8.745313894801669e-07,
      "loss": 0.291,
      "step": 5056
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.0913528023957022,
      "learning_rate": 8.722724036927242e-07,
      "loss": 0.2837,
      "step": 5088
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.3404827415629068,
      "learning_rate": 8.699962367315552e-07,
      "loss": 0.2676,
      "step": 5120
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.163041143181648,
      "learning_rate": 8.677029936466706e-07,
      "loss": 0.2832,
      "step": 5152
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.436779107568984,
      "learning_rate": 8.653927802761809e-07,
      "loss": 0.2728,
      "step": 5184
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.0080451953033644,
      "learning_rate": 8.630657032414121e-07,
      "loss": 0.2617,
      "step": 5216
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.1100819749235487,
      "learning_rate": 8.607218699419843e-07,
      "loss": 0.2942,
      "step": 5248
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.2992077116418177,
      "learning_rate": 8.583613885508556e-07,
      "loss": 0.2672,
      "step": 5280
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.223919929869006,
      "learning_rate": 8.559843680093296e-07,
      "loss": 0.277,
      "step": 5312
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.1391386556761285,
      "learning_rate": 8.535909180220268e-07,
      "loss": 0.2806,
      "step": 5344
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.2127293368764673,
      "learning_rate": 8.511811490518225e-07,
      "loss": 0.2619,
      "step": 5376
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.140029182839281,
      "learning_rate": 8.48755172314748e-07,
      "loss": 0.2718,
      "step": 5408
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.4671327877043736,
      "learning_rate": 8.463130997748578e-07,
      "loss": 0.272,
      "step": 5440
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.1390017949538402,
      "learning_rate": 8.43855044139063e-07,
      "loss": 0.2737,
      "step": 5472
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.2142876905398077,
      "learning_rate": 8.413811188519282e-07,
      "loss": 0.2599,
      "step": 5504
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.266246650515794,
      "learning_rate": 8.388914380904373e-07,
      "loss": 0.2465,
      "step": 5536
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.173280963516161,
      "learning_rate": 8.36386116758723e-07,
      "loss": 0.2483,
      "step": 5568
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.1617564369184084,
      "learning_rate": 8.33865270482764e-07,
      "loss": 0.2568,
      "step": 5600
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.420351934994977,
      "learning_rate": 8.313290156050487e-07,
      "loss": 0.2502,
      "step": 5632
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.3547439002353128,
      "learning_rate": 8.287774691792057e-07,
      "loss": 0.246,
      "step": 5664
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.1455753710552021,
      "learning_rate": 8.262107489646014e-07,
      "loss": 0.2554,
      "step": 5696
    },
    {
      "epoch": 1.05,
      "grad_norm": 1.3859247572490137,
      "learning_rate": 8.236289734209054e-07,
      "loss": 0.2466,
      "step": 5728
    },
    {
      "epoch": 1.05,
      "grad_norm": 1.1343048895326515,
      "learning_rate": 8.210322617026232e-07,
      "loss": 0.2467,
      "step": 5760
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.2217575468560562,
      "learning_rate": 8.184207336535967e-07,
      "loss": 0.2574,
      "step": 5792
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.3780997994944977,
      "learning_rate": 8.157945098014733e-07,
      "loss": 0.2481,
      "step": 5824
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.0870131072717084,
      "learning_rate": 8.131537113521443e-07,
      "loss": 0.2588,
      "step": 5856
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.0490087402219708,
      "learning_rate": 8.104984601841489e-07,
      "loss": 0.246,
      "step": 5888
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.2965280317141281,
      "learning_rate": 8.07828878843051e-07,
      "loss": 0.249,
      "step": 5920
    },
    {
      "epoch": 1.09,
      "grad_norm": 1.2861279001372838,
      "learning_rate": 8.051450905357829e-07,
      "loss": 0.2412,
      "step": 5952
    },
    {
      "epoch": 1.09,
      "grad_norm": 1.409445153252456,
      "learning_rate": 8.024472191249588e-07,
      "loss": 0.2431,
      "step": 5984
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.3691683733505722,
      "learning_rate": 7.997353891231584e-07,
      "loss": 0.2517,
      "step": 6016
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.4906649500848184,
      "learning_rate": 7.970097256871811e-07,
      "loss": 0.2517,
      "step": 6048
    },
    {
      "epoch": 1.11,
      "grad_norm": 1.4438858223954047,
      "learning_rate": 7.942703546122682e-07,
      "loss": 0.2584,
      "step": 6080
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.268958450317541,
      "learning_rate": 7.915174023262988e-07,
      "loss": 0.2522,
      "step": 6112
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.4781129605275483,
      "learning_rate": 7.887509958839538e-07,
      "loss": 0.243,
      "step": 6144
    },
    {
      "epoch": 1.13,
      "grad_norm": 1.331949493045608,
      "learning_rate": 7.859712629608524e-07,
      "loss": 0.2429,
      "step": 6176
    },
    {
      "epoch": 1.13,
      "grad_norm": 1.306062446109076,
      "learning_rate": 7.831783318476596e-07,
      "loss": 0.2446,
      "step": 6208
    },
    {
      "epoch": 1.14,
      "grad_norm": 1.242511236714611,
      "learning_rate": 7.803723314441656e-07,
      "loss": 0.2451,
      "step": 6240
    },
    {
      "epoch": 1.14,
      "grad_norm": 1.4482758408678764,
      "learning_rate": 7.775533912533363e-07,
      "loss": 0.2412,
      "step": 6272
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.3719989834400954,
      "learning_rate": 7.747216413753366e-07,
      "loss": 0.2421,
      "step": 6304
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.1916507119961282,
      "learning_rate": 7.718772125015263e-07,
      "loss": 0.2536,
      "step": 6336
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.2009787273487618,
      "learning_rate": 7.690202359084278e-07,
      "loss": 0.2501,
      "step": 6368
    },
    {
      "epoch": 1.17,
      "grad_norm": 1.2823116236203336,
      "learning_rate": 7.661508434516682e-07,
      "loss": 0.2524,
      "step": 6400
    },
    {
      "epoch": 1.17,
      "grad_norm": 1.3430035950312693,
      "learning_rate": 7.632691675598935e-07,
      "loss": 0.2577,
      "step": 6432
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.4985924089749454,
      "learning_rate": 7.60375341228656e-07,
      "loss": 0.2444,
      "step": 6464
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.2984272436604025,
      "learning_rate": 7.574694980142779e-07,
      "loss": 0.2615,
      "step": 6496
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.3354550623218928,
      "learning_rate": 7.545517720276857e-07,
      "loss": 0.2389,
      "step": 6528
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.4301768294716657,
      "learning_rate": 7.51622297928222e-07,
      "loss": 0.2449,
      "step": 6560
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.1719113606920948,
      "learning_rate": 7.4868121091743e-07,
      "loss": 0.2423,
      "step": 6592
    },
    {
      "epoch": 1.21,
      "grad_norm": 1.4204627356681534,
      "learning_rate": 7.457286467328135e-07,
      "loss": 0.2445,
      "step": 6624
    },
    {
      "epoch": 1.21,
      "grad_norm": 1.4900172188399152,
      "learning_rate": 7.427647416415725e-07,
      "loss": 0.2483,
      "step": 6656
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.3961859599185324,
      "learning_rate": 7.397896324343151e-07,
      "loss": 0.2359,
      "step": 6688
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.3110300825940397,
      "learning_rate": 7.368034564187424e-07,
      "loss": 0.2512,
      "step": 6720
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.2757069881397953,
      "learning_rate": 7.338063514133136e-07,
      "loss": 0.2547,
      "step": 6752
    },
    {
      "epoch": 1.24,
      "grad_norm": 1.4074893254437457,
      "learning_rate": 7.307984557408837e-07,
      "loss": 0.2496,
      "step": 6784
    },
    {
      "epoch": 1.24,
      "grad_norm": 1.351839197713545,
      "learning_rate": 7.277799082223204e-07,
      "loss": 0.2353,
      "step": 6816
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.2030842443572731,
      "learning_rate": 7.24750848170097e-07,
      "loss": 0.2559,
      "step": 6848
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.324374188137303,
      "learning_rate": 7.217114153818629e-07,
      "loss": 0.2448,
      "step": 6880
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.2343518311233508,
      "learning_rate": 7.186617501339917e-07,
      "loss": 0.2475,
      "step": 6912
    },
    {
      "epoch": 1.27,
      "grad_norm": 1.257477506253048,
      "learning_rate": 7.156019931751071e-07,
      "loss": 0.2529,
      "step": 6944
    },
    {
      "epoch": 1.27,
      "grad_norm": 1.3065578571620988,
      "learning_rate": 7.125322857195867e-07,
      "loss": 0.2439,
      "step": 6976
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.491754680548748,
      "learning_rate": 7.094527694410455e-07,
      "loss": 0.2581,
      "step": 7008
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.2313699652632502,
      "learning_rate": 7.063635864657964e-07,
      "loss": 0.2462,
      "step": 7040
    },
    {
      "epoch": 1.29,
      "grad_norm": 1.200487097478514,
      "learning_rate": 7.032648793662912e-07,
      "loss": 0.2422,
      "step": 7072
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.3322942461823077,
      "learning_rate": 7.001567911545406e-07,
      "loss": 0.2456,
      "step": 7104
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.4431169758144065,
      "learning_rate": 6.970394652755144e-07,
      "loss": 0.2482,
      "step": 7136
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.261334251262187,
      "learning_rate": 6.939130456005196e-07,
      "loss": 0.2456,
      "step": 7168
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.4803616635583337,
      "learning_rate": 6.907776764205622e-07,
      "loss": 0.2628,
      "step": 7200
    },
    {
      "epoch": 1.32,
      "grad_norm": 1.318319361538514,
      "learning_rate": 6.876335024396871e-07,
      "loss": 0.2408,
      "step": 7232
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.2799133546143286,
      "learning_rate": 6.844806687682996e-07,
      "loss": 0.2445,
      "step": 7264
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.2877655780208916,
      "learning_rate": 6.813193209164683e-07,
      "loss": 0.2406,
      "step": 7296
    },
    {
      "epoch": 1.34,
      "grad_norm": 1.344111445915826,
      "learning_rate": 6.781496047872098e-07,
      "loss": 0.2408,
      "step": 7328
    },
    {
      "epoch": 1.34,
      "grad_norm": 1.5226804349220626,
      "learning_rate": 6.749716666697545e-07,
      "loss": 0.2469,
      "step": 7360
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.3170301342228112,
      "learning_rate": 6.717856532327956e-07,
      "loss": 0.2407,
      "step": 7392
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.185336007297627,
      "learning_rate": 6.685917115177193e-07,
      "loss": 0.2445,
      "step": 7424
    },
    {
      "epoch": 1.36,
      "grad_norm": 1.3491930633425604,
      "learning_rate": 6.653899889318192e-07,
      "loss": 0.2562,
      "step": 7456
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.160879383772545,
      "learning_rate": 6.621806332414925e-07,
      "loss": 0.2427,
      "step": 7488
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.5333857134961049,
      "learning_rate": 6.589637925654215e-07,
      "loss": 0.251,
      "step": 7520
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.5263236812468681,
      "learning_rate": 6.557396153677356e-07,
      "loss": 0.2543,
      "step": 7552
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.0995345813700304,
      "learning_rate": 6.525082504511612e-07,
      "loss": 0.2493,
      "step": 7584
    },
    {
      "epoch": 1.39,
      "grad_norm": 1.4740647676869196,
      "learning_rate": 6.492698469501532e-07,
      "loss": 0.2414,
      "step": 7616
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.4036985523166017,
      "learning_rate": 6.460245543240123e-07,
      "loss": 0.2373,
      "step": 7648
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.4125143998119496,
      "learning_rate": 6.427725223499871e-07,
      "loss": 0.2427,
      "step": 7680
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.195620693310162,
      "learning_rate": 6.39513901116362e-07,
      "loss": 0.2522,
      "step": 7712
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.2679060414661505,
      "learning_rate": 6.362488410155298e-07,
      "loss": 0.2448,
      "step": 7744
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.4675997129467317,
      "learning_rate": 6.329774927370504e-07,
      "loss": 0.2459,
      "step": 7776
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.3511806562135458,
      "learning_rate": 6.297000072606972e-07,
      "loss": 0.257,
      "step": 7808
    },
    {
      "epoch": 1.43,
      "grad_norm": 1.272848715104825,
      "learning_rate": 6.264165358494884e-07,
      "loss": 0.246,
      "step": 7840
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.4081103274783833,
      "learning_rate": 6.231272300427059e-07,
      "loss": 0.2485,
      "step": 7872
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.2073750932804077,
      "learning_rate": 6.198322416489016e-07,
      "loss": 0.2354,
      "step": 7904
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.3509938246652688,
      "learning_rate": 6.165317227388911e-07,
      "loss": 0.2423,
      "step": 7936
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.8975854309012892,
      "learning_rate": 6.132258256387348e-07,
      "loss": 0.2422,
      "step": 7968
    },
    {
      "epoch": 1.46,
      "grad_norm": 1.616263805539082,
      "learning_rate": 6.099147029227088e-07,
      "loss": 0.2452,
      "step": 8000
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.4581239370531887,
      "learning_rate": 6.065985074062624e-07,
      "loss": 0.2564,
      "step": 8032
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.4107792940118344,
      "learning_rate": 6.032773921389654e-07,
      "loss": 0.2613,
      "step": 8064
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.4254738015511745,
      "learning_rate": 5.999515103974447e-07,
      "loss": 0.2441,
      "step": 8096
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.2694180719111068,
      "learning_rate": 5.966210156783108e-07,
      "loss": 0.2559,
      "step": 8128
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.3375582532247023,
      "learning_rate": 5.932860616910721e-07,
      "loss": 0.2488,
      "step": 8160
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.2586037041521065,
      "learning_rate": 5.899468023510428e-07,
      "loss": 0.2427,
      "step": 8192
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.2896167720703278,
      "learning_rate": 5.866033917722379e-07,
      "loss": 0.2567,
      "step": 8224
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.101067530433089,
      "learning_rate": 5.832559842602608e-07,
      "loss": 0.2342,
      "step": 8256
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.1591181439961709,
      "learning_rate": 5.799047343051827e-07,
      "loss": 0.2411,
      "step": 8288
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.2887740481533372,
      "learning_rate": 5.765497965744111e-07,
      "loss": 0.2433,
      "step": 8320
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.1116096883871758,
      "learning_rate": 5.731913259055527e-07,
      "loss": 0.2435,
      "step": 8352
    },
    {
      "epoch": 1.53,
      "grad_norm": 1.3978304499588927,
      "learning_rate": 5.698294772992669e-07,
      "loss": 0.2485,
      "step": 8384
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.1353060264434358,
      "learning_rate": 5.664644059121121e-07,
      "loss": 0.2508,
      "step": 8416
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.337326494552789,
      "learning_rate": 5.630962670493848e-07,
      "loss": 0.2446,
      "step": 8448
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.5497857023189168,
      "learning_rate": 5.597252161579525e-07,
      "loss": 0.2498,
      "step": 8480
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.6735926212275043,
      "learning_rate": 5.563514088190788e-07,
      "loss": 0.2382,
      "step": 8512
    },
    {
      "epoch": 1.56,
      "grad_norm": 1.5802941502096313,
      "learning_rate": 5.529750007412435e-07,
      "loss": 0.24,
      "step": 8544
    },
    {
      "epoch": 1.56,
      "grad_norm": 1.434775976642666,
      "learning_rate": 5.495961477529559e-07,
      "loss": 0.2508,
      "step": 8576
    },
    {
      "epoch": 1.57,
      "grad_norm": 1.3871703235640929,
      "learning_rate": 5.462150057955633e-07,
      "loss": 0.24,
      "step": 8608
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.3448291807441222,
      "learning_rate": 5.428317309160538e-07,
      "loss": 0.2484,
      "step": 8640
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.3157038644549268,
      "learning_rate": 5.394464792598545e-07,
      "loss": 0.2409,
      "step": 8672
    },
    {
      "epoch": 1.59,
      "grad_norm": 1.2008640772377304,
      "learning_rate": 5.360594070636248e-07,
      "loss": 0.2454,
      "step": 8704
    },
    {
      "epoch": 1.59,
      "grad_norm": 1.3073489027432128,
      "learning_rate": 5.326706706480467e-07,
      "loss": 0.2446,
      "step": 8736
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.5955373808274311,
      "learning_rate": 5.292804264106083e-07,
      "loss": 0.2433,
      "step": 8768
    },
    {
      "epoch": 1.61,
      "grad_norm": 1.2901907823242207,
      "learning_rate": 5.25888830818388e-07,
      "loss": 0.2426,
      "step": 8800
    },
    {
      "epoch": 1.61,
      "grad_norm": 1.4454540634426136,
      "learning_rate": 5.224960404008319e-07,
      "loss": 0.2487,
      "step": 8832
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.3252857725269103,
      "learning_rate": 5.19102211742529e-07,
      "loss": 0.242,
      "step": 8864
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.3963560963709287,
      "learning_rate": 5.157075014759866e-07,
      "loss": 0.2333,
      "step": 8896
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.7474842902023573,
      "learning_rate": 5.123120662743993e-07,
      "loss": 0.2397,
      "step": 8928
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.463552241271405,
      "learning_rate": 5.089160628444193e-07,
      "loss": 0.2475,
      "step": 8960
    },
    {
      "epoch": 1.64,
      "grad_norm": 1.4711904387458636,
      "learning_rate": 5.055196479189237e-07,
      "loss": 0.2468,
      "step": 8992
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.1486741329450094,
      "learning_rate": 5.021229782497811e-07,
      "loss": 0.2474,
      "step": 9024
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.465014768579321,
      "learning_rate": 4.987262106006171e-07,
      "loss": 0.2437,
      "step": 9056
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.6572146980799467,
      "learning_rate": 4.953295017395788e-07,
      "loss": 0.2358,
      "step": 9088
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.1497400773353195,
      "learning_rate": 4.919330084321009e-07,
      "loss": 0.2459,
      "step": 9120
    },
    {
      "epoch": 1.67,
      "grad_norm": 1.4973289494569486,
      "learning_rate": 4.885368874336694e-07,
      "loss": 0.2497,
      "step": 9152
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.375218554438502,
      "learning_rate": 4.851412954825874e-07,
      "loss": 0.2508,
      "step": 9184
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.3780517501267755,
      "learning_rate": 4.817463892927411e-07,
      "loss": 0.2567,
      "step": 9216
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.4504412670024611,
      "learning_rate": 4.783523255463679e-07,
      "loss": 0.2409,
      "step": 9248
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.2870854505712077,
      "learning_rate": 4.7495926088682436e-07,
      "loss": 0.2423,
      "step": 9280
    },
    {
      "epoch": 1.7,
      "grad_norm": 1.3985153319949002,
      "learning_rate": 4.71567351911357e-07,
      "loss": 0.251,
      "step": 9312
    },
    {
      "epoch": 1.7,
      "grad_norm": 1.0349716694291753,
      "learning_rate": 4.681767551638751e-07,
      "loss": 0.2404,
      "step": 9344
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.428114124678113,
      "learning_rate": 4.647876271277257e-07,
      "loss": 0.2391,
      "step": 9376
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.3200951440851763,
      "learning_rate": 4.6140012421847136e-07,
      "loss": 0.2535,
      "step": 9408
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.3868492152964986,
      "learning_rate": 4.5801440277667235e-07,
      "loss": 0.2483,
      "step": 9440
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.2729900579595428,
      "learning_rate": 4.5463061906066965e-07,
      "loss": 0.2512,
      "step": 9472
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.315198823460812,
      "learning_rate": 4.5124892923937416e-07,
      "loss": 0.2417,
      "step": 9504
    },
    {
      "epoch": 1.74,
      "grad_norm": 1.7120982404617975,
      "learning_rate": 4.478694893850593e-07,
      "loss": 0.2362,
      "step": 9536
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.412290074923546,
      "learning_rate": 4.4449245546615753e-07,
      "loss": 0.2483,
      "step": 9568
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.2470698026793279,
      "learning_rate": 4.411179833400619e-07,
      "loss": 0.2432,
      "step": 9600
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.268429893461802,
      "learning_rate": 4.377462287459337e-07,
      "loss": 0.2428,
      "step": 9632
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.2455579800684395,
      "learning_rate": 4.343773472975139e-07,
      "loss": 0.2508,
      "step": 9664
    },
    {
      "epoch": 1.77,
      "grad_norm": 1.228328713009981,
      "learning_rate": 4.3101149447594163e-07,
      "loss": 0.2551,
      "step": 9696
    },
    {
      "epoch": 1.77,
      "grad_norm": 1.2042503922389745,
      "learning_rate": 4.276488256225787e-07,
      "loss": 0.2541,
      "step": 9728
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.2346699498129847,
      "learning_rate": 4.242894959318395e-07,
      "loss": 0.2461,
      "step": 9760
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.3489561185627454,
      "learning_rate": 4.209336604440294e-07,
      "loss": 0.2477,
      "step": 9792
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.6713827739937024,
      "learning_rate": 4.175814740381879e-07,
      "loss": 0.2458,
      "step": 9824
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.3796544902936816,
      "learning_rate": 4.1423309142494236e-07,
      "loss": 0.2517,
      "step": 9856
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.6289046211602507,
      "learning_rate": 4.1088866713936633e-07,
      "loss": 0.2411,
      "step": 9888
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.4906278025262827,
      "learning_rate": 4.0754835553384864e-07,
      "loss": 0.2425,
      "step": 9920
    },
    {
      "epoch": 1.82,
      "grad_norm": 1.1884900865492198,
      "learning_rate": 4.0421231077096844e-07,
      "loss": 0.2587,
      "step": 9952
    },
    {
      "epoch": 1.82,
      "grad_norm": 1.329339394262938,
      "learning_rate": 4.008806868163814e-07,
      "loss": 0.2391,
      "step": 9984
    },
    {
      "epoch": 1.83,
      "grad_norm": 1.2186411235918218,
      "learning_rate": 3.9755363743171265e-07,
      "loss": 0.253,
      "step": 10016
    },
    {
      "epoch": 1.83,
      "grad_norm": 2.0559660633265047,
      "learning_rate": 3.9423131616746187e-07,
      "loss": 0.2566,
      "step": 10048
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.255138693515569,
      "learning_rate": 3.9091387635591533e-07,
      "loss": 0.2404,
      "step": 10080
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.5524707973995229,
      "learning_rate": 3.8760147110406995e-07,
      "loss": 0.2519,
      "step": 10112
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.2244273291645265,
      "learning_rate": 3.8429425328656674e-07,
      "loss": 0.2388,
      "step": 10144
    },
    {
      "epoch": 1.86,
      "grad_norm": 1.438386617275495,
      "learning_rate": 3.809923755386355e-07,
      "loss": 0.25,
      "step": 10176
    },
    {
      "epoch": 1.86,
      "grad_norm": 1.5262712808405177,
      "learning_rate": 3.7769599024905004e-07,
      "loss": 0.2299,
      "step": 10208
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.0721875435698476,
      "learning_rate": 3.744052495530959e-07,
      "loss": 0.2375,
      "step": 10240
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.1626282358875817,
      "learning_rate": 3.7112030532554806e-07,
      "loss": 0.2497,
      "step": 10272
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.3954184625846942,
      "learning_rate": 3.6784130917366195e-07,
      "loss": 0.242,
      "step": 10304
    },
    {
      "epoch": 1.89,
      "grad_norm": 1.3259939974154864,
      "learning_rate": 3.64568412430177e-07,
      "loss": 0.2516,
      "step": 10336
    },
    {
      "epoch": 1.89,
      "grad_norm": 1.73086784272557,
      "learning_rate": 3.613017661463313e-07,
      "loss": 0.2483,
      "step": 10368
    },
    {
      "epoch": 1.9,
      "grad_norm": 1.188961288223144,
      "learning_rate": 3.5804152108489065e-07,
      "loss": 0.2391,
      "step": 10400
    },
    {
      "epoch": 1.9,
      "grad_norm": 1.1246493355621157,
      "learning_rate": 3.547878277131913e-07,
      "loss": 0.2383,
      "step": 10432
    },
    {
      "epoch": 1.91,
      "grad_norm": 1.059964023169643,
      "learning_rate": 3.515408361961941e-07,
      "loss": 0.2404,
      "step": 10464
    },
    {
      "epoch": 1.91,
      "grad_norm": 1.70561149747635,
      "learning_rate": 3.483006963895555e-07,
      "loss": 0.2452,
      "step": 10496
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.2969563700400484,
      "learning_rate": 3.4506755783271045e-07,
      "loss": 0.2324,
      "step": 10528
    },
    {
      "epoch": 1.93,
      "grad_norm": 1.2846755096320104,
      "learning_rate": 3.418415697419712e-07,
      "loss": 0.244,
      "step": 10560
    },
    {
      "epoch": 1.93,
      "grad_norm": 1.7188757741977305,
      "learning_rate": 3.386228810036408e-07,
      "loss": 0.2369,
      "step": 10592
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.362077199933714,
      "learning_rate": 3.3541164016714065e-07,
      "loss": 0.2458,
      "step": 10624
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.354641295840275,
      "learning_rate": 3.3220799543815634e-07,
      "loss": 0.2397,
      "step": 10656
    },
    {
      "epoch": 1.95,
      "grad_norm": 1.3277536016021265,
      "learning_rate": 3.2901209467179637e-07,
      "loss": 0.2458,
      "step": 10688
    },
    {
      "epoch": 1.96,
      "grad_norm": 1.3886060047726545,
      "learning_rate": 3.2582408536576877e-07,
      "loss": 0.2344,
      "step": 10720
    },
    {
      "epoch": 1.96,
      "grad_norm": 1.3601591408784948,
      "learning_rate": 3.2264411465357335e-07,
      "loss": 0.2343,
      "step": 10752
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.1017573737076742,
      "learning_rate": 3.194723292977123e-07,
      "loss": 0.2442,
      "step": 10784
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.4213916209193176,
      "learning_rate": 3.1630887568291465e-07,
      "loss": 0.2364,
      "step": 10816
    },
    {
      "epoch": 1.98,
      "grad_norm": 1.3035008616357522,
      "learning_rate": 3.131538998093828e-07,
      "loss": 0.2458,
      "step": 10848
    },
    {
      "epoch": 1.99,
      "grad_norm": 1.218412032577656,
      "learning_rate": 3.1000754728605256e-07,
      "loss": 0.2413,
      "step": 10880
    },
    {
      "epoch": 1.99,
      "grad_norm": 1.2325820939927188,
      "learning_rate": 3.068699633238738e-07,
      "loss": 0.2511,
      "step": 10912
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.5401850018534633,
      "learning_rate": 3.037412927291083e-07,
      "loss": 0.2461,
      "step": 10944
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.3586152389713673,
      "learning_rate": 3.006216798966468e-07,
      "loss": 0.2358,
      "step": 10976
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.4011774529220307,
      "learning_rate": 2.975112688033442e-07,
      "loss": 0.2169,
      "step": 11008
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.2241897731555507,
      "learning_rate": 2.944102030013763e-07,
      "loss": 0.2108,
      "step": 11040
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.4027474158874935,
      "learning_rate": 2.9131862561161293e-07,
      "loss": 0.2293,
      "step": 11072
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.4670817911133367,
      "learning_rate": 2.882366793170133e-07,
      "loss": 0.2244,
      "step": 11104
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.3153219602513937,
      "learning_rate": 2.8516450635604086e-07,
      "loss": 0.22,
      "step": 11136
    },
    {
      "epoch": 2.04,
      "grad_norm": 1.2976761125367386,
      "learning_rate": 2.821022485160991e-07,
      "loss": 0.2097,
      "step": 11168
    },
| { | |
| "epoch": 2.04, | |
| "grad_norm": 1.3625162799781199, | |
| "learning_rate": 2.7905004712698645e-07, | |
| "loss": 0.2199, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.232550115651448, | |
| "learning_rate": 2.7600804305437474e-07, | |
| "loss": 0.2239, | |
| "step": 11232 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.4796643786771864, | |
| "learning_rate": 2.7297637669330847e-07, | |
| "loss": 0.222, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.2946736681294901, | |
| "learning_rate": 2.699551879617235e-07, | |
| "loss": 0.2246, | |
| "step": 11296 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.2068473533560813, | |
| "learning_rate": 2.669446162939909e-07, | |
| "loss": 0.2229, | |
| "step": 11328 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.4805315280685087, | |
| "learning_rate": 2.639448006344812e-07, | |
| "loss": 0.2131, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.1871137311489093, | |
| "learning_rate": 2.6095587943115227e-07, | |
| "loss": 0.2235, | |
| "step": 11392 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.088896036136479, | |
| "learning_rate": 2.57977990629159e-07, | |
| "loss": 0.2073, | |
| "step": 11424 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 1.2960712344910867, | |
| "learning_rate": 2.550112716644875e-07, | |
| "loss": 0.2197, | |
| "step": 11456 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.2204092718307542, | |
| "learning_rate": 2.520558594576115e-07, | |
| "loss": 0.2282, | |
| "step": 11488 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.2693877027082432, | |
| "learning_rate": 2.4911189040717357e-07, | |
| "loss": 0.225, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.2846435895566266, | |
| "learning_rate": 2.4617950038369035e-07, | |
| "loss": 0.2234, | |
| "step": 11552 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.2777271226205853, | |
| "learning_rate": 2.432588247232811e-07, | |
| "loss": 0.2211, | |
| "step": 11584 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.528372242529096, | |
| "learning_rate": 2.4034999822142166e-07, | |
| "loss": 0.22, | |
| "step": 11616 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.3339829276713877, | |
| "learning_rate": 2.3745315512672397e-07, | |
| "loss": 0.2195, | |
| "step": 11648 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.3101509979598953, | |
| "learning_rate": 2.345684291347403e-07, | |
| "loss": 0.2159, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 1.3071577674707573, | |
| "learning_rate": 2.3169595338179192e-07, | |
| "loss": 0.2295, | |
| "step": 11712 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 1.3186809416500118, | |
| "learning_rate": 2.2883586043882548e-07, | |
| "loss": 0.2133, | |
| "step": 11744 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 1.5867314938269408, | |
| "learning_rate": 2.259882823052941e-07, | |
| "loss": 0.2316, | |
| "step": 11776 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 1.1557431014242034, | |
| "learning_rate": 2.2315335040306554e-07, | |
| "loss": 0.2189, | |
| "step": 11808 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.2736258434323458, | |
| "learning_rate": 2.203311955703568e-07, | |
| "loss": 0.2199, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.302216300423092, | |
| "learning_rate": 2.1752194805569552e-07, | |
| "loss": 0.229, | |
| "step": 11872 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.3128201457087052, | |
| "learning_rate": 2.147257375119087e-07, | |
| "loss": 0.236, | |
| "step": 11904 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.3748881212985564, | |
| "learning_rate": 2.119426929901391e-07, | |
| "loss": 0.2206, | |
| "step": 11936 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 2.0921207372413435, | |
| "learning_rate": 2.0917294293388966e-07, | |
| "loss": 0.2117, | |
| "step": 11968 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 1.2949118526726222, | |
| "learning_rate": 2.0641661517309417e-07, | |
| "loss": 0.2135, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.3415735498836403, | |
| "learning_rate": 2.03673836918219e-07, | |
| "loss": 0.2259, | |
| "step": 12032 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.55429350265861, | |
| "learning_rate": 2.00944734754392e-07, | |
| "loss": 0.2279, | |
| "step": 12064 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.2935837103909433, | |
| "learning_rate": 1.9822943463555948e-07, | |
| "loss": 0.2308, | |
| "step": 12096 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.642310611966749, | |
| "learning_rate": 1.9552806187867372e-07, | |
| "loss": 0.2144, | |
| "step": 12128 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.2465970815408216, | |
| "learning_rate": 1.9284074115790932e-07, | |
| "loss": 0.2198, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.6894912323452993, | |
| "learning_rate": 1.9016759649890895e-07, | |
| "loss": 0.2147, | |
| "step": 12192 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.1571001402696899, | |
| "learning_rate": 1.8750875127305977e-07, | |
| "loss": 0.2149, | |
| "step": 12224 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.311070700271489, | |
| "learning_rate": 1.8486432819179892e-07, | |
| "loss": 0.2135, | |
| "step": 12256 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.358941411906666, | |
| "learning_rate": 1.822344493009509e-07, | |
| "loss": 0.2291, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.2572126595123219, | |
| "learning_rate": 1.7961923597509387e-07, | |
| "loss": 0.2253, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.4204877616087515, | |
| "learning_rate": 1.7701880891195942e-07, | |
| "loss": 0.212, | |
| "step": 12352 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.3082239260119037, | |
| "learning_rate": 1.744332881268607e-07, | |
| "loss": 0.2258, | |
| "step": 12384 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 1.3818606595114133, | |
| "learning_rate": 1.7186279294715382e-07, | |
| "loss": 0.2186, | |
| "step": 12416 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 1.344313917268325, | |
| "learning_rate": 1.6930744200673104e-07, | |
| "loss": 0.2309, | |
| "step": 12448 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.2849788562306779, | |
| "learning_rate": 1.667673532405457e-07, | |
| "loss": 0.2119, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.048703241818117, | |
| "learning_rate": 1.6424264387916842e-07, | |
| "loss": 0.2175, | |
| "step": 12512 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.643116827546219, | |
| "learning_rate": 1.6173343044337734e-07, | |
| "loss": 0.2264, | |
| "step": 12544 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.2754388516106783, | |
| "learning_rate": 1.5923982873878012e-07, | |
| "loss": 0.2296, | |
| "step": 12576 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.2217192268295975, | |
| "learning_rate": 1.567619538504696e-07, | |
| "loss": 0.2281, | |
| "step": 12608 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.4770776525365248, | |
| "learning_rate": 1.542999201377119e-07, | |
| "loss": 0.2277, | |
| "step": 12640 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.642064071179986, | |
| "learning_rate": 1.5185384122866897e-07, | |
| "loss": 0.218, | |
| "step": 12672 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.237204280837322, | |
| "learning_rate": 1.4942383001515386e-07, | |
| "loss": 0.2079, | |
| "step": 12704 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.1715182674707414, | |
| "learning_rate": 1.4700999864742136e-07, | |
| "loss": 0.2124, | |
| "step": 12736 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.770494939875621, | |
| "learning_rate": 1.4461245852899128e-07, | |
| "loss": 0.2046, | |
| "step": 12768 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.2181121499687988, | |
| "learning_rate": 1.4223132031150676e-07, | |
| "loss": 0.2179, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.2327845797316905, | |
| "learning_rate": 1.3986669388962819e-07, | |
| "loss": 0.2202, | |
| "step": 12832 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.5192364095540325, | |
| "learning_rate": 1.375186883959613e-07, | |
| "loss": 0.2187, | |
| "step": 12864 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.1447367117093976, | |
| "learning_rate": 1.3518741219601988e-07, | |
| "loss": 0.2267, | |
| "step": 12896 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 1.374708110513197, | |
| "learning_rate": 1.3287297288322458e-07, | |
| "loss": 0.2192, | |
| "step": 12928 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 1.1547706075025737, | |
| "learning_rate": 1.305754772739377e-07, | |
| "loss": 0.2101, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 1.350330342693555, | |
| "learning_rate": 1.2829503140253296e-07, | |
| "loss": 0.2169, | |
| "step": 12992 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.416668233872815, | |
| "learning_rate": 1.2603174051650207e-07, | |
| "loss": 0.2181, | |
| "step": 13024 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.2796831245124296, | |
| "learning_rate": 1.2378570907159697e-07, | |
| "loss": 0.2264, | |
| "step": 13056 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 1.5438983104227264, | |
| "learning_rate": 1.215570407270095e-07, | |
| "loss": 0.2233, | |
| "step": 13088 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 1.7803269270190754, | |
| "learning_rate": 1.1934583834058658e-07, | |
| "loss": 0.2241, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.5471141725466007, | |
| "learning_rate": 1.1715220396408426e-07, | |
| "loss": 0.2124, | |
| "step": 13152 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 1.1206628903783873, | |
| "learning_rate": 1.1497623883845614e-07, | |
| "loss": 0.2127, | |
| "step": 13184 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.5095477739496967, | |
| "learning_rate": 1.1281804338918239e-07, | |
| "loss": 0.2282, | |
| "step": 13216 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 1.3128197458740387, | |
| "learning_rate": 1.1067771722163404e-07, | |
| "loss": 0.2185, | |
| "step": 13248 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 1.2471858850247612, | |
| "learning_rate": 1.0855535911647673e-07, | |
| "loss": 0.2185, | |
| "step": 13280 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.655671236852773, | |
| "learning_rate": 1.064510670251108e-07, | |
| "loss": 0.2246, | |
| "step": 13312 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.2165519973679901, | |
| "learning_rate": 1.0436493806515128e-07, | |
| "loss": 0.223, | |
| "step": 13344 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.1834223459074134, | |
| "learning_rate": 1.0229706851594561e-07, | |
| "loss": 0.2188, | |
| "step": 13376 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 1.5035939672617133, | |
| "learning_rate": 1.0024755381412997e-07, | |
| "loss": 0.2204, | |
| "step": 13408 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 1.362765323113097, | |
| "learning_rate": 9.821648854922481e-08, | |
| "loss": 0.2165, | |
| "step": 13440 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.4329165569522946, | |
| "learning_rate": 9.620396645926937e-08, | |
| "loss": 0.2104, | |
| "step": 13472 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.2794493917175387, | |
| "learning_rate": 9.421008042649514e-08, | |
| "loss": 0.2225, | |
| "step": 13504 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.365469705607083, | |
| "learning_rate": 9.223492247303999e-08, | |
| "loss": 0.2166, | |
| "step": 13536 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.3658393916273586, | |
| "learning_rate": 9.027858375670005e-08, | |
| "loss": 0.2244, | |
| "step": 13568 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.6134388977737084, | |
| "learning_rate": 8.834115456672314e-08, | |
| "loss": 0.2238, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 1.2747001650429752, | |
| "learning_rate": 8.642272431964171e-08, | |
| "loss": 0.2117, | |
| "step": 13632 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 1.4495915191793898, | |
| "learning_rate": 8.452338155514644e-08, | |
| "loss": 0.2153, | |
| "step": 13664 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.704239796998396, | |
| "learning_rate": 8.264321393199902e-08, | |
| "loss": 0.214, | |
| "step": 13696 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.1540455435637955, | |
| "learning_rate": 8.078230822398735e-08, | |
| "loss": 0.2189, | |
| "step": 13728 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 1.4264481560916344, | |
| "learning_rate": 7.894075031592029e-08, | |
| "loss": 0.2142, | |
| "step": 13760 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 1.2771622178109003, | |
| "learning_rate": 7.711862519966416e-08, | |
| "loss": 0.2238, | |
| "step": 13792 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 1.4176109439678235, | |
| "learning_rate": 7.531601697022e-08, | |
| "loss": 0.2141, | |
| "step": 13824 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 1.211872373307708, | |
| "learning_rate": 7.353300882184238e-08, | |
| "loss": 0.2094, | |
| "step": 13856 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 1.6208090587491066, | |
| "learning_rate": 7.176968304420006e-08, | |
| "loss": 0.213, | |
| "step": 13888 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 1.4294182508163742, | |
| "learning_rate": 7.002612101857769e-08, | |
| "loss": 0.2206, | |
| "step": 13920 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.5238939549044208, | |
| "learning_rate": 6.830240321412067e-08, | |
| "loss": 0.2187, | |
| "step": 13952 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.4817796496161193, | |
| "learning_rate": 6.65986091841203e-08, | |
| "loss": 0.223, | |
| "step": 13984 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.4428543645745906, | |
| "learning_rate": 6.491481756234296e-08, | |
| "loss": 0.2178, | |
| "step": 14016 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.3986608111205634, | |
| "learning_rate": 6.32511060594007e-08, | |
| "loss": 0.2244, | |
| "step": 14048 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 1.1779478103336172, | |
| "learning_rate": 6.160755145916518e-08, | |
| "loss": 0.2326, | |
| "step": 14080 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 1.4661784823721757, | |
| "learning_rate": 5.99842296152231e-08, | |
| "loss": 0.2257, | |
| "step": 14112 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 1.3997989298044267, | |
| "learning_rate": 5.8381215447376006e-08, | |
| "loss": 0.218, | |
| "step": 14144 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 1.1966150734513934, | |
| "learning_rate": 5.679858293818257e-08, | |
| "loss": 0.2139, | |
| "step": 14176 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 1.2975301799355876, | |
| "learning_rate": 5.523640512954375e-08, | |
| "loss": 0.2298, | |
| "step": 14208 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.1520772784392557, | |
| "learning_rate": 5.369475411933222e-08, | |
| "loss": 0.2086, | |
| "step": 14240 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.4920063917973507, | |
| "learning_rate": 5.2173701058064454e-08, | |
| "loss": 0.215, | |
| "step": 14272 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 1.4418059613129284, | |
| "learning_rate": 5.067331614561726e-08, | |
| "loss": 0.2108, | |
| "step": 14304 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.122573143556837, | |
| "learning_rate": 4.9193668627988073e-08, | |
| "loss": 0.2156, | |
| "step": 14336 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.169213395405146, | |
| "learning_rate": 4.7734826794098624e-08, | |
| "loss": 0.2155, | |
| "step": 14368 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 1.4739891927117594, | |
| "learning_rate": 4.629685797264338e-08, | |
| "loss": 0.2197, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 1.1061529994285666, | |
| "learning_rate": 4.487982852898248e-08, | |
| "loss": 0.2161, | |
| "step": 14432 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.2665429377042383, | |
| "learning_rate": 4.348380386207884e-08, | |
| "loss": 0.2293, | |
| "step": 14464 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.4251977400136069, | |
| "learning_rate": 4.210884840147927e-08, | |
| "loss": 0.219, | |
| "step": 14496 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 1.3125295629014255, | |
| "learning_rate": 4.075502560434158e-08, | |
| "loss": 0.2215, | |
| "step": 14528 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.6114888474605462, | |
| "learning_rate": 3.942239795250546e-08, | |
| "loss": 0.2141, | |
| "step": 14560 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.3415946209746852, | |
| "learning_rate": 3.8111026949609045e-08, | |
| "loss": 0.2145, | |
| "step": 14592 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 1.5169187816349006, | |
| "learning_rate": 3.682097311825033e-08, | |
| "loss": 0.2241, | |
| "step": 14624 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 1.3176284198519965, | |
| "learning_rate": 3.5552295997193834e-08, | |
| "loss": 0.2238, | |
| "step": 14656 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 1.5611137018278904, | |
| "learning_rate": 3.430505413862284e-08, | |
| "loss": 0.2029, | |
| "step": 14688 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 1.4814939062116772, | |
| "learning_rate": 3.307930510543705e-08, | |
| "loss": 0.2073, | |
| "step": 14720 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 1.4585161010377092, | |
| "learning_rate": 3.187510546859623e-08, | |
| "loss": 0.2158, | |
| "step": 14752 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 1.3111001801067392, | |
| "learning_rate": 3.069251080450863e-08, | |
| "loss": 0.2114, | |
| "step": 14784 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 1.3748879882896838, | |
| "learning_rate": 2.953157569246656e-08, | |
| "loss": 0.2191, | |
| "step": 14816 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.2458399054987248, | |
| "learning_rate": 2.8392353712127615e-08, | |
| "loss": 0.2226, | |
| "step": 14848 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.183457902413645, | |
| "learning_rate": 2.7274897441041223e-08, | |
| "loss": 0.2276, | |
| "step": 14880 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 1.3996675489557702, | |
| "learning_rate": 2.6179258452222687e-08, | |
| "loss": 0.2223, | |
| "step": 14912 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 1.3797514575768821, | |
| "learning_rate": 2.5105487311772478e-08, | |
| "loss": 0.2346, | |
| "step": 14944 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 1.3453274139513625, | |
| "learning_rate": 2.4053633576542833e-08, | |
| "loss": 0.2148, | |
| "step": 14976 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.4296834231597417, | |
| "learning_rate": 2.3023745791850623e-08, | |
| "loss": 0.2252, | |
| "step": 15008 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.5820604044938296, | |
| "learning_rate": 2.201587148923656e-08, | |
| "loss": 0.2203, | |
| "step": 15040 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.3679161695840207, | |
| "learning_rate": 2.103005718427181e-08, | |
| "loss": 0.2096, | |
| "step": 15072 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 1.2454498587173481, | |
| "learning_rate": 2.0066348374411122e-08, | |
| "loss": 0.228, | |
| "step": 15104 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 1.094395180196629, | |
| "learning_rate": 1.912478953689306e-08, | |
| "loss": 0.2183, | |
| "step": 15136 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 1.2797588189588773, | |
| "learning_rate": 1.8205424126687187e-08, | |
| "loss": 0.2218, | |
| "step": 15168 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 1.4533835000058788, | |
| "learning_rate": 1.7308294574488347e-08, | |
| "loss": 0.2226, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.4492805126695452, | |
| "learning_rate": 1.6433442284758902e-08, | |
| "loss": 0.2274, | |
| "step": 15232 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.4350039162599688, | |
| "learning_rate": 1.5580907633817475e-08, | |
| "loss": 0.2282, | |
| "step": 15264 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 1.1810113931037964, | |
| "learning_rate": 1.475072996797555e-08, | |
| "loss": 0.2224, | |
| "step": 15296 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.1286561070757017, | |
| "learning_rate": 1.3942947601721488e-08, | |
| "loss": 0.215, | |
| "step": 15328 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.1871769686911617, | |
| "learning_rate": 1.3157597815952382e-08, | |
| "loss": 0.2214, | |
| "step": 15360 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.2452370435442608, | |
| "learning_rate": 1.2394716856253374e-08, | |
| "loss": 0.2122, | |
| "step": 15392 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.2518679025330413, | |
| "learning_rate": 1.1654339931224887e-08, | |
| "loss": 0.217, | |
| "step": 15424 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 1.252731324308369, | |
| "learning_rate": 1.0936501210857651e-08, | |
| "loss": 0.2317, | |
| "step": 15456 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.3386257390126015, | |
| "learning_rate": 1.0241233824955509e-08, | |
| "loss": 0.2256, | |
| "step": 15488 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.1313351017583362, | |
| "learning_rate": 9.568569861606868e-09, | |
| "loss": 0.2341, | |
| "step": 15520 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 1.4449940727667887, | |
| "learning_rate": 8.918540365703331e-09, | |
| "loss": 0.2166, | |
| "step": 15552 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 1.4530241900991612, | |
| "learning_rate": 8.291175337506894e-09, | |
| "loss": 0.2408, | |
| "step": 15584 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.2609793071770357, | |
| "learning_rate": 7.68650373126567e-09, | |
| "loss": 0.2046, | |
| "step": 15616 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.1344728672758062, | |
| "learning_rate": 7.10455345387756e-09, | |
| "loss": 0.2196, | |
| "step": 15648 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.5048112445610096, | |
| "learning_rate": 6.545351363601959e-09, | |
| "loss": 0.2214, | |
| "step": 15680 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 1.4050748695357576, | |
| "learning_rate": 6.00892326882052e-09, | |
| "loss": 0.2265, | |
| "step": 15712 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 1.1997477163250476, | |
| "learning_rate": 5.495293926845834e-09, | |
| "loss": 0.2163, | |
| "step": 15744 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.3144891186853807, | |
| "learning_rate": 5.00448704277906e-09, | |
| "loss": 0.224, | |
| "step": 15776 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.353851004486482, | |
| "learning_rate": 4.536525268415581e-09, | |
| "loss": 0.2127, | |
| "step": 15808 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 1.4192328708826423, | |
| "learning_rate": 4.091430201199841e-09, | |
| "loss": 0.221, | |
| "step": 15840 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.3274920629279923, | |
| "learning_rate": 3.6692223832283653e-09, | |
| "loss": 0.2137, | |
| "step": 15872 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.2096820045248131, | |
| "learning_rate": 3.2699213003019588e-09, | |
| "loss": 0.2164, | |
| "step": 15904 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 1.4949483678866275, | |
| "learning_rate": 2.8935453810260413e-09, | |
| "loss": 0.211, | |
| "step": 15936 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 1.4480264990764071, | |
| "learning_rate": 2.5401119959606033e-09, | |
| "loss": 0.2219, | |
| "step": 15968 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 1.3984591809802007, | |
| "learning_rate": 2.209637456817848e-09, | |
| "loss": 0.2187, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.2138235012481149, | |
| "learning_rate": 1.9021370157100703e-09, | |
| "loss": 0.2044, | |
| "step": 16032 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.4196275071259965, | |
| "learning_rate": 1.6176248644453883e-09, | |
| "loss": 0.2206, | |
| "step": 16064 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.4494340877185368, | |
| "learning_rate": 1.3561141338726545e-09, | |
| "loss": 0.2155, | |
| "step": 16096 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.3684818624338524, | |
| "learning_rate": 1.117616893275719e-09, | |
| "loss": 0.228, | |
| "step": 16128 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 4.300044658083074, | |
| "learning_rate": 9.02144149816153e-10, | |
| "loss": 0.2138, | |
| "step": 16160 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.4879340606023383, | |
| "learning_rate": 7.097058480255991e-10, | |
| "loss": 0.2135, | |
| "step": 16192 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.628757008031087, | |
| "learning_rate": 5.403108693462499e-10, | |
| "loss": 0.2169, | |
| "step": 16224 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 1.355596318247962, | |
| "learning_rate": 3.9396703172150936e-10, | |
| "loss": 0.212, | |
| "step": 16256 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 1.1638938575418796, | |
| "learning_rate": 2.706810892348921e-10, | |
| "loss": 0.2177, | |
| "step": 16288 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.2545697493468788, | |
| "learning_rate": 1.704587317983286e-10, | |
| "loss": 0.2226, | |
| "step": 16320 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.2958637026668016, | |
| "learning_rate": 9.330458488959746e-11, | |
| "loss": 0.2206, | |
| "step": 16352 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.1646489691129969, | |
| "learning_rate": 3.922220933882947e-11, | |
| "loss": 0.2219, | |
| "step": 16384 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.126290495132723, | |
| "learning_rate": 8.214101164305543e-12, | |
| "loss": 0.2393, | |
| "step": 16416 | |
| } | |
| ], | |
| "logging_steps": 32, | |
| "max_steps": 16443, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 3289, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
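
The dump above follows the schema of a Hugging Face `trainer_state.json`: a `log_history` array of per-interval records (one every `logging_steps` = 32 steps, each with `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`) followed by trainer metadata such as `max_steps` and `save_steps`. Below is a minimal sketch of how one might load and summarize such a file; it assumes the JSON has been saved to disk as `trainer_state.json` (the filename and the plotting choices are illustrative assumptions, not part of the log itself).

```python
# Minimal sketch: inspect a Hugging Face trainer_state.json log.
# Assumes the JSON dump above is saved as "trainer_state.json";
# filename and plot settings are illustrative, not from the log.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Each logged entry carries epoch, grad_norm, learning_rate, loss, step.
steps = [e["step"] for e in history if "loss" in e]
losses = [e["loss"] for e in history if "loss" in e]
lrs = [e["learning_rate"] for e in history if "learning_rate" in e]

print(f"{len(history)} log entries over {state['max_steps']} steps")
print(f"final logged loss: {losses[-1]:.4f} at step {steps[-1]}")

# Optional: plot the loss curve and the LR schedule side by side.
try:
    import matplotlib.pyplot as plt

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(steps, losses)
    ax1.set(xlabel="step", ylabel="loss", title="training loss")
    ax2.plot(steps, lrs)
    ax2.set(xlabel="step", ylabel="learning rate", title="LR schedule")
    fig.tight_layout()
    fig.savefig("trainer_state_curves.png")
except ImportError:
    pass  # matplotlib not installed; the printed summary still works
```

Run against this log, the summary should report 514 entries over 16443 steps, and the plots would show the loss settling near 0.21–0.24 in epochs 2–3 while the learning rate decays to effectively zero by the final step, consistent with the schedule visible in the records above.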