{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.2465230735200596,
  "global_step": 325000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "learning_rate": 1.993087621312246e-06,
      "loss": 0.2839,
      "step": 5000
    },
    {
      "epoch": 0.07,
      "learning_rate": 1.986175242624492e-06,
      "loss": 0.2584,
      "step": 10000
    },
    {
      "epoch": 0.1,
      "learning_rate": 1.979262863936738e-06,
      "loss": 0.2684,
      "step": 15000
    },
    {
      "epoch": 0.14,
      "learning_rate": 1.972350485248984e-06,
      "loss": 0.2725,
      "step": 20000
    },
    {
      "epoch": 0.17,
      "learning_rate": 1.96543810656123e-06,
      "loss": 0.2631,
      "step": 25000
    },
    {
      "epoch": 0.17,
      "eval_accuracy": 0.9506088495254517,
      "eval_loss": 0.2637203335762024,
      "eval_runtime": 690.6403,
      "eval_samples_per_second": 46.495,
      "eval_steps_per_second": 11.624,
      "step": 25000
    },
    {
      "epoch": 0.21,
      "learning_rate": 1.958525727873476e-06,
      "loss": 0.2808,
      "step": 30000
    },
    {
      "epoch": 0.24,
      "learning_rate": 1.951613349185722e-06,
      "loss": 0.2601,
      "step": 35000
    },
    {
      "epoch": 0.28,
      "learning_rate": 1.944700970497968e-06,
      "loss": 0.2746,
      "step": 40000
    },
    {
      "epoch": 0.31,
      "learning_rate": 1.937788591810214e-06,
      "loss": 0.2688,
      "step": 45000
    },
    {
      "epoch": 0.35,
      "learning_rate": 1.93087621312246e-06,
      "loss": 0.2619,
      "step": 50000
    },
    {
      "epoch": 0.35,
      "eval_accuracy": 0.9507022500038147,
      "eval_loss": 0.2595760226249695,
      "eval_runtime": 690.7368,
      "eval_samples_per_second": 46.488,
      "eval_steps_per_second": 11.622,
      "step": 50000
    },
    {
      "epoch": 0.38,
      "learning_rate": 1.9239638344347054e-06,
      "loss": 0.2765,
      "step": 55000
    },
    {
      "epoch": 0.41,
      "learning_rate": 1.9170514557469514e-06,
      "loss": 0.2736,
      "step": 60000
    },
    {
      "epoch": 0.45,
      "learning_rate": 1.9101390770591978e-06,
      "loss": 0.2616,
      "step": 65000
    },
    {
      "epoch": 0.48,
      "learning_rate": 1.9032266983714436e-06,
      "loss": 0.2558,
      "step": 70000
    },
    {
      "epoch": 0.52,
      "learning_rate": 1.8963143196836895e-06,
      "loss": 0.2653,
      "step": 75000
    },
    {
      "epoch": 0.52,
      "eval_accuracy": 0.9514496326446533,
      "eval_loss": 0.2495056539773941,
      "eval_runtime": 689.7436,
      "eval_samples_per_second": 46.555,
      "eval_steps_per_second": 11.639,
      "step": 75000
    },
    {
      "epoch": 0.55,
      "learning_rate": 1.8894019409959353e-06,
      "loss": 0.2594,
      "step": 80000
    },
    {
      "epoch": 0.59,
      "learning_rate": 1.8824895623081813e-06,
      "loss": 0.2678,
      "step": 85000
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.8755771836204273e-06,
      "loss": 0.2648,
      "step": 90000
    },
    {
      "epoch": 0.66,
      "learning_rate": 1.8686648049326735e-06,
      "loss": 0.2671,
      "step": 95000
    },
    {
      "epoch": 0.69,
      "learning_rate": 1.8617524262449193e-06,
      "loss": 0.262,
      "step": 100000
    },
    {
      "epoch": 0.69,
      "eval_accuracy": 0.9511382579803467,
      "eval_loss": 0.25494277477264404,
      "eval_runtime": 689.2307,
      "eval_samples_per_second": 46.59,
      "eval_steps_per_second": 11.648,
      "step": 100000
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.8548400475571653e-06,
      "loss": 0.273,
      "step": 105000
    },
    {
      "epoch": 0.76,
      "learning_rate": 1.8479276688694113e-06,
      "loss": 0.265,
      "step": 110000
    },
    {
      "epoch": 0.79,
      "learning_rate": 1.841015290181657e-06,
      "loss": 0.2677,
      "step": 115000
    },
    {
      "epoch": 0.83,
      "learning_rate": 1.8341029114939032e-06,
      "loss": 0.2644,
      "step": 120000
    },
    {
      "epoch": 0.86,
      "learning_rate": 1.8271905328061492e-06,
      "loss": 0.2667,
      "step": 125000
    },
    {
      "epoch": 0.86,
      "eval_accuracy": 0.9517922401428223,
      "eval_loss": 0.2526009976863861,
      "eval_runtime": 689.2471,
      "eval_samples_per_second": 46.589,
      "eval_steps_per_second": 11.647,
      "step": 125000
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.8202781541183952e-06,
      "loss": 0.262,
      "step": 130000
    },
    {
      "epoch": 0.93,
      "learning_rate": 1.813365775430641e-06,
      "loss": 0.2689,
      "step": 135000
    },
    {
      "epoch": 0.97,
      "learning_rate": 1.806453396742887e-06,
      "loss": 0.2713,
      "step": 140000
    },
    {
      "epoch": 1.0,
      "learning_rate": 1.7995410180551332e-06,
      "loss": 0.2522,
      "step": 145000
    },
    {
      "epoch": 1.04,
      "learning_rate": 1.7926286393673792e-06,
      "loss": 0.2425,
      "step": 150000
    },
    {
      "epoch": 1.04,
      "eval_accuracy": 0.9516676664352417,
      "eval_loss": 0.2566453516483307,
      "eval_runtime": 689.1693,
      "eval_samples_per_second": 46.594,
      "eval_steps_per_second": 11.649,
      "step": 150000
    },
    {
      "epoch": 1.07,
      "learning_rate": 1.785716260679625e-06,
      "loss": 0.2445,
      "step": 155000
    },
    {
      "epoch": 1.11,
      "learning_rate": 1.778803881991871e-06,
      "loss": 0.2353,
      "step": 160000
    },
    {
      "epoch": 1.14,
      "learning_rate": 1.771891503304117e-06,
      "loss": 0.2478,
      "step": 165000
    },
    {
      "epoch": 1.18,
      "learning_rate": 1.764979124616363e-06,
      "loss": 0.2461,
      "step": 170000
    },
    {
      "epoch": 1.21,
      "learning_rate": 1.758066745928609e-06,
      "loss": 0.2506,
      "step": 175000
    },
    {
      "epoch": 1.21,
      "eval_accuracy": 0.9521347880363464,
      "eval_loss": 0.25962916016578674,
      "eval_runtime": 689.0728,
      "eval_samples_per_second": 46.6,
      "eval_steps_per_second": 11.65,
      "step": 175000
    },
    {
      "epoch": 1.24,
      "learning_rate": 1.751154367240855e-06,
      "loss": 0.245,
      "step": 180000
    },
    {
      "epoch": 1.28,
      "learning_rate": 1.7442419885531009e-06,
      "loss": 0.2516,
      "step": 185000
    },
    {
      "epoch": 1.31,
      "learning_rate": 1.7373296098653467e-06,
      "loss": 0.2392,
      "step": 190000
    },
    {
      "epoch": 1.35,
      "learning_rate": 1.7304172311775929e-06,
      "loss": 0.2495,
      "step": 195000
    },
    {
      "epoch": 1.38,
      "learning_rate": 1.7235048524898389e-06,
      "loss": 0.2485,
      "step": 200000
    },
    {
      "epoch": 1.38,
      "eval_accuracy": 0.9533492922782898,
      "eval_loss": 0.2515329122543335,
      "eval_runtime": 689.5664,
      "eval_samples_per_second": 46.567,
      "eval_steps_per_second": 11.642,
      "step": 200000
    },
    {
      "epoch": 1.42,
      "learning_rate": 1.7165924738020846e-06,
      "loss": 0.2311,
      "step": 205000
    },
    {
      "epoch": 1.45,
      "learning_rate": 1.7096800951143306e-06,
      "loss": 0.2335,
      "step": 210000
    },
    {
      "epoch": 1.49,
      "learning_rate": 1.7027677164265766e-06,
      "loss": 0.2568,
      "step": 215000
    },
    {
      "epoch": 1.52,
      "learning_rate": 1.6958553377388226e-06,
      "loss": 0.2392,
      "step": 220000
    },
    {
      "epoch": 1.56,
      "learning_rate": 1.6889429590510686e-06,
      "loss": 0.2411,
      "step": 225000
    },
    {
      "epoch": 1.56,
      "eval_accuracy": 0.9527575969696045,
      "eval_loss": 0.25586625933647156,
      "eval_runtime": 688.9498,
      "eval_samples_per_second": 46.609,
      "eval_steps_per_second": 11.653,
      "step": 225000
    },
    {
      "epoch": 1.59,
      "learning_rate": 1.6820305803633146e-06,
      "loss": 0.2474,
      "step": 230000
    },
    {
      "epoch": 1.62,
      "learning_rate": 1.6751182016755606e-06,
      "loss": 0.2511,
      "step": 235000
    },
    {
      "epoch": 1.66,
      "learning_rate": 1.6682058229878063e-06,
      "loss": 0.2435,
      "step": 240000
    },
    {
      "epoch": 1.69,
      "learning_rate": 1.6612934443000523e-06,
      "loss": 0.2485,
      "step": 245000
    },
    {
      "epoch": 1.73,
      "learning_rate": 1.6543810656122985e-06,
      "loss": 0.234,
      "step": 250000
    },
    {
      "epoch": 1.73,
      "eval_accuracy": 0.9526330828666687,
      "eval_loss": 0.2587934136390686,
      "eval_runtime": 688.8864,
      "eval_samples_per_second": 46.613,
      "eval_steps_per_second": 11.654,
      "step": 250000
    },
    {
      "epoch": 1.76,
      "learning_rate": 1.6474686869245445e-06,
      "loss": 0.2534,
      "step": 255000
    },
    {
      "epoch": 1.8,
      "learning_rate": 1.6405563082367903e-06,
      "loss": 0.2432,
      "step": 260000
    },
    {
      "epoch": 1.83,
      "learning_rate": 1.6336439295490363e-06,
      "loss": 0.2466,
      "step": 265000
    },
    {
      "epoch": 1.87,
      "learning_rate": 1.6267315508612823e-06,
      "loss": 0.2393,
      "step": 270000
    },
    {
      "epoch": 1.9,
      "learning_rate": 1.6198191721735283e-06,
      "loss": 0.242,
      "step": 275000
    },
    {
      "epoch": 1.9,
      "eval_accuracy": 0.9526330828666687,
      "eval_loss": 0.2543439269065857,
      "eval_runtime": 688.3982,
      "eval_samples_per_second": 46.646,
      "eval_steps_per_second": 11.662,
      "step": 275000
    },
    {
      "epoch": 1.94,
      "learning_rate": 1.6129067934857743e-06,
      "loss": 0.2455,
      "step": 280000
    },
    {
      "epoch": 1.97,
      "learning_rate": 1.6059944147980203e-06,
      "loss": 0.2413,
      "step": 285000
    },
    {
      "epoch": 2.0,
      "learning_rate": 1.5990820361102662e-06,
      "loss": 0.241,
      "step": 290000
    },
    {
      "epoch": 2.04,
      "learning_rate": 1.592169657422512e-06,
      "loss": 0.2401,
      "step": 295000
    },
    {
      "epoch": 2.07,
      "learning_rate": 1.5852572787347582e-06,
      "loss": 0.217,
      "step": 300000
    },
    {
      "epoch": 2.07,
      "eval_accuracy": 0.9530379176139832,
      "eval_loss": 0.26496848464012146,
      "eval_runtime": 688.7791,
      "eval_samples_per_second": 46.62,
      "eval_steps_per_second": 11.655,
      "step": 300000
    },
    {
      "epoch": 2.11,
      "learning_rate": 1.5783449000470042e-06,
      "loss": 0.2281,
      "step": 305000
    },
    {
      "epoch": 2.14,
      "learning_rate": 1.57143252135925e-06,
      "loss": 0.2203,
      "step": 310000
    },
    {
      "epoch": 2.18,
      "learning_rate": 1.564520142671496e-06,
      "loss": 0.219,
      "step": 315000
    },
    {
      "epoch": 2.21,
      "learning_rate": 1.557607763983742e-06,
      "loss": 0.2287,
      "step": 320000
    },
    {
      "epoch": 2.25,
      "learning_rate": 1.5506953852959882e-06,
      "loss": 0.2272,
      "step": 325000
    },
    {
      "epoch": 2.25,
      "eval_accuracy": 0.9527264833450317,
      "eval_loss": 0.2502051889896393,
      "eval_runtime": 688.3338,
      "eval_samples_per_second": 46.65,
      "eval_steps_per_second": 11.663,
      "step": 325000
    }
  ],
  "max_steps": 1446680,
  "num_train_epochs": 10,
  "total_flos": 3.420489354301194e+17,
  "trial_name": null,
  "trial_params": null
}