{
"_description": "Configuration for Dynamic-K Mixture-of-Experts Model",
"_model_type": "DynamicMOELM",
"model": {
"vocab_size": 10000,
"num_layers": 4,
"context_length": 256,
"d_model": 512,
"d_ff": 2048,
"num_heads": 8,
"theta": 10000.0
},
"moe": {
"num_experts": 4,
"confidence_threshold": 0.8
},
"loss_weights": {
"balance_loss_weight": 0.01,
"entropy_loss_weight": 0.001
},
"optimizer": {
"learning_rate": 3e-4,
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-8,
"weight_decay": 0.1,
"max_grad_norm": 1.0
},
"scheduler": {
"warmup_steps": 2000,
"max_steps": 20000
},
"training": {
"batch_size": 4,
"grad_accum_steps": 1,
"eval_interval": 500,
"log_interval": 100,
"save_interval": 2000,
"eval_steps": 10
},
"paths": {
"train_data_path": "data/train.txt",
"val_data_path": "data/test.txt",
"checkpoint_dir": "checkpoints_dynamic_moe",
"resume_from": null
},
"system": {
"device": "cuda"
},
"logging": {
"use_wandb": true,
"wandb_project": "dynamic-moe-phase2",
"wandb_run_name": null
}
}