{
  "pretrained_model_name_or_path": "Qwen/Qwen3-0.6B",
  "output_dir": "./outputs",
  "seed": 42,
  "tf32": true,
  "model_type": "causal",
  "vocab_size": 4096,
  "hidden_size": 256,
  "num_hidden_layers": 8,
  "num_attention_heads": 8,
  "context_length": 256,
  "prediction_length": 64,
  "tokenizer_class": "NonUniformBins",
  "binning_power": 2.0,
  "exponential_base": 1.01,
  "n_special_tokens": 2,
  "pad_token_id": 0,
  "eos_token_id": 1,
  "use_eos_token": true,
  "min_past": 64,
  "drop_prob": 0.1,
  "shuffle_buffer_length": 20000,
  "per_device_train_batch_size": 32,
  "learning_rate": 0.0001,
  "max_steps": 40000,
  "warmup_ratio": 0.1,
  "lr_scheduler_type": "cosine",
  "optim": "adamw_torch",
  "gradient_accumulation_steps": 4,
  "log_steps": 20,
  "save_steps": 100,
  "dataloader_num_workers": 8,
  "torch_compile": true,
  "logger": "<Logger __main__ (INFO)>",
  "total_train_batch_size": 128,
  "short_model_name": "Qwen3-0.6B",
  "run_name": "Qwen3-0.6B-h256-l8-a8-ctx256-pred64-vocab4096-NonUniform-lr1.0e-04-bs128-steps40000"
}