# Training Configuration for Sheikh-2.5-Coder
# This file contains the training hyperparameters and settings
# Model Architecture
model:
  name: "Sheikh-2.5-Coder"
  num_parameters: 3.09e9
  context_length: 32768
  vocab_size: 50257
  hidden_size: 3072
  num_attention_heads: 16
  num_key_value_heads: 2
  num_hidden_layers: 36
  intermediate_size: 8192
  activation: "swiglu"
  layer_norm_epsilon: 1.0e-6
  max_position_embeddings: 32768
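
# Sanity check on the attention shapes: 3072 hidden / 16 attention heads
# gives 192-dim heads, and 2 key/value heads against 16 query heads means
# grouped-query attention with 16 / 2 = 8 query heads sharing each KV head.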
# Training Data
data:
  total_tokens: 5.5e12
  sources:
    - name: "the-stack-v2"
      description: "Diverse programming language dataset"
      percentage: 40
    - name: "github-code"
      description: "High-quality GitHub repositories"
      percentage: 25
    - name: "synthetic-code-data"
      description: "AI-generated code examples"
      percentage: 20
    - name: "natural-language"
      description: "Code documentation and comments"
      percentage: 15
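
# The source percentages sum to 100; at 5.5e12 total tokens that works out to
# roughly 2.2e12 (the-stack-v2), 1.375e12 (github-code), 1.1e12
# (synthetic-code-data), and 0.825e12 (natural-language) tokens per source.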
# Training Hyperparameters
training:
  # Optimization
  learning_rate: 1.0e-4
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-8
  # Training Schedule
  warmup_steps: 2000
  max_steps: 100000
  train_batch_size: 64
  gradient_accumulation_steps: 4
  max_grad_norm: 1.0
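
  # Token-budget check, assuming train_batch_size is per device (the HF
  # Trainer convention): 64 x 4 accumulation x 8 GPUs = 2048 sequences per
  # optimizer step, about 67M tokens at the full 32768-token context; over
  # 100000 steps that is ~6.7e12 tokens, the same order as the 5.5e12-token
  # corpus above.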
  # Mixed Precision
  # fp16 and bf16 are mutually exclusive; bf16 is the better choice on A100.
  fp16: false
  bf16: true
  tf32: true
  # Regularization
  dropout: 0.1
  attention_dropout: 0.1
  # Evaluation
  eval_steps: 1000
  save_steps: 2000
  logging_steps: 100
# Instruction Tuning
instruction_tuning:
  enabled: true
  data_sources:
    - "code-instruct"
    - "multi-turn-conversations"
    - "programming-help"
  learning_rate: 5.0e-6
  train_batch_size: 16
  max_sequence_length: 32768
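
# The instruction-tuning learning rate (5.0e-6) is 20x smaller than the
# pretraining rate (1.0e-4), the usual pattern for fine-tuning a converged
# base model without disturbing its pretrained weights.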
# Efficiency Optimizations
efficiency:
  flash_attention: true
  gradient_checkpointing: true
  deepspeed: false
  fsdp: false
  # The KV cache only helps generation and conflicts with gradient
  # checkpointing, so it is disabled during training.
  use_cache: false
  rope_scaling:
    type: "linear"
    factor: 8.0
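
# Linear RoPE scaling divides position indices by the factor, so factor 8.0
# implies rotary embeddings calibrated for a 32768 / 8 = 4096-token base
# context, stretched here to the full 32768 positions.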
# Hardware Configuration
hardware:
  gpus: 8
  gpu_type: "A100"
  gpu_memory: "80GB"
  host_memory: "1TB"
  network: "infiniband"
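
# Rough memory budget for this config: bf16 training with fp32 AdamW state
# costs about 16 bytes per parameter (bf16 weights and grads, fp32 master
# weights and two moments), so 3.09e9 params need ~50 GB of model and
# optimizer state per GPU, replicated since DeepSpeed and FSDP are off; that
# fits in 80 GB with room for activations under gradient checkpointing.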
# Checkpointing
checkpointing:
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "loss"
  greater_is_better: false
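
# In the HF Trainer convention, load_best_model_at_end requires save_steps to
# be a round multiple of eval_steps (2000 / 1000 = 2 here, so that holds), and
# save_total_limit: 3 keeps the three most recent checkpoints while always
# retaining the best one.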
# Distributed Training
distributed:
  world_size: 8
  rank: 0
  master_addr: "localhost"
  master_port: 12355
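
# world_size: 8 matches the 8-GPU host above. rank is per process, so the
# value here only describes the first worker; a launcher such as torchrun
# normally overrides RANK, MASTER_ADDR, and MASTER_PORT for each process.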
# Logging and Monitoring
logging:
  wandb:
    enabled: true
    project: "sheikh-2.5-coder"
  tensorboard:
    enabled: true
    log_dir: "./logs"
  mlflow:
    enabled: false
# Evaluation Metrics
evaluation:
  benchmarks:
    - name: "HumanEval"
      evaluation_steps: 1000
      batch_size: 10
    - name: "MBPP"
      evaluation_steps: 1000
      batch_size: 10
    - name: "MultiPL-E"
      evaluation_steps: 2000
      batch_size: 5
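
# HumanEval (164 Python problems) and MBPP are execution-scored code
# benchmarks; MultiPL-E translates them into many additional programming
# languages, which presumably motivates its half-frequency 2000-step cadence
# and smaller batch size.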