---
# Training Configuration for Sheikh-2.5-Coder
# This file contains the training hyperparameters and settings

# Model Architecture
model:
  name: "Sheikh-2.5-Coder"
  num_parameters: 3.09e9
  context_length: 32768
  vocab_size: 50257
  hidden_size: 3072
  num_attention_heads: 16
  num_key_value_heads: 2
  num_hidden_layers: 36
  intermediate_size: 8192
  activation: "swiglu"
  layer_norm_epsilon: 1e-6
  max_position_embeddings: 32768

# Training Data
data:
  total_tokens: 5.5e12
  # Percentages below sum to 100 (40 + 25 + 20 + 15).
  sources:
    - name: "the-stack-v2"
      description: "Diverse programming language dataset"
      percentage: 40
    - name: "github-code"
      description: "High-quality GitHub repositories"
      percentage: 25
    - name: "synthetic-code-data"
      description: "AI-generated code examples"
      percentage: 20
    - name: "natural-language"
      description: "Code documentation and comments"
      percentage: 15

# Training Hyperparameters
training:
  # Optimization
  learning_rate: 1.0e-4
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-8

  # Training Schedule
  warmup_steps: 2000
  max_steps: 100000
  train_batch_size: 64
  gradient_accumulation_steps: 4
  max_grad_norm: 1.0

  # Mixed Precision
  # NOTE(review): fp16 and bf16 are both enabled here; most training
  # frameworks treat them as mutually exclusive — confirm which one the
  # consumer actually honors before relying on this.
  fp16: true
  bf16: true
  tf32: true

  # Regularization
  dropout: 0.1
  attention_dropout: 0.1

  # Evaluation
  eval_steps: 1000
  save_steps: 2000
  logging_steps: 100

# Instruction Tuning
instruction_tuning:
  enabled: true
  data_sources:
    - "code-instruct"
    - "multi-turn-conversations"
    - "programming-help"
  learning_rate: 5.0e-6
  train_batch_size: 16
  max_sequence_length: 32768

# Efficiency Optimizations
efficiency:
  flash_attention: true
  gradient_checkpointing: true
  deepspeed: false
  fsdp: false
  use_cache: true
  rope_scaling:
    type: "linear"
    factor: 8.0

# Hardware Configuration
hardware:
  gpus: 8
  gpu_type: "A100"
  gpu_memory: "80GB"
  host_memory: "1TB"
  network: "infiniband"

# Checkpointing
checkpointing:
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "loss"
  greater_is_better: false

# Distributed Training
distributed:
  world_size: 8
  rank: 0
  master_addr: "localhost"
  master_port: 12355

# Logging and Monitoring
logging:
  wandb:
    enabled: true
    project: "sheikh-2.5-coder"
  tensorboard:
    enabled: true
    log_dir: "./logs"
  mlflow:
    enabled: false

# Evaluation Metrics
evaluation:
  benchmarks:
    - name: "HumanEval"
      evaluation_steps: 1000
      batch_size: 10
    - name: "MBPP"
      evaluation_steps: 1000
      batch_size: 10
    - name: "MultiPL-E"
      evaluation_steps: 2000
      batch_size: 5