---
# Model architecture configuration for Sheikh-2.5-Coder.
model:
  name: "Sheikh-2.5-Coder"
  num_parameters: 3.09e+9
  context_length: 32768
  vocab_size: 50257
  hidden_size: 3072
  num_attention_heads: 16
  num_key_value_heads: 2
  num_hidden_layers: 36
  intermediate_size: 8192
  activation: "swiglu"
  layer_norm_epsilon: 1.0e-6
  max_position_embeddings: 32768
|
# Pre-training data mixture (percentages sum to 100).
data:
  total_tokens: 5.5e+12
  sources:
    - name: "the-stack-v2"
      description: "Diverse programming language dataset"
      percentage: 40
    - name: "github-code"
      description: "High-quality GitHub repositories"
      percentage: 25
    - name: "synthetic-code-data"
      description: "AI-generated code examples"
      percentage: 20
    - name: "natural-language"
      description: "Code documentation and comments"
      percentage: 15
|
# Optimizer, schedule, precision, regularization, and eval cadence.
training:
  learning_rate: 1.0e-4
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-8
  warmup_steps: 2000
  max_steps: 100000
  train_batch_size: 64
  gradient_accumulation_steps: 4
  max_grad_norm: 1.0
  # NOTE(review): fp16 and bf16 are both enabled here; most trainers accept
  # only one mixed-precision mode — confirm which one is intended.
  fp16: true
  bf16: true
  tf32: true
  dropout: 0.1
  attention_dropout: 0.1
  eval_steps: 1000
  save_steps: 2000
  logging_steps: 100
|
instruction_tuning:
  enabled: true
  data_sources:
    - "code-instruct"
    - "multi-turn-conversations"
    - "programming-help"
  learning_rate: 5.0e-6
  train_batch_size: 16
  max_sequence_length: 32768
|
efficiency:
  flash_attention: true
  gradient_checkpointing: true
  deepspeed: false
  fsdp: false
  use_cache: true
  rope_scaling:
    type: "linear"
    factor: 8.0
|
hardware:
  gpus: 8
  gpu_type: "A100"
  gpu_memory: "80GB"
  host_memory: "1TB"
  network: "infiniband"
|
checkpointing:
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "loss"
  greater_is_better: false
|
distributed:
  world_size: 8
  rank: 0
  master_addr: "localhost"
  master_port: 12355
|
logging:
  wandb:
    enabled: true
    project: "sheikh-2.5-coder"
  tensorboard:
    enabled: true
    log_dir: "./logs"
  mlflow:
    enabled: false
|
evaluation:
  benchmarks:
    - name: "HumanEval"
      evaluation_steps: 1000
      batch_size: 10
    - name: "MBPP"
      evaluation_steps: 1000
      batch_size: 10
    - name: "MultiPL-E"
      evaluation_steps: 2000
      batch_size: 5