Sheikh-2.5-Coder / config.json
likhonsheikh's picture
Add config.json: Model architecture configuration file
230f696 verified
{
"model_type": "phi",
"architecture": "MiniMax-M2",
"vocab_size": 51200,
"max_position_embeddings": 32768,
"num_attention_heads": 16,
"num_key_value_heads": 2,
"num_hidden_layers": 36,
"intermediate_size": 8192,
"hidden_size": 2048,
"rms_norm_epsilon": 1e-6,
"rope_theta": 10000.0,
"pad_token_id": 50256,
"eos_token_id": 50256,
"bos_token_id": 50256,
"torch_dtype": "float16",
"model_specifics": {
"total_parameters": 3090000000,
"non_embedding_parameters": 2770000000,
"embedding_parameters": 320000000,
"parameter_percentage": {
"embedding_layer": 0.104,
"transformer_layers": 0.793,
"layer_norm": 0.003
}
},
"optimization_config": {
"quantization": {
"supported_formats": ["fp32", "fp16", "int8", "int4"],
"recommended": {
"memory_optimized": "int8",
"performance_optimized": "fp16",
"memory_constrained": "int4"
}
},
"memory_requirements": {
"fp32": 12.0,
"fp16": 6.0,
"int8": 3.5,
"int4": 2.0,
"runtime_activation": 0.5
},
"inference_optimization": {
"flash_attention": true,
"gradient_checkpointing": true,
"mixed_precision": true,
"dynamic_batching": false
}
},
"training_config": {
"base_model": "microsoft/phi-2",
"context_length": 32768,
"batch_size": {
"train": 8,
"eval": 8,
"gradient_accumulation": 4
},
"learning_rate": 1e-4,
"num_epochs": 3,
"warmup_steps": 1000,
"max_grad_norm": 1.0,
"weight_decay": 0.01,
"logging_steps": 100,
"save_steps": 1000,
"eval_steps": 1000
},
"specialization": {
"primary_languages": ["javascript", "typescript", "xml", "html", "css", "mdx"],
"domain_focus": "web_development",
"on_device_ready": true,
"memory_optimized": true,
"context_extended": true
},
"evaluation_targets": {
"mmlu_code_score": ">60%",
"humaneval": ">40%",
"codebleu": ">0.65",
"syntax_validity": ">95%",
"semantic_coherence": ">0.80"
},
"tokenization": {
"base_tokenizer": "microsoft/codebert-base",
"tokenizer_max_length": 8192,
"special_tokens": {
"javascript": ["<js>", "</js>", "<function>", "</function>", "<react>", "</react>"],
"xml": ["<xml>", "</xml>", "<element>", "</element>", "<config>", "</config>"],
"mdx": ["<mdx>", "</mdx>", "<component>", "</component>", "<interactive>", "</interactive>"]
}
},
"dataset_distribution": {
"total_training_tokens": "500B",
"language_distribution": {
"javascript_typescript": 0.35,
"xml_html": 0.25,
"mdx_markdown": 0.15,
"css_scss": 0.10,
"other_languages": 0.15
},
"task_distribution": {
"code_completion": 0.40,
"instruction_following": 0.25,
"code_explanation": 0.20,
"generation": 0.10,
"debugging": 0.05
}
},
"quality_metrics": {
"data_quality_threshold": 0.85,
"duplication_rate_max": 0.05,
"language_accuracy": 0.95,
"syntax_validity_min": 0.90,
"semantic_coherence_min": 0.75
},
"deployment_config": {
"target_memory_gb": "6-12",
"quantization_strategies": {
"mobile": "int8",
"edge": "int8",
"desktop": "fp16",
"server": "fp16"
},
"inference_time_target": {
"512_tokens": "<100ms",
"1024_tokens": "<200ms",
"2048_tokens": "<400ms"
}
}
}