{ "model": { "base_model": "Kwaipilot/KAT-Dev", "final_model_path": "./outputs/phased-kat-dev-lora_20251205_000541/final_model" }, "training_config": { "lora_r": 128, "lora_alpha": 256, "lora_dropout": 0.05, "learning_rate": 0.0001, "micro_batch_size": 1, "gradient_accumulation_steps": 6, "sequence_length": 32768, "train_split": 0.95, "val_split": 0.05 }, "hardware": { "num_gpus": 8, "gpu_name": "NVIDIA H200" }, "phases": [ { "phase": 1, "name": "Foundation", "dataset": "phase1_foundation.jsonl", "epochs": 3, "train_metrics": { "train_runtime": 9436.6955, "train_samples_per_second": 2.88, "train_steps_per_second": 0.06, "total_flos": 1.6641615376416768e+16, "train_loss": 0.29176438065012505, "entropy": 0.20519424066310976, "num_tokens": 8882376.0, "mean_token_accuracy": 0.950545616266204, "epoch": 3.0 }, "eval_metrics": { "eval_loss": 0.24340955913066864, "eval_runtime": 62.225, "eval_samples_per_second": 7.666, "eval_steps_per_second": 0.964, "eval_entropy": 0.23551432291666666, "eval_num_tokens": 8882376.0, "eval_mean_token_accuracy": 0.9330763538678487, "epoch": 3.0, "eval_perplexity": 1.2755909470694622 } }, { "phase": 2, "name": "Evolution", "dataset": "phase2_evolution.jsonl", "epochs": 2, "train_metrics": { "train_runtime": 10338.2916, "train_samples_per_second": 2.712, "train_steps_per_second": 0.057, "total_flos": 4.381908490256384e+16, "train_loss": 0.7254828861549039, "entropy": 0.5080015120967742, "num_tokens": 23480858.0, "mean_token_accuracy": 0.8641254305839539, "epoch": 2.0 }, "eval_metrics": { "eval_loss": 0.7661350965499878, "eval_runtime": 92.6646, "eval_samples_per_second": 7.964, "eval_steps_per_second": 1.004, "eval_entropy": 0.7210181451612904, "eval_num_tokens": 23480858.0, "eval_mean_token_accuracy": 0.8110371943443052, "epoch": 2.0, "eval_perplexity": 2.1514350757094487 } }, { "phase": 3, "name": "PR Mastery", "dataset": "phase3_pr_mastery.jsonl", "epochs": 2, "train_metrics": { "train_runtime": 5479.5736, "train_samples_per_second": 2.829, "train_steps_per_second": 0.059, "total_flos": 2.885146660097229e+16, "train_loss": 0.5378207373030391, "entropy": 0.47805059523809523, "num_tokens": 15454180.0, "mean_token_accuracy": 0.8748719777379718, "epoch": 2.0 }, "eval_metrics": { "eval_loss": 0.5605510473251343, "eval_runtime": 49.9945, "eval_samples_per_second": 8.161, "eval_steps_per_second": 1.02, "eval_entropy": 0.5254289215686274, "eval_num_tokens": 15454180.0, "eval_mean_token_accuracy": 0.8568766397588393, "epoch": 2.0, "eval_perplexity": 1.7516374695420183 } } ], "phase_checkpoints": [ "./outputs/phased-kat-dev-lora_20251205_000541/phase1_phase1_foundation/final_checkpoint", "./outputs/phased-kat-dev-lora_20251205_000541/phase2_phase2_evolution/final_checkpoint", "./outputs/phased-kat-dev-lora_20251205_000541/phase3_phase3_pr_mastery/final_checkpoint" ], "summary": { "initial_loss": 0.29176438065012505, "final_loss": 0.5378207373030391, "initial_eval_loss": 0.24340955913066864, "final_eval_loss": 0.5605510473251343, "initial_perplexity": 1.2755909470694622, "final_perplexity": 1.7516374695420183, "total_epochs": 7, "total_phases": 3 }, "timestamp": "20251205_000541", "run_name": "phased-kat-dev-lora", "output_directory": "./outputs/phased-kat-dev-lora_20251205_000541" }