# NOTE(review): the original paste header read "Spaces:" followed by two
# "Runtime error" lines — i.e. this script previously crashed at runtime.
# Kept here as a comment for context.
# --- Load the training data, build a train/test split, and load the base model ---
# Fixes vs. original: imports were scattered and `from datasets import Dataset`
# was duplicated (appeared twice); `training_data_json = {}` was a dead
# pre-initialization immediately overwritten by json.load.
import json
import os

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read and parse the JSON training file (UTF-8).
training_file_path = os.path.join(".", "training_data.json")
print("file path:", training_file_path)
with open(training_file_path, "r", encoding='utf-8') as training_file:
    training_data_json = json.load(training_file)

# Wrap the parsed records in a Dataset and split 80/20 into train/test.
# NOTE(review): split is not seeded — not reproducible across runs; pass
# `seed=` to train_test_split if reproducibility matters.
training_dataset = Dataset.from_list(training_data_json)
train_test_split = training_dataset.train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test'],
})
print(dataset)

# Pick a pretrained causal-LM checkpoint and its matching tokenizer.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
print("model loaded")
# --- Data preprocessing ---
def preprocess_function(examples, tok=None):
    """Tokenize NL→SQL pairs for causal-LM fine-tuning.

    Bug fixed vs. original: the original tokenized the SQL target *separately*
    and used it as `labels`, but for AutoModelForCausalLM labels must be
    position-aligned with `input_ids` — so every label token was misaligned
    with the prompt tokens, and pad positions contributed to the loss.
    Here the prompt and target are tokenized together, and `labels` is a copy
    of `input_ids` with prompt and padding positions masked to -100 (ignored
    by the cross-entropy loss), so loss is computed only on the SQL tokens.

    Args:
        examples: batched dict with 'natural_language_query' and 'sql_query'
            lists, as supplied by Dataset.map(batched=True).
        tok: optional tokenizer override; defaults to the module-level
            ``tokenizer`` so the original one-argument ``dataset.map`` call
            keeps working unchanged.

    Returns:
        dict with 'input_ids', 'attention_mask', and 'labels'.
    """
    tok = tokenizer if tok is None else tok
    max_len = 256

    prompts = [f"自然语言查询: {q}\nSQL查询: " for q in examples['natural_language_query']]
    full_texts = [p + t for p, t in zip(prompts, examples['sql_query'])]

    # Tokenize prompt+target as one sequence so labels can align 1:1.
    model_inputs = tok(full_texts, max_length=max_len, truncation=True,
                       padding='max_length')

    # Un-padded prompt lengths, used to mask prompt tokens out of the loss.
    prompt_lens = [len(ids) for ids in
                   tok(prompts, max_length=max_len, truncation=True)['input_ids']]

    labels = []
    for ids, mask, p_len in zip(model_inputs['input_ids'],
                                model_inputs['attention_mask'],
                                prompt_lens):
        # -100 = ignore_index: mask prompt positions and padding positions.
        labels.append([
            tok_id if (i >= p_len and m == 1) else -100
            for i, (tok_id, m) in enumerate(zip(ids, mask))
        ])
    model_inputs["labels"] = labels
    return model_inputs
# --- Tokenize the splits and configure the Trainer ---
encoded_dataset = dataset.map(preprocess_function, batched=True)
print("data pre-processed")

from transformers import TrainingArguments, Trainer

# Checkpoints are written outside this folder, next to the project root.
fine_tuned_dir = os.path.join("..", "fine-tuned-models")

# Training hyperparameters: 3 epochs, per-epoch evaluation, periodic
# checkpointing capped at 2 retained checkpoints.
training_args = TrainingArguments(
    output_dir=fine_tuned_dir,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Wire the model, arguments, and the two encoded splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)
print("training parameter set")
# --- Fine-tune, evaluate, and persist the result ---
trainer.train()
print("training completed")

# Evaluate on the held-out test split and report the metrics dict.
results = trainer.evaluate()
print(results)

# Bug fixed vs. original: only the model weights were saved; without the
# tokenizer files the saved directory cannot be reloaded for inference
# (from_pretrained on the tokenizer would fail). Save both to the same path.
trainer.save_model('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')
print("trained model saved")