Spaces:

diminch
/

ielts-grader-api

Sleeping

App Files Files Community

diminch committed 18 days ago

Commit

d939bae

0 Parent(s):

Deploy V15 Clean (Removed binary files history)

Browse files

Files changed (11) hide show

.gitignore +31 -0
Dockerfile +13 -0
README.md +24 -0
requirements.txt +21 -0
src/api.py +366 -0
src/clean_external_data.py +506 -0
src/clean_external_data_task1.py +94 -0
src/explore.py +90 -0
src/explore_speaking.py +71 -0
src/pronunciation.py +111 -0
src/train.py +151 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,31 @@

+# File .gitignore
+*.mp3
+*.wav
+*.m4a
+data/
+# Môi trường ảo
+venv/
+# Python cache
+__pycache__/
+*.pyc
+.cache/
+# File hệ điều hành
+.DS_Store
+Thumbs.db
+# File IDE
+.vscode/
+.idea/
+# File dữ liệu local và model đã huấn luyện
+# (Chúng ta chỉ push code, không push data/model nặng)
+*.json
+*.csv
+best_model/
+ielts_grader_model/
+# File .env (chứa API keys)
+.env

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

# Base image: official CPython 3.10.
FROM python:3.10
WORKDIR /app
# Native command-line tools installed next to the Python packages. The apt
# lists are deleted in the same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y \
    espeak-ng \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*
# Install dependencies before copying the code so this layer is reused when
# only the sources change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# World-writable cache directory for models downloaded at runtime.
RUN mkdir -p /app/.cache && chmod 777 /app/.cache
# HF_HOME is the supported variable for relocating the Hugging Face cache.
# TRANSFORMERS_CACHE is deprecated and is kept only for older transformers
# releases that still read it.
ENV HF_HOME=/app/.cache
ENV TRANSFORMERS_CACHE=/app/.cache
EXPOSE 7860
CMD ["uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,24 @@

+---
+title: IELTS Grader API
+emoji: 🚀
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+pinned: false
+app_port: 7860
+---
+# IELTS Grader AI API
+Đây là Backend API chấm điểm IELTS Writing (Task 1, Task 2) và Speaking sử dụng AI.
+## Các tính năng chính:
+- **Task 1:** Phân tích biểu đồ (Vision AI) và chấm điểm.
+- **Task 2:** Chấm điểm bài luận nghị luận xã hội.
+- **Feedback:** Cung cấp nhận xét chi tiết và hướng dẫn sửa lỗi.
+## Cách sử dụng
+API chạy tại port 7860.
+Endpoint chính: `POST /grade/writing` và `POST /grade/speaking`

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+# Train
+transformers[torch]
+datasets
+scikit-learn
+huggingface_hub
+tqdm
+torch
+pydantic
+# API
+fastapi
+uvicorn[standard]
+httpx
+python-dotenv
+# Library
+openai
+librosa
+openai-whisper
+numpy
+python-multipart

src/api.py ADDED Viewed

	@@ -0,0 +1,366 @@

+import uvicorn
+from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+from pydantic import BaseModel, Field
+from transformers import pipeline
+import torch
+import os
+import json
+import httpx
+import shutil
+import whisper
+import librosa
+import numpy as np
+from dotenv import load_dotenv
+from typing import Optional, List
+import uuid
# The pronunciation helper is imported through the ``src`` package when
# possible, with a flat import as the fallback.
try:
    from src.pronunciation import grade_pronunciation_advanced
except ImportError:
    from pronunciation import grade_pronunciation_advanced

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Hugging Face model ids of the two writing scorers.
SCORER_MODEL_ID_TASK1 = "diminch/ielts-task1-grader-ai-v2"
SCORER_MODEL_ID_TASK2 = "diminch/ielts-grader-ai-v2"

# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda:0"
print(f"API running on: {DEVICE}")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
if not OPENAI_API_KEY:
    print("WARNING: OPENAI_API_KEY not found in .env")

# Whisper is loaded once at import time. ``whisper_model`` stays None when
# loading fails so the speaking endpoint can report it.
whisper_model = None
print("Loading Whisper...")
try:
    whisper_model = whisper.load_model("base", device=DEVICE)
except Exception as e:
    print(f"Error loading Whisper: {e}")
else:
    print("Whisper Loaded.")
# Registry of writing scorers keyed by task name ("task1", "task2").
# A value of None marks a scorer that failed to load.
pipelines = {}


def load_writing_model(task_name, model_id):
    """Build the text-classification pipeline for one task and register it.

    The pipeline is stored in ``pipelines[task_name]``. Any error while
    building it is printed and ``None`` is stored instead, so a failure
    never aborts module import.
    """
    print(f"Loading {task_name}: {model_id}...")
    try:
        scorer = pipeline(
            "text-classification",
            model=model_id,
            tokenizer=model_id,
            device=DEVICE,
            return_all_scores=True,
        )
    except Exception as e:
        print(f"Error loading {task_name}: {e}")
        pipelines[task_name] = None
    else:
        pipelines[task_name] = scorer
        print(f"Loaded {task_name}.")


load_writing_model("task1", SCORER_MODEL_ID_TASK1)
load_writing_model("task2", SCORER_MODEL_ID_TASK2)
class WritingRequest(BaseModel):
    # JSON body accepted by POST /grade/writing.
    task_type: int  # selects the scorer registered as "task<task_type>"
    prompt: str  # the task question
    essay: str  # the candidate's answer
    image_url: Optional[str] = None  # chart image; the endpoint rejects Task 1 requests without it
class WritingScores(BaseModel):
    # One numeric score per writing criterion.
    taskResponse: float  # Task Response (Task Achievement for Task 1)
    coherenceCohesion: float  # Coherence and Cohesion
    lexicalResource: float  # Lexical Resource
    grammaticalRange: float  # Grammatical Range and Accuracy
class ShortFeedbackWriting(BaseModel):
    # One short feedback text per writing criterion; fields mirror WritingScores.
    taskResponse: str
    coherenceCohesion: str
    lexicalResource: str
    grammaticalRange: str
class WritingResponse(BaseModel):
    # Response body of POST /grade/writing.
    overallScore: float  # overall band derived from the four criteria
    imageDescription: Optional[str] = None  # chart description; set for Task 1 only
    criteriaScores: WritingScores  # per-criterion scores
    shortFeedback: ShortFeedbackWriting  # per-criterion short comments
    detailedFeedback: str  # long-form feedback text (Markdown is requested from the LLM)
class SpeakingScores(BaseModel):
    # One numeric score per speaking criterion.
    fluencyCoherence: float  # Fluency and Coherence
    lexicalResource: float  # Lexical Resource
    grammaticalRange: float  # Grammatical Range and Accuracy
    pronunciation: float  # Pronunciation
class PronunciationWord(BaseModel):
    # One entry of the per-word pronunciation breakdown requested from the LLM.
    word: str  # the word as it appears in the transcript
    score: int  # per-word score; the speaking prompt asks for a 1-10 value
    phonemes_expected: str  # expected phoneme sequence
    phonemes_actual: str  # phoneme sequence reported for the audio
    is_correct: bool  # whether the word was judged correctly pronounced
    error_type: Optional[str] = None  # error label such as "severe_substitution"
class SpeakingResponse(BaseModel):
    # Response body of POST /grade/speaking.
    overallScore: float  # overall band derived from the four criteria
    transcript: str  # Whisper transcription of the uploaded audio
    refinedTranscript: str  # corrected transcript returned by the LLM
    betterVersion: str  # upgraded answer returned by the LLM
    criteriaScores: SpeakingScores  # per-criterion scores
    shortFeedback: dict  # free-form mapping of criterion label to short comment
    detailedFeedback: str  # long-form feedback text
    pronunciationBreakdown: List[PronunciationWord]  # per-word pronunciation analysis
def round_to_half(score: float) -> float:
    """Round *score* to the nearest half band, with ties going up.

    The built-in ``round`` rounds ties to the nearest even integer, which
    turned an average of 6.25 into 6.0. IELTS rounds a .25 average up to the
    next half band and a .75 average up to the next whole band, so ties are
    rounded upwards here.

    Args:
        score: Raw score or average.

    Returns:
        The score snapped to a multiple of 0.5.
    """
    import math

    return math.floor(score * 2 + 0.5) / 2
async def analyze_chart_image(image_url: str, prompt_text: str) -> str:
    """Ask the OpenAI vision model for a factual description of a Task 1 chart.

    Args:
        image_url: URL of the chart image.
        prompt_text: The Task 1 question, embedded in the instruction so the
            description focuses on the data the question refers to.

    Returns:
        The model's description, ``"No image provided."`` when *image_url* is
        empty, or ``""`` when the request fails for any reason. Failures are
        printed, never raised.
    """
    if not image_url:
        return "No image provided."
    print("Analyzing chart image...")
    headers = { "Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json" }
    vision_prompt = f"""
    Act as a data analyst. Describe this IELTS Writing Task 1 image in detail.
    Focus strictly on the main trends, comparisons, and specific data points mentioned in the prompt: "{prompt_text}".
    Output a factual description paragraph representing the 'Ground Truth' of the image.
    """
    payload = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": [
            {"type": "text", "text": vision_prompt},
            {"type": "image_url", "image_url": {"url": image_url}}
        ]}],
        "max_tokens": 500
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            resp = await client.post(OPENAI_API_URL, headers=headers, json=payload)
            # Surface 4xx/5xx responses with their status code instead of an
            # opaque KeyError on the missing 'choices' field.
            resp.raise_for_status()
            return resp.json()['choices'][0]['message']['content']
        except Exception as e:
            # Best effort: grading continues without an image description.
            print(f"Vision Error: {e}")
            return ""
async def generate_writing_feedback(prompt: str, essay: str, scores: WritingScores, task_type: int, img_desc: str = "") -> dict:
    """Ask the OpenAI chat model for written feedback on a scored essay.

    Args:
        prompt: The task question.
        essay: The candidate's essay.
        scores: Scores already assigned, passed to the model as context.
        task_type: 1 or 2. For Task 1 the first criterion is named
            "Task Achievement" and *img_desc* is included as ground truth.
        img_desc: Description of the Task 1 chart (ignored for other tasks).

    Returns:
        The parsed JSON object returned by the model. It is requested to hold
        ``shortFeedback`` and ``detailedFeedback``, but its shape is not
        validated here.

    Raises:
        httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
        httpx.HTTPError: The request could not be completed.
        KeyError, IndexError, json.JSONDecodeError: The response body does not
            have the expected shape.
    """
    print("Generating Writing feedback...")
    scores_dict = scores.model_dump()
    context_info = ""
    criterion_1_name = "Task Response"
    if task_type == 1:
        context_info = f"IMAGE GROUND TRUTH: {img_desc}\n(Check if the student accurately reported this data)"
        criterion_1_name = "Task Achievement"
    system_prompt = f"""
    You are a strict, expert IELTS Examiner.
    TASK INFO:
    - Type: Task {task_type}
    - Prompt: "{prompt}"
    {context_info}
    STUDENT ESSAY:
    "{essay}"
    SCORES GIVEN (0-9):
    {json.dumps(scores_dict)}
    YOUR GOAL:
    Provide a deeply analytical and educational feedback JSON.
    INSTRUCTIONS FOR 'detailedFeedback':
    The 'detailedFeedback' field MUST be a long Markdown string structured as follows:
    1. **General Overview**: A brief summary of why the essay got this band score.
    2. **Strengths & Weaknesses**: Bullet points highlighting what was done well and what was missing in each criteria (one by one, four criterias in total).
    3. **Specific Corrections (CRITICAL)**:
       - Identify 3-4 specific errors (grammar, vocab, or data accuracy).
       - For each error, show the "Original Text" -> "Correction" -> "Explanation".
       - Example: *Original: "The data shows an increase." -> Better: "The data illustrates a significant upward trend." (Explanation: Use more precise academic vocabulary).*
    4. **Actionable Advice**: Give 2-3 concrete steps the student should take to improve their score next time.
    Output JSON format:
    {{
        "shortFeedback": {{
            "{criterion_1_name}": "...",
            "Coherence and Cohesion": "...",
            "Lexical Resource": "...",
            "Grammatical Range and Accuracy": "..."
        }},
        "detailedFeedback": "MARKDOWN STRING..."
    }}
    """
    payload = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "system", "content": system_prompt}],
        "response_format": {"type": "json_object"}
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        resp = await client.post(OPENAI_API_URL, headers={"Authorization": f"Bearer {OPENAI_API_KEY}"}, json=payload)
        # Fail with the real HTTP status rather than a KeyError on 'choices'.
        resp.raise_for_status()
        return json.loads(resp.json()['choices'][0]['message']['content'])
app = FastAPI(title="IELTS Full-Stack AI API (V15.0)")


@app.post("/grade/writing", response_model=WritingResponse)
async def grade_writing(request: WritingRequest):
    """Score a Writing Task 1 or Task 2 essay and attach LLM feedback.

    Steps: pick the scorer for the task, describe the chart (Task 1 only),
    run the scorer, round each criterion to a half band, average the four
    criteria, then ask the LLM for feedback.

    Raises:
        HTTPException(400): Unsupported ``task_type``, or a Task 1 request
            without ``image_url``.
        HTTPException(500): The scorer for the task failed to load.
    """
    task_key = f"task{request.task_type}"
    # A task type with no registered scorer is a client error, not a server
    # failure, so it is reported separately from "model failed to load".
    if task_key not in pipelines:
        raise HTTPException(400, f"Unsupported task_type: {request.task_type}.")
    model = pipelines[task_key]
    if not model:
        raise HTTPException(500, "Model not ready.")

    image_desc = ""
    if request.task_type == 1:
        if not request.image_url:
            raise HTTPException(400, "Task 1 requires image_url.")
        image_desc = await analyze_chart_image(request.image_url, request.prompt)
        final_input = f"PROMPT: {request.prompt}\n\nIMAGE CONTEXT: {image_desc} [SEP] {request.essay}"
    else:
        final_input = f"{request.prompt} [SEP] {request.essay}"

    # NOTE(review): truncation keeps the first 512 tokens, so a long prompt or
    # image description may push part of the essay out of the scorer input.
    results = model(final_input, truncation=True, max_length=512)[0]
    raw = {item['label']: item['score'] for item in results}

    # Labels LABEL_0..LABEL_3 map to the four criteria in order. A missing
    # label falls back to 1.0.
    scores = WritingScores(
        taskResponse=round_to_half(raw.get('LABEL_0', 1.0)),
        coherenceCohesion=round_to_half(raw.get('LABEL_1', 1.0)),
        lexicalResource=round_to_half(raw.get('LABEL_2', 1.0)),
        grammaticalRange=round_to_half(raw.get('LABEL_3', 1.0))
    )
    overall = round_to_half((scores.taskResponse + scores.coherenceCohesion +
                             scores.lexicalResource + scores.grammaticalRange) / 4)

    # Feedback
    fb = await generate_writing_feedback(request.prompt, request.essay, scores, request.task_type, image_desc)
    sf = fb.get("shortFeedback", {})
    # The first criterion is keyed differently for Task 1 and Task 2.
    tr_fb = sf.get("Task Response") or sf.get("Task Achievement") or "No feedback"
    return WritingResponse(
        overallScore=overall,
        imageDescription=image_desc if request.task_type == 1 else None,
        criteriaScores=scores,
        shortFeedback=ShortFeedbackWriting(
            taskResponse=tr_fb,
            coherenceCohesion=sf.get("Coherence and Cohesion", ""),
            lexicalResource=sf.get("Lexical Resource", ""),
            grammaticalRange=sf.get("Grammatical Range and Accuracy", "")
        ),
        detailedFeedback=fb.get("detailedFeedback", "")
    )
async def grade_speaking_with_gpt(transcript: str, metrics: dict, ipa_data: dict, prompt_text: str) -> dict:
    """Ask the OpenAI chat model to score a spoken answer and give feedback.

    Args:
        transcript: Whisper transcription of the answer.
        metrics: Must contain ``wpm`` and ``pause_ratio`` (numeric).
        ipa_data: May contain ``actual_ipa`` and ``expected_ipa``; missing
            keys are rendered as empty strings.
        prompt_text: The speaking question.

    Returns:
        The parsed JSON object returned by the model. It is requested to hold
        scores, feedback, transcripts and a pronunciation breakdown, but its
        shape is not validated here.

    Raises:
        httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
        httpx.HTTPError: The request could not be completed.
        KeyError, IndexError, json.JSONDecodeError: The response body does not
            have the expected shape.
    """
    print("Generating Speaking feedback...")
    system_prompt = f"""
    You are an expert IELTS Speaking Examiner and Phonetician.
    INPUT DATA:
    - Question: "{prompt_text}"
    - Transcript (Whisper): "{transcript}"
    - Raw Audio IPA (Actual): /{ipa_data.get('actual_ipa', '')}/
    - Expected IPA (Standard): /{ipa_data.get('expected_ipa', '')}/
    METRICS:
    - Speed: {metrics['wpm']:.1f} WPM
    - Pauses: {metrics['pause_ratio']*100:.1f}%
    YOUR TASK:
    1. Score the 4 criteria (0-9).
    2. **Pronunciation Breakdown**: Map words from Transcript to the IPA. Identify mispronounced words.
       - Compare Actual vs Expected IPA for each word.
       - Assign a score (1-10) for each word's pronunciation.
       - Flag errors (e.g., 'severe_substitution' if user said 'trip' but meant 'subject').
    OUTPUT JSON FORMAT (This is sample structure, replace with actual data):
    {{
        "scores": {{ "fluencyCoherence": 0.0, "lexicalResource": 0.0, "grammaticalRange": 0.0, "pronunciation": 0.0 }},
        "shortFeedback": {{ "Fluency": "...", "Vocabulary": "...", "Grammar": "...", "Pronunciation": "..." }},
        "detailedFeedback": "MARKDOWN string...",
        "refinedTranscript": "Corrected version...",
        "betterVersion": "Upgraded Band 8 version...",
        "pronunciationBreakdown": [
            {{
                "word": "subject",
                "score": 3,
                "phonemes_expected": "s ʌ b dʒ ɛ k t",
                "phonemes_actual": "t r ɪ p",
                "is_correct": false,
                "error_type": "severe_substitution"
            }},
            ... (more words)
        ]
    }}
    """
    payload = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "system", "content": system_prompt}],
        "response_format": {"type": "json_object"}
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        resp = await client.post(OPENAI_API_URL, headers={"Authorization": f"Bearer {OPENAI_API_KEY}"}, json=payload)
        # Fail with the real HTTP status rather than a KeyError on 'choices'.
        resp.raise_for_status()
        return json.loads(resp.json()['choices'][0]['message']['content'])
@app.post("/grade/speaking", response_model=SpeakingResponse)
async def grade_speaking(audio: UploadFile = File(...), prompt: str = Form(...)):
    """Transcribe an uploaded answer, measure it, and score it with the LLM.

    Steps: save the upload to a temporary file, transcribe it with Whisper,
    compute words-per-minute and the pause ratio with librosa, run the
    pronunciation analysis, then ask the LLM for scores and feedback. The
    temporary file is always removed.

    Raises:
        HTTPException(500): Whisper is not loaded, or any processing step
            failed (the error text is returned as the detail).
    """
    import traceback

    # Check before touching the disk; there is nothing to clean up yet.
    if not whisper_model:
        raise HTTPException(500, "Whisper missing")

    temp_filename = f"temp_{uuid.uuid4()}.wav"
    try:
        with open(temp_filename, "wb") as buffer:
            shutil.copyfileobj(audio.file, buffer)

        # 1. Whisper & Acoustic Metrics
        res = whisper_model.transcribe(temp_filename)
        transcript = res["text"].strip()
        y, sr = librosa.load(temp_filename)
        duration = librosa.get_duration(y=y, sr=sr)
        word_count = len(transcript.split())
        wpm = (word_count / duration) * 60 if duration > 0 else 0
        # Everything outside the non-silent intervals counts as pause time.
        non_silent = librosa.effects.split(y, top_db=20)
        silent_time = duration - sum((e - s) / sr for s, e in non_silent)
        pause_ratio = silent_time / duration if duration > 0 else 0
        metrics = {"wpm": wpm, "pause_ratio": pause_ratio}

        # 2. IPA Analysis (Subprocess based)
        ipa_data = grade_pronunciation_advanced(temp_filename, transcript)

        # 3. GPT Analysis
        gpt_result = await grade_speaking_with_gpt(transcript, metrics, ipa_data, prompt)
        scores = gpt_result.get("scores", {})

        # 4. Response
        criteria = SpeakingScores(
            fluencyCoherence=round_to_half(scores.get("fluencyCoherence", 0)),
            lexicalResource=round_to_half(scores.get("lexicalResource", 0)),
            grammaticalRange=round_to_half(scores.get("grammaticalRange", 0)),
            pronunciation=round_to_half(scores.get("pronunciation", 0))
        )
        overall = round_to_half((criteria.fluencyCoherence + criteria.lexicalResource +
                                 criteria.grammaticalRange + criteria.pronunciation) / 4)
        return SpeakingResponse(
            overallScore=overall,
            transcript=transcript,
            refinedTranscript=gpt_result.get("refinedTranscript", ""),
            betterVersion=gpt_result.get("betterVersion", ""),
            criteriaScores=criteria,
            shortFeedback=gpt_result.get("shortFeedback", {}),
            detailedFeedback=gpt_result.get("detailedFeedback", ""),
            pronunciationBreakdown=gpt_result.get("pronunciationBreakdown", [])
        )
    except HTTPException:
        # Already a well-formed HTTP error; do not re-wrap it as a generic 500.
        raise
    except Exception as e:
        print(f"Speaking Error: {e}")
        traceback.print_exc()
        raise HTTPException(500, str(e)) from e
    finally:
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
@app.get("/")
def read_root():
    """Return a static message confirming the API is up."""
    status = {"message": "IELTS API is running."}
    return status


# Running this file directly serves the app on port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

src/clean_external_data.py ADDED Viewed

	@@ -0,0 +1,506 @@

+import json
+import re
+import os
+from datasets import load_dataset
+from tqdm import tqdm
# Regex that captures a score (e.g. 7, 7.5 or 6.0).
FLOAT_RE = r"(\d+(?:\.\d+)?)"


def to_float_safe(x):
    """Convert *x* to a float band score, or return None.

    Returns:
        ``float(x)`` when the conversion succeeds and the value lies in the
        valid band range 0-9 (inclusive); otherwise ``None``. NaN is rejected
        by the range check.
    """
    try:
        val = float(x)
    except (TypeError, ValueError, OverflowError):
        # Only the errors float() raises for unusable input are treated as
        # "no score"; anything else is a genuine bug and should surface.
        return None
    # Check the score is valid (0-9).
    if 0 <= val <= 9:
        return val
    return None
def parse_chillies_dataset(dataset):
    """Parse 'chillies/IELTS-writing-task-2-evaluation' into scorer samples.

    Expected score format in the evaluation text:
    ``**Task Achievement: [7]**`` (brackets optional, decimals allowed).

    Args:
        dataset: Iterable of mapping-like rows with ``prompt``, ``essay`` and
            ``evaluation`` fields.

    Returns:
        A list of ``{"prompt_text", "essay_text", "scores"}`` dicts. A row is
        kept only when the essay is longer than 50 characters and all four
        criterion scores were found and valid.
    """
    print("Đang xử lý dataset 'chillies'...")
    cleaned = []
    bad_examples = 0
    patterns = {
        "task_response": re.compile(
            r"\*\*Task Achievement:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"\*\*Coherence and Cohesion:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
        "lexical_resource": re.compile(
            r"\*\*Lexical Resource:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
        "grammatical_range": re.compile(
            r"\*\*Grammatical Range and Accuracy:\s*\[?(" + FLOAT_RE + r")\]?\*\*",
            re.I
        ),
    }
    for item in tqdm(dataset, desc="Parsing chillies"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            evaluation_text = item.get('evaluation', '')
            if not (prompt and essay and evaluation_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in patterns.items():
                match = pattern.search(evaluation_text)
                scores[key] = to_float_safe(match.group(1)) if match else None
            # Compare against None explicitly: a band of 0.0 is valid but
            # falsy, so a plain truthiness test would discard it.
            if any(value is None for value in scores.values()):
                bad_examples += 1
                continue
            cleaned.append({
                "prompt_text": prompt,
                "essay_text": essay,
                "scores": dict(scores)
            })
        except Exception:
            # Best effort: a malformed row is counted and skipped.
            bad_examples += 1
    print(f"  ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned
def parse_123harr_dataset(dataset):
    """Parse '123Harr/IELTS-WT2-LLaMa3-1k' into scorer samples.

    Everything is read from the row's ``formatted`` field: the first two
    user turns are taken as the prompt and the essay, and the scores are
    searched in the whole text.

    Args:
        dataset: Iterable of mapping-like rows with a ``formatted`` field.

    Returns:
        A list of ``{"prompt_text", "essay_text", "scores"}`` dicts. A row is
        kept only when both turns exist, the essay has at least 50 characters
        and all four criterion scores were found and valid.
    """
    print("Đang xử lý dataset '123Harr'...")
    cleaned = []
    bad_examples = 0
    prompt_essay_re = re.compile(
        r"<\|start_header_id\|>user<\|end_header_id\|>\n\n(.*?)<\|eot_id\|>",
        re.S
    )
    # NOTE(review): these patterns take the first whitespace-terminated number
    # that follows the criterion heading, however far away it is; confirm
    # against the data that this is always the band score.
    score_patterns = {
        "task_response": re.compile(
            r"(?:###|##|\*\*)?Task Achievement(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
        "coherence_cohesion": re.compile(
            r"(?:###|##|\*\*)?Coherence and Cohesion(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
        "lexical_resource": re.compile(
            r"(?:###|##|\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
        "grammatical_range": re.compile(
            r"(?:###|##|\*\*)?Grammatical Range and Accuracy(?:\*\*)?:[\s\S]*?(?:Suggested Band Score|Band Score)?[\s\S]*?" + FLOAT_RE + r"(?:\s|$)",
            re.I | re.M
        ),
    }
    for item in tqdm(dataset, desc="Parsing 123Harr"):
        try:
            formatted_text = item.get('formatted', '')
            if not formatted_text:
                bad_examples += 1
                continue
            matches = prompt_essay_re.findall(formatted_text)
            if len(matches) < 2:
                bad_examples += 1
                continue
            prompt = matches[0].strip()
            essay = matches[1].strip()
            if not prompt or not essay or len(essay) < 50:
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in score_patterns.items():
                match = pattern.search(formatted_text)
                # Each pattern has exactly one capturing group (the score).
                scores[key] = to_float_safe(match.group(1)) if match else None
            # Compare against None explicitly: a band of 0.0 is valid but
            # falsy, so a plain truthiness test would discard it.
            if any(value is None for value in scores.values()):
                bad_examples += 1
                continue
            cleaned.append({
                "prompt_text": prompt,
                "essay_text": essay,
                "scores": dict(scores)
            })
        except Exception:
            # Best effort: a malformed row is counted and skipped.
            bad_examples += 1
    print(f"  ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned
def parse_dpo_dataset(dataset):
    """Parse 'chillies/DPO_ielts_writing' into scorer samples.

    Scores are read from the row's ``chosen`` text. A strict set of patterns
    ("## <criterion>: ... Suggested Band Score: N") is tried first. When it
    does not yield all four scores, a looser set ("<criterion>: N") is tried
    from scratch.

    Args:
        dataset: Iterable of mapping-like rows with ``prompt``, ``essay`` and
            ``chosen`` fields.

    Returns:
        A list of ``{"prompt_text", "essay_text", "scores"}`` dicts. A row is
        kept only when the essay is longer than 50 characters and all four
        criterion scores were found and valid.
    """
    print("Đang xử lý dataset 'DPO'...")
    cleaned = []
    bad_examples = 0
    patterns_primary = {
        "task_response": re.compile(
            r"##\s*Task Achievement:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"##\s*Coherence and Cohesion:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
        "lexical_resource": re.compile(
            r"##\s*Lexical Resource(?:\s*\(Vocabulary\))?:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
        "grammatical_range": re.compile(
            r"##\s*Grammatical Range and Accuracy:[\s\S]*?Suggested Band Score:\s*" + FLOAT_RE,
            re.I
        ),
    }
    patterns_fallback = {
        "task_response": re.compile(r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*" + FLOAT_RE, re.I),
        "coherence_cohesion": re.compile(r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*" + FLOAT_RE, re.I),
        "lexical_resource": re.compile(r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*" + FLOAT_RE, re.I),
        "grammatical_range": re.compile(r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*" + FLOAT_RE, re.I),
    }

    def _extract(patterns, text):
        # Map each criterion to its parsed score, or None when absent/invalid.
        found = {}
        for key, pattern in patterns.items():
            match = pattern.search(text)
            found[key] = to_float_safe(match.group(1)) if match else None
        return found

    def _complete(found):
        # Explicit None check: a band of 0.0 is valid but falsy.
        return all(value is not None for value in found.values())

    for item in tqdm(dataset, desc="Parsing DPO"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            chosen_text = item.get('chosen', '')
            if not (prompt and essay and chosen_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = _extract(patterns_primary, chosen_text)
            if not _complete(scores):
                scores = _extract(patterns_fallback, chosen_text)
            if not _complete(scores):
                bad_examples += 1
                continue
            cleaned.append({
                "prompt_text": prompt,
                "essay_text": essay,
                "scores": dict(scores)
            })
        except Exception:
            # Best effort: a malformed row is counted and skipped.
            bad_examples += 1
    print(f"  ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned
def parse_hadeel_dataset(dataset):
    """Parse 'hadeelbkh/tokenized-IELTS-writing-task-2-evaluation' into samples.

    Scores are read from the row's ``evaluation`` text in the form
    "<criterion>: N", optionally wrapped in ``**`` and optionally preceded
    by a dash.

    Args:
        dataset: Iterable of mapping-like rows with ``prompt``, ``essay`` and
            ``evaluation`` fields.

    Returns:
        A list of ``{"prompt_text", "essay_text", "scores"}`` dicts. A row is
        kept only when the essay is longer than 50 characters and all four
        criterion scores were found and valid.
    """
    print("Đang xử lý dataset 'hadeel'...")
    cleaned = []
    bad_examples = 0
    patterns = {
        "task_response": re.compile(
            r"(?:\*\*)?task achievement(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"(?:\*\*)?coherence and cohesion(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
        "lexical_resource": re.compile(
            r"(?:\*\*)?lexical resource(?:\s*\(vocabulary\))?(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
        "grammatical_range": re.compile(
            r"(?:\*\*)?grammatical range and accuracy(?:\*\*)?:\s*-?\s*(" + FLOAT_RE + r")",
            re.I
        ),
    }
    for item in tqdm(dataset, desc="Parsing hadeel"):
        try:
            prompt = item.get('prompt', '').strip()
            essay = item.get('essay', '').strip()
            evaluation_text = item.get('evaluation', '')
            if not (prompt and essay and evaluation_text and len(essay) > 50):
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in patterns.items():
                match = pattern.search(evaluation_text)
                scores[key] = to_float_safe(match.group(1)) if match else None
            # Compare against None explicitly: a band of 0.0 is valid but
            # falsy, so a plain truthiness test would discard it.
            if any(value is None for value in scores.values()):
                bad_examples += 1
                continue
            cleaned.append({
                "prompt_text": prompt,
                "essay_text": essay,
                "scores": dict(scores)
            })
        except Exception:
            # Best effort: a malformed row is counted and skipped.
            bad_examples += 1
    print(f"  ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned
def parse_vietanh_dataset(dataset):
    """Parse 'vietanh0802/ielts_writing_training_data_prepared' into samples.

    Expected layout of the row's ``training_text``:
    ``<s>[INST] ... ### Prompt: ... ### Essay: ... [/INST] ...``

    Args:
        dataset: Iterable of mapping-like rows with a ``training_text`` field.

    Returns:
        A list of ``{"prompt_text", "essay_text", "scores"}`` dicts. A row is
        kept only when prompt and essay are found, the essay has at least 50
        characters and all four criterion scores were found and valid.
    """
    print("Đang xử lý dataset 'vietanh'...")
    cleaned = []
    bad_examples = 0
    prompt_re = re.compile(r"### Prompt:\s*(.*?)(?=### Essay:|$)", re.S | re.I)
    essay_re = re.compile(r"### Essay:\s*(.*?)(?=\[/INST\]|$)", re.S | re.I)
    score_patterns = {
        "task_response": re.compile(
            r"(?:\*\*)?Task Achievement(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
        "coherence_cohesion": re.compile(
            r"(?:\*\*)?Coherence and Cohesion(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
        "lexical_resource": re.compile(
            r"(?:\*\*)?Lexical Resource(?:\s*\(Vocabulary\))?(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
        "grammatical_range": re.compile(
            r"(?:\*\*)?Grammatical Range and Accuracy(?:\*\*)?:\s*\[?(" + FLOAT_RE + r")\]?",
            re.I
        ),
    }
    for item in tqdm(dataset, desc="Parsing vietanh"):
        try:
            training_text = item.get('training_text', '')
            if not training_text:
                bad_examples += 1
                continue
            prompt_match = prompt_re.search(training_text)
            if not prompt_match:
                bad_examples += 1
                continue
            prompt = prompt_match.group(1).strip()
            essay_match = essay_re.search(training_text)
            if not essay_match:
                bad_examples += 1
                continue
            essay = essay_match.group(1).strip()
            if not prompt or not essay or len(essay) < 50:
                bad_examples += 1
                continue
            scores = {}
            for key, pattern in score_patterns.items():
                match = pattern.search(training_text)
                scores[key] = to_float_safe(match.group(1)) if match else None
            # Compare against None explicitly: a band of 0.0 is valid but
            # falsy, so a plain truthiness test would discard it.
            if any(value is None for value in scores.values()):
                bad_examples += 1
                continue
            cleaned.append({
                "prompt_text": prompt,
                "essay_text": essay,
                "scores": dict(scores)
            })
        except Exception:
            # Best effort: a malformed row is counted and skipped.
            bad_examples += 1
    print(f"  ✓ kept {len(cleaned)} samples, skipped {bad_examples}")
    return cleaned
def main():
    """Download every source dataset, normalise it, and write one JSON file.

    Each dataset is loaded and parsed independently. A failure in one source
    is printed and does not stop the others. The merged samples are written
    to ``data/dataset_for_scorer.json``, and nothing is written when no
    sample survived.
    """
    print("Đang tải các dataset từ Hugging Face...\n")
    cache_dir = "./.cache/huggingface_datasets"

    # (label, Hugging Face repository id, parser), processed in this order.
    sources = (
        ("chillies", "chillies/IELTS-writing-task-2-evaluation", parse_chillies_dataset),
        ("123Harr", "123Harr/IELTS-WT2-LLaMa3-1k", parse_123harr_dataset),
        ("DPO", "chillies/DPO_ielts_writing", parse_dpo_dataset),
        ("hadeel", "hadeelbkh/tokenized-IELTS-writing-task-2-evaluation-DialoGPT-medium", parse_hadeel_dataset),
        ("vietanh", "vietanh0802/ielts_writing_training_data_prepared", parse_vietanh_dataset),
    )

    all_data = []
    for label, repo_id, parser in sources:
        try:
            raw_split = load_dataset(repo_id, split="train", cache_dir=cache_dir)
            all_data.append((label, parser(raw_split)))
        except Exception as e:
            print(f"✗ Lỗi tải {label}: {e}\n")

    # Per-source and total sample counts.
    separator = "=" * 60
    print("\n" + separator)
    print("--- TỔNG HỢP ---")
    print(separator)
    total = 0
    for name, data in all_data:
        count = len(data)
        total += count
        print(f"Dataset ({name:15}): {count:5d} mẫu")
    print(separator)
    print(f"Tổng cộng mẫu hợp lệ: {total}")
    print(separator)

    final_dataset = [sample for _, data in all_data for sample in data]
    if not final_dataset:
        print("✗ Lỗi: Không có dữ liệu nào được chuẩn hóa. Vui lòng kiểm tra lại script.")
        return

    output_dir = "data"
    output_path = os.path.join(output_dir, "dataset_for_scorer.json")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"✓ Đã tạo thư mục {output_dir}")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_dataset, f, ensure_ascii=False, indent=2)
    print(f"✓ Đã ghi {len(final_dataset)} mẫu vào file '{output_path}'.")
    print("\n✓ Hoàn tất! Bây giờ bạn có thể chạy 'src/train.py' trên Colab!")


if __name__ == "__main__":
    main()

src/clean_external_data_task1.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# File: src/clean_external_data_task1.py
+import json
+import os
+from datasets import load_dataset
+from tqdm import tqdm
+def to_float_safe(x):
+    """Chuyển đổi string sang float (xử lý cả '9' và '9.0')"""
+    try:
+        if x is None: return None
+        val = float(x)
+        if 0 <= val <= 9: return val
+        return None
+    except ValueError:
+        return None
+def parse_hai2131_dataset(dataset):
+    """
+    Parser chuẩn cho 'hai2131/IELTS-essays-task-1'.
+    Kết hợp: Prompt + Image Description + Essay
+    """
+    print("Đang xử lý dataset 'hai2131'...")
+    cleaned = []
+    bad_examples = 0
+    for item in tqdm(dataset, desc="Parsing hai2131"):
+        try:
+            # 1. Lấy thông tin đầu vào
+            prompt = item.get("subject") or ""
+            # QUAN TRỌNG: Lấy mô tả ảnh
+            img_desc = item.get("image_description") or ""
+            essay = item.get("content") or ""
+            # 2. Tạo input text kết hợp (Prompt + Context + Essay)
+            # Model sẽ đọc toàn bộ chuỗi này
+            full_prompt_text = f"PROMPT: {prompt}\n\nIMAGE CONTEXT: {img_desc}"
+            # 3. Lấy điểm số (Dataset này để điểm dạng string "9")
+            scores = {
+                "task_response": to_float_safe(item.get("task_response_score")),
+                "coherence_cohesion": to_float_safe(item.get("coherence_cohesion_score")),
+                "lexical_resource": to_float_safe(item.get("lexical_resource_score")),
+                "grammatical_range": to_float_safe(item.get("grammatical_range_accuracy_score"))
+            }
+            # 4. Kiểm tra hợp lệ
+            if essay and all(scores.values()):
+                cleaned.append({
+                    "prompt_text": full_prompt_text, # Input đặc biệt cho Task 1
+                    "essay_text": essay,
+                    "scores": scores
+                })
+            else:
+                bad_examples += 1
+        except Exception:
+            bad_examples += 1
+    print(f"hai2131: kept {len(cleaned)} samples, skipped {bad_examples}")
+    return cleaned
+def main():
+    print("🚀 BẮT ĐẦU XỬ LÝ DATASET TASK 1 (hai2131)")
+    # Cache dir để tránh tải lại nhiều lần
+    cache_dir = "./.cache/huggingface_datasets_task1"
+    try:
+        # Tải dataset (lần này sẽ nhanh vì nó nhỏ gọn)
+        dataset = load_dataset("hai2131/IELTS-essays-task-1", split="train", cache_dir=cache_dir)
+        # Xử lý
+        final_dataset = parse_hai2131_dataset(dataset)
+        if not final_dataset:
+            print("LỖI: Không có dữ liệu.")
+            return
+        # Lưu file
+        output_dir = "data"
+        if not os.path.exists(output_dir): os.makedirs(output_dir)
+        output_path = os.path.join(output_dir, "dataset_for_scorer_task1.json")
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(final_dataset, f, ensure_ascii=False, indent=2)
+        print(f"\n✅ HOÀN TẤT! Đã lưu {len(final_dataset)} mẫu vào '{output_path}'.")
+        print("💡 Lưu ý: 'prompt_text' bây giờ chứa cả Đề bài VÀ Mô tả ảnh.")
+    except Exception as e:
+        print(f"❌ Lỗi: {e}")
+if __name__ == "__main__":
+    main()

src/explore.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# File: src/explore.py (SỬA LỖI - CHỈ CHẠY hai2131)
+import json
+import os
+import sys
+from datasets import load_dataset
+from itertools import islice
+import traceback
+# Datasets to inspect, as {short name: Hugging Face Hub path}.
+DATASET_LIST = {
+    # "trantac": "TraTacXiMuoi/Ielts_writing_task1_academic", # Temporarily disabled due to network errors
+    "hai2131": "hai2131/IELTS-essays-task-1"
+}
+# Number of leading rows of each dataset that get printed.
+NUM_SAMPLES_TO_VIEW = 2
+# Dataset split that is loaded.
+SPLIT_NAME = "train"
+def safe_value_to_string(value):
+    """Chuyển đổi value thành string an toàn"""
+    if value is None:
+        return None
+    if isinstance(value, (str, int, float, bool)):
+        return value
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, list):
+        return value
+    # Đối với các object khác (ảnh, audio, etc)
+    return f"<{type(value).__name__}>"
+def explore_dataset(name: str, path: str, split: str, n: int):
+    """
+    Tải N mẫu đầu tiên của một dataset từ Hugging Face và in cấu trúc của nó.
+    """
+    print("="*80)
+    print(f"🕵️  Đang khám phá dataset: {name}")
+    print(f"    Path: {path}")
+    print(f"    Split: {split}")
+    print("="*80)
+    try:
+        # Tải N mẫu đầu tiên (không dùng streaming nữa,
+        # vì dataset hai2131 chỉ 8MB, tải luôn cho nhanh)
+        dataset = load_dataset(path, split=f"{split}[:{n}]")
+        print(f"\n✅ Tải thành công. Cấu trúc (Features):")
+        # In ra các cột và kiểu dữ liệu
+        print(dataset.features)
+        print(f"\n--- Đang xem {n} mẫu đầu tiên ---")
+        for i, item in enumerate(dataset):
+            print(f"\n--- Mẫu {i+1} ---")
+            printable_item = {}
+            for key, value in item.items():
+                printable_item[key] = safe_value_to_string(value)
+            print(json.dumps(printable_item, ensure_ascii=False, indent=2))
+    except Exception as e:
+        print(f"\n❌ LỖI khi tải hoặc đọc dataset '{name}':")
+        print(f"   {e}")
+        traceback.print_exc()
+def list_available_splits(path):
+    # Hàm này không cần thiết nữa nếu chúng ta tải trực tiếp
+    pass
+def main():
+    print("🚀 BẮT ĐẦU KHÁM PHÁ IELTS DATASETS (CHỈ hai2131)")
+    print("="*80)
+    for name, path in DATASET_LIST.items():
+        try:
+            explore_dataset(name, path, SPLIT_NAME, NUM_SAMPLES_TO_VIEW)
+        except KeyboardInterrupt:
+            print("\n⚠️  Bị gián đoạn bởi người dùng")
+            break
+        except Exception as e:
+            print(f"\n❌ Lỗi không mong muốn: {e}")
+            traceback.print_exc()
+        print("\n" + "-"*80)
+    print("\n" + "="*80)
+    print("✅ KHÁM PHÁ HOÀN TẤT")
+    print("="*80)
+if __name__ == "__main__":
+    main()

src/explore_speaking.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import whisper
+import librosa
+import numpy as np
+import os
+import warnings
+warnings.filterwarnings("ignore")
+def analyze_speaking_audio(audio_path):
+    print(f"🎤 Đang phân tích file: {audio_path}")
+    # --- 1. Load Model Whisper (ASR) ---
+    print("⏳ Đang tải model Whisper (có thể lâu lần đầu)...")
+    model = whisper.load_model("base")
+    # --- 2. Transcribe (Chuyển giọng thành chữ) ---
+    print("📝 Đang chuyển đổi giọng nói...")
+    result = model.transcribe(audio_path, fp16=False)
+    transcript = result["text"].strip()
+    print("\n" + "="*40)
+    print("TRANSCRIPT:")
+    print(f"'{transcript}'")
+    print("="*40 + "\n")
+    # --- 3. Phân tích Fluency (Trôi chảy) ---
+    # Dùng librosa để phân tích tín hiệu âm thanh
+    y, sr = librosa.load(audio_path)
+    duration = librosa.get_duration(y=y, sr=sr)
+    # Đếm số từ
+    word_count = len(transcript.split())
+    # Tính tốc độ nói (Words Per Minute - WPM)
+    wpm = (word_count / duration) * 60
+    # Phát hiện khoảng lặng (Pauses)
+    # top_db: ngưỡng decibel để coi là im lặng
+    non_silent_intervals = librosa.effects.split(y, top_db=20)
+    silent_duration = duration - sum([ (end-start)/sr for start, end in non_silent_intervals ])
+    pause_ratio = silent_duration / duration
+    print("ACOUSTIC METRICS:")
+    print(f"- Thời lượng (Duration): {duration:.2f} giây")
+    print(f"- Số từ (Word Count): {word_count}")
+    print(f"- Tốc độ (Speed): {wpm:.2f} WPM (Chuẩn IELTS 6.0+ thường > 100 WPM)")
+    print(f"- Thời gian im lặng: {silent_duration:.2f} giây ({pause_ratio*100:.1f}%)")
+    # --- 4. Đánh giá sơ bộ ---
+    fluency_score_est = 0
+    if wpm > 120: fluency_score_est = 7.0
+    elif wpm > 100: fluency_score_est = 6.0
+    elif wpm > 80: fluency_score_est = 5.0
+    else: fluency_score_est = 4.0
+    print(f"\n💡 Đánh giá sơ bộ Fluency: ~{fluency_score_est}")
+    return {
+        "transcript": transcript,
+        "wpm": wpm,
+        "pause_ratio": pause_ratio
+    }
+# Manual run: analyse the local sample recording when it exists, otherwise
+# print instructions for creating one.
+if __name__ == "__main__":
+    sample_audio = "data/test_speaking.m4a"
+    if os.path.exists(sample_audio):
+        analyze_speaking_audio(sample_audio)
+    else:
+        print(f"Không tìm thấy file '{sample_audio}'.")
+        print("Hãy tạo một file ghi âm tiếng Anh, lưu vào đó và chạy lại.")

src/pronunciation.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import torch
+import librosa
+import numpy as np
+import os
+import traceback
+import subprocess
+import shutil
+from transformers import (
+    Wav2Vec2ForCTC,
+    AutoTokenizer,
+    Wav2Vec2FeatureExtractor
+)
+print("Loading Pronunciation module...")
+# Hugging Face model id passed to every from_pretrained() call in this module.
+MODEL_ID = "facebook/wav2vec2-lv-60-espeak-cv-ft"
+# Module-level handles; they stay None when loading fails further down.
+model = None
+tokenizer = None
+feature_extractor = None
+def find_espeak_exe():
+    candidates = [
+        r"C:\Program Files\eSpeak NG\espeak-ng.exe",
+        r"C:\Program Files (x86)\eSpeak NG\espeak-ng.exe",
+        r"D:\Program Files\eSpeak NG\espeak-ng.exe"
+    ]
+    path_in_env = shutil.which("espeak-ng")
+    if path_in_env: return path_in_env
+    for path in candidates:
+        if os.path.exists(path):
+            return path
+    return None
+# Resolved once at import time; None means get_expected_ipa() returns "N/A".
+ESPEAK_PATH = find_espeak_exe()
+if ESPEAK_PATH:
+    print(f"Found eSpeak at: {ESPEAK_PATH}")
+else:
+    print("WARNING: eSpeak-ng not found. IPA generation will fail.")
+# Load the feature extractor, tokenizer and acoustic model at import time.
+# A failure is only printed: whatever has not been assigned yet stays None,
+# and grade_pronunciation_advanced() then skips the audio analysis.
+try:
+    print("Loading Feature Extractor...")
+    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
+    print("Loading Tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    print("Loading Acoustic Model...")
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+    print("Pronunciation module ready.")
+except Exception as e:
+    print(f"Failed to load AI model: {e}")
+def get_expected_ipa(text):
+    """Gọi subprocess espeak-ng.exe để lấy IPA chuẩn từ văn bản."""
+    if not ESPEAK_PATH:
+        return "N/A"
+    try:
+        cmd = [ESPEAK_PATH, "-v", "en-us", "-q", "--ipa", text]
+        startupinfo = None
+        if os.name == 'nt':
+            startupinfo = subprocess.STARTUPINFO()
+            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            encoding='utf-8',
+            startupinfo=startupinfo
+        )
+        if result.returncode == 0:
+            return result.stdout.strip().replace('\n', ' ')
+        else:
+            return "N/A"
+    except Exception as e:
+        print(f"Subprocess error: {e}")
+        return "N/A"
+def grade_pronunciation_advanced(audio_path, reference_text):
+    """
+    Trả về chuỗi IPA thực tế (Audio) và IPA chuẩn (Text).
+    """
+    actual_ipa = "N/A"
+    if model and tokenizer and feature_extractor:
+        try:
+            y, sr = librosa.load(audio_path, sr=16000)
+            input_values = feature_extractor(y, sampling_rate=16000, return_tensors="pt").input_values
+            with torch.no_grad():
+                logits = model(input_values).logits
+            predicted_ids = torch.argmax(logits, dim=-1)
+            actual_ipa = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        except Exception as e:
+            print(f"AI IPA Error: {e}")
+            actual_ipa = "Error"
+    expected_ipa = get_expected_ipa(reference_text)
+    return {
+        "actual_ipa": actual_ipa,
+        "expected_ipa": expected_ipa
+    }

src/train.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import json
+import numpy as np
+from datasets import Dataset, DatasetDict
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    TrainingArguments,
+    Trainer,
+    EvalPrediction
+)
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+from huggingface_hub import HfFolder, notebook_login
+# Base checkpoint used for both the tokenizer and the regression model.
+MODEL_NAME = "roberta-base"
+# Training data file; the script refuses to start when it is missing.
+DATASET_PATH = "/content/data/dataset_for_scorer.json"
+# Trainer output directory.
+MODEL_OUTPUT_DIR = "./ielts_grader_model"
+# Hub repository id passed to TrainingArguments(hub_model_id=...).
+HUB_MODEL_ID = "diminch/ielts-grader-ai"
+def load_and_prepare_data(dataset_path):
+    print(f"Đang tải dữ liệu từ {dataset_path}...")
+    with open(dataset_path, "r", encoding="utf-8") as f:
+        raw_data = json.load(f)
+    processed_data = []
+    for item in raw_data:
+        text = item['prompt_text'] + " [SEP] " + item['essay_text']
+        labels = [
+            float(item['scores']['task_response']),
+            float(item['scores']['coherence_cohesion']),
+            float(item['scores']['lexical_resource']),
+            float(item['scores']['grammatical_range'])
+        ]
+        processed_data.append({"text": text, "label": labels})
+    print(f"Tổng cộng {len(processed_data)} mẫu.")
+    dataset = Dataset.from_list(processed_data)
+    train_test_split = dataset.train_test_split(test_size=0.1)
+    dataset_dict = DatasetDict({
+        'train': train_test_split['train'],
+        'test': train_test_split['test']
+    })
+    return dataset_dict
+def tokenize_data(dataset_dict, tokenizer):
+    print("Đang tokenize dữ liệu...")
+    def tokenize_function(examples):
+        return tokenizer(
+            examples['text'],
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )
+    tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
+    return tokenized_datasets
+def compute_metrics(p: EvalPrediction):
+    preds = p.predictions
+    labels = p.label_ids
+    rmse_tr = np.sqrt(mean_squared_error(labels[:, 0], preds[:, 0]))
+    rmse_cc = np.sqrt(mean_squared_error(labels[:, 1], preds[:, 1]))
+    rmse_lr = np.sqrt(mean_squared_error(labels[:, 2], preds[:, 2]))
+    rmse_gra = np.sqrt(mean_squared_error(labels[:, 3], preds[:, 3]))
+    mae_tr = mean_absolute_error(labels[:, 0], preds[:, 0])
+    mae_cc = mean_absolute_error(labels[:, 1], preds[:, 1])
+    mae_lr = mean_absolute_error(labels[:, 2], preds[:, 2])
+    mae_gra = mean_absolute_error(labels[:, 3], preds[:, 3])
+    avg_rmse = np.mean([rmse_tr, rmse_cc, rmse_lr, rmse_gra])
+    return {
+        "avg_rmse": avg_rmse,
+        "rmse_task_response": rmse_tr,
+        "rmse_coherence_cohesion": rmse_cc,
+        "rmse_lexical_resource": rmse_lr,
+        "rmse_grammatical_range": rmse_gra,
+        "mae_task_response": mae_tr,
+        "mae_coherence_cohesion": mae_cc,
+        # ... có thể thêm các MAE khác
+    }
+def main():
+    """Fine-tune MODEL_NAME as a 4-output regressor and push it to the Hub.
+    Loads and tokenizes the dataset at DATASET_PATH, trains for 3 epochs with
+    evaluation and checkpointing every epoch, keeps the checkpoint with the
+    lowest 'avg_rmse', prints the evaluation results and pushes to the Hub.
+    """
+    # NOTE(review): this prompt asks for a token, but notebook_login() below is
+    # commented out, so no login happens here — confirm this is intended.
+    print("Vui lòng dán token Hugging Face (quyền 'write') của bạn:")
+    # (When running on Colab, an input box is shown)
+    # notebook_login()
+    # Or, when running locally, run 'huggingface-cli login' beforehand
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    dataset_dict = load_and_prepare_data(DATASET_PATH)
+    tokenized_datasets = tokenize_data(dataset_dict, tokenizer)
+    print("Đang tải mô hình nền tảng...")
+    # Four outputs, one per scoring criterion, trained as a regression problem
+    model = AutoModelForSequenceClassification.from_pretrained(
+        MODEL_NAME,
+        num_labels=4,
+        problem_type="regression"
+    )
+    training_args = TrainingArguments(
+        output_dir=MODEL_OUTPUT_DIR,
+        learning_rate=2e-5,
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        num_train_epochs=3,
+        weight_decay=0.01,
+        eval_strategy="epoch", # Changed evaluation_strategy to eval_strategy
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="avg_rmse",
+        greater_is_better=False,
+        push_to_hub=True,
+        hub_model_id=HUB_MODEL_ID,
+        hub_strategy="end",
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["test"],
+        compute_metrics=compute_metrics,
+        tokenizer=tokenizer,
+    )
+    print("--- BẮT ĐẦU HUẤN LUYỆN ---")
+    trainer.train()
+    print("--- HUẤN LUYỆN HOÀN TẤT ---")
+    print("--- ĐÁNH GIÁ TRÊN TẬP TEST ---")
+    eval_results = trainer.evaluate()
+    print(json.dumps(eval_results, indent=2))
+    print("Đang đẩy model tốt nhất lên Hugging Face Hub...")
+    trainer.push_to_hub()
+    print(f"Hoàn tất! Model của bạn đã ở trên Hub: https://huggingface.co/{HUB_MODEL_ID}")
+# Only start training when the dataset file is present.
+if __name__ == "__main__":
+    import os
+    if not os.path.exists(DATASET_PATH):
+        print(f"LỖI: Không tìm thấy file {DATASET_PATH}.")
+    else:
+        main()