import torch
import librosa
import numpy as np
import os
import traceback
import subprocess
import shutil

from transformers import (
    Wav2Vec2ForCTC, 
    AutoTokenizer, 
    Wav2Vec2FeatureExtractor
)

print("Loading Pronunciation module...")

MODEL_ID = "facebook/wav2vec2-lv-60-espeak-cv-ft"
model = None
tokenizer = None
feature_extractor = None

def find_espeak_exe():
    candidates = [
        r"C:\Program Files\eSpeak NG\espeak-ng.exe",
        r"C:\Program Files (x86)\eSpeak NG\espeak-ng.exe",
        r"D:\Program Files\eSpeak NG\espeak-ng.exe"
    ]
    path_in_env = shutil.which("espeak-ng")
    if path_in_env:
        return path_in_env
    
    for path in candidates:
        if os.path.exists(path):
            return path
    return None

ESPEAK_PATH = find_espeak_exe()
if ESPEAK_PATH:
    print(f"Found eSpeak at: {ESPEAK_PATH}")
else:
    print("WARNING: eSpeak-ng not found. IPA generation will fail.")

try:
    print("Loading Feature Extractor...")
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
    
    print("Loading Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    
    print("Loading Acoustic Model...")
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
    
    print("Pronunciation module ready.")
except Exception as e:
    print(f"Failed to load AI model: {e}")

def get_expected_ipa(text):
    """Gọi subprocess espeak-ng.exe để lấy IPA chuẩn từ văn bản."""
    if not ESPEAK_PATH:
        return "N/A"
    
    try:
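        # espeak-ng flags: -v en-us selects the American English voice, -q suppresses
        # audio playback, and --ipa prints the phoneme transcription in IPA.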
        cmd = [ESPEAK_PATH, "-v", "en-us", "-q", "--ipa", text]
        
        startupinfo = None
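        # On Windows, hide the console window that would otherwise flash up
        # every time espeak-ng is launched.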
        if os.name == 'nt':
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            
        result = subprocess.run(
            cmd, 
            capture_output=True, 
            text=True, 
            encoding='utf-8',
            startupinfo=startupinfo
        )
        
        if result.returncode == 0:
            return result.stdout.strip().replace('\n', ' ')
        else:
            return "N/A"
            
    except Exception as e:
        print(f"Subprocess error: {e}")
        return "N/A"

def grade_pronunciation_advanced(audio_path, reference_text):
    """
    Return the actual IPA decoded from the audio and the reference IPA generated from the text.
    """
    actual_ipa = "N/A"
    if model and tokenizer and feature_extractor:
        try:
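            # Load the recording and resample to 16 kHz, the rate the wav2vec2 model expects.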
            y, sr = librosa.load(audio_path, sr=16000)
            
            input_values = feature_extractor(y, sampling_rate=16000, return_tensors="pt").input_values
            with torch.no_grad():
                logits = model(input_values).logits
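            # Greedy CTC decoding: argmax picks the most likely phoneme token per frame;
            # batch_decode collapses repeats and blanks into the final IPA string.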
            
            predicted_ids = torch.argmax(logits, dim=-1)
            actual_ipa = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            
        except Exception as e:
            print(f"AI IPA Error: {e}")
            actual_ipa = "Error"

    expected_ipa = get_expected_ipa(reference_text)

    return {
        "actual_ipa": actual_ipa,
        "expected_ipa": expected_ipa
    }
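

if __name__ == "__main__":
    # Minimal usage sketch. "sample.wav" and the reference sentence below are
    # hypothetical placeholders, not assets that ship with this module.
    demo = grade_pronunciation_advanced("sample.wav", "hello world")
    print("Expected IPA:", demo["expected_ipa"])
    print("Actual IPA:  ", demo["actual_ipa"])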