File size: 3,316 Bytes
5b30a24
 
 
 
 
2bc52c8
 
5b30a24
 
 
 
 
 
2bc52c8
5b30a24
 
 
 
 
 
 
 
2bc52c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b30a24
2bc52c8
 
5b30a24
 
2bc52c8
5b30a24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import cv2
import torch
from pipelines.pipeline import InferencePipeline
import time
from huggingface_hub import hf_hub_download
import os


class ChaplinGradio:
    """Per-frame visual speech recognition callback for a streaming UI.

    Constructing an instance downloads the model files from the Hugging Face
    Hub, builds an ``InferencePipeline`` from the LRS3 config and initialises
    the frame-throttling state. ``process_frame`` is then called once per
    incoming frame.
    """

    # (repo_id, local_dir) for every model repository that must be present
    # on disk before the pipeline is built.
    _MODEL_SOURCES = (
        ("willwade/LRS3_V_WER19.1", "benchmarks/LRS3/models/LRS3_V_WER19.1"),
        ("willwade/lm_en_subword", "benchmarks/LRS3/language_models/lm_en_subword"),
    )
    # Files fetched from each repository above.
    _MODEL_FILENAMES = ("model.pth", "model.json")

    def __init__(self):
        """Download and load the models, then set up frame throttling."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vsr_model = None
        self.download_models()
        self.load_models()

        # Video params
        self.fps = 16
        self.frame_interval = 1 / self.fps
        self.frame_compression = 25  # JPEG quality passed to cv2.imencode
        self.last_frame_time = time.time()
        # Most recent value returned for an accepted frame; re-sent for
        # throttled frames so the displayed text is not blanked.
        self.last_output = None

    def download_models(self):
        """Download required model files from HuggingFace"""
        # Create directories if they don't exist
        for _, local_dir in self._MODEL_SOURCES:
            os.makedirs(local_dir, exist_ok=True)

        # Download the VSR model files, then the language model files
        for repo_id, local_dir in self._MODEL_SOURCES:
            for filename in self._MODEL_FILENAMES:
                hf_hub_download(
                    repo_id=repo_id,
                    filename=filename,
                    local_dir=local_dir,
                )

        print("Models downloaded successfully!")

    def load_models(self):
        """Load models using the InferencePipeline with LRS3 config"""
        config_path = "configs/LRS3_V_WER19.1.ini"

        self.vsr_model = InferencePipeline(
            config_path,
            device=self.device,
            detector="mediapipe",
            face_track=True,
        )
        print("Model loaded successfully!")

    def process_frame(self, frame):
        """Process a single frame with rate limiting and compression.

        Args:
            frame: Image accepted by ``cv2.imencode``, or ``None`` when no
                frame is available.

        Returns:
            The pipeline's result for an accepted frame, or the string
            ``"No video input detected"`` when ``frame`` is ``None``. Frames
            arriving less than ``frame_interval`` seconds after the last
            accepted one are skipped and the previously returned value is
            returned again (``None`` until a first frame has been accepted).
        """
        current_time = time.time()

        # Rate limit: returning None here would overwrite the displayed
        # prediction with an empty value on every skipped frame, so repeat
        # the last output instead.
        if current_time - self.last_frame_time < self.frame_interval:
            return self.last_output

        self.last_frame_time = current_time

        if frame is None:
            self.last_output = "No video input detected"
            return self.last_output

        # Compress frame: JPEG round trip, decoded back as single-channel.
        # NOTE(review): cv2 assumes BGR channel order when converting to
        # grayscale; confirm the channel order of the incoming frame.
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.frame_compression]
        encoded_ok, buffer = cv2.imencode('.jpg', frame, encode_param)
        if not encoded_ok:
            # Nothing usable to decode; skip this frame and keep the display.
            return self.last_output
        compressed_frame = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)

        # Run inference using the VSR model
        self.last_output = self.vsr_model.process_frame(compressed_frame)

        return self.last_output


# Build the recogniser once at import time; constructing it downloads and
# loads the models.
chaplin = ChaplinGradio()

# UI components: a streaming webcam image in, a text box out.
webcam_input = gr.Image(source="webcam", streaming=True)
prediction_output = gr.Textbox(label="Predicted Text")

# live=True re-runs the callback whenever the input changes.
iface = gr.Interface(
    title="Chaplin - Live Visual Speech Recognition",
    description="Use your webcam to perform real-time visual speech recognition.",
    fn=chaplin.process_frame,
    inputs=webcam_input,
    outputs=prediction_output,
    live=True,
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    iface.launch()