# Hugging Face Space: Chaplin — live visual speech recognition.
# NOTE(review): the Space status page showed "Runtime error" at capture time;
# see the gr.Image(...) call below for the likely cause (Gradio 4.x API change).
# Standard library
import os
import time

# Third-party
import cv2
import gradio as gr
import torch
from huggingface_hub import hf_hub_download

# Local
from pipelines.pipeline import InferencePipeline
class ChaplinGradio:
    """Live visual speech recognition (lipreading) backend for the Gradio app.

    On construction this downloads the LRS3 VSR model and subword language
    model from the HuggingFace Hub, then loads them through the project's
    ``InferencePipeline``.  ``process_frame`` is the streaming callback:
    it rate-limits incoming webcam frames, JPEG-compresses them to reduce
    bandwidth/compute, and runs lipreading inference on the result.
    """

    def __init__(self):
        # Prefer GPU when available; the pipeline is device-agnostic.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vsr_model = None
        self.download_models()
        self.load_models()

        # Video params
        self.fps = 16
        self.frame_interval = 1 / self.fps  # minimum seconds between processed frames
        self.frame_compression = 25  # JPEG quality (0-100); low value = heavy compression
        self.last_frame_time = time.time()

    def download_models(self):
        """Download required model files from HuggingFace."""
        vsr_dir = "benchmarks/LRS3/models/LRS3_V_WER19.1"
        lm_dir = "benchmarks/LRS3/language_models/lm_en_subword"

        # Each repo ships a weights file and a config file; fetch both into
        # the directory layout the LRS3 config expects.
        for repo_id, local_dir in (
            ("willwade/LRS3_V_WER19.1", vsr_dir),
            ("willwade/lm_en_subword", lm_dir),
        ):
            os.makedirs(local_dir, exist_ok=True)
            for filename in ("model.pth", "model.json"):
                hf_hub_download(repo_id=repo_id,
                                filename=filename,
                                local_dir=local_dir)
        print("Models downloaded successfully!")

    def load_models(self):
        """Load models using the InferencePipeline with LRS3 config"""
        config_path = "configs/LRS3_V_WER19.1.ini"
        self.vsr_model = InferencePipeline(
            config_path,
            device=self.device,
            detector="mediapipe",
            face_track=True
        )
        print("Model loaded successfully!")

    def process_frame(self, frame):
        """Process a single frame with rate limiting and compression.

        Returns the predicted text, ``None`` when the frame is dropped by the
        rate limiter, or a short status string on missing/invalid input.
        """
        # BUGFIX: check for missing input BEFORE touching the rate limiter;
        # previously an empty frame still advanced last_frame_time and could
        # starve real frames of their processing slot.
        if frame is None:
            return "No video input detected"

        current_time = time.time()
        if current_time - self.last_frame_time < self.frame_interval:
            return None  # dropped: arrived faster than self.fps allows
        self.last_frame_time = current_time

        # Compress frame (lossy JPEG round-trip, decoded to grayscale).
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.frame_compression]
        success, buffer = cv2.imencode('.jpg', frame, encode_param)
        if not success:
            # BUGFIX: previously the success flag was ignored and a failed
            # encode would hand an invalid buffer to imdecode.
            return "Frame encoding failed"
        compressed_frame = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)

        # Run inference using the VSR model
        predicted_text = self.vsr_model.process_frame(compressed_frame)
        return predicted_text
# Create Gradio interface (module level so `gradio app.py` / Spaces can find it)
chaplin = ChaplinGradio()

iface = gr.Interface(
    fn=chaplin.process_frame,
    # BUGFIX: Gradio 4.x selects the webcam via the `sources` list; the old
    # `source="webcam"` keyword was removed and raises TypeError at startup —
    # the most likely cause of the Space's "Runtime error".
    inputs=gr.Image(sources=["webcam"], streaming=True),
    outputs=gr.Textbox(label="Predicted Text"),
    title="Chaplin - Live Visual Speech Recognition",
    description="Use your webcam to perform real-time visual speech recognition.",
    live=True,
)

if __name__ == "__main__":
    iface.launch()