caixiaoshun committed
Commit 0015b84 · verified · 1 Parent(s): 04dcef6

Create extract_emotions_from_metadata.py

Files changed (1)
  1. extract_emotions_from_metadata.py +351 -0
extract_emotions_from_metadata.py ADDED
@@ -0,0 +1,351 @@
import os
import cv2
import json
import numpy as np
import torch
import argparse
from facenet_pytorch import MTCNN
from hsemotion.facial_emotions import HSEmotionRecognizer


def detect_largest_face(frame, mtcnn):
    """Detect faces in a frame using MTCNN and return only the largest face"""
    bounding_boxes, probs = mtcnn.detect(frame, landmarks=False)
    if bounding_boxes is not None and probs is not None:
        # Keep only detections above the confidence threshold
        valid_indices = probs > 0.9
        bounding_boxes = bounding_boxes[valid_indices]

        if len(bounding_boxes) > 0:
            # Calculate areas of all detected faces
            areas = []
            for bbox in bounding_boxes:
                x1, y1, x2, y2 = bbox[0:4]
                area = (x2 - x1) * (y2 - y1)
                areas.append(area)

            # Find the index of the largest face
            largest_index = np.argmax(areas)

            # Return only the largest face (as a 1-element array)
            return bounding_boxes[largest_index:largest_index + 1]

    return None


def softmax(x):
    """Compute softmax values for x"""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)


def process_image_for_emotions(image_path, fer, mtcnn):
    """Process a single image file and extract emotions"""
    try:
        # Load image using OpenCV for consistency with video processing
        frame_bgr = cv2.imread(image_path)
        if frame_bgr is None:
            print(f"Error: Could not load image {image_path}")
            return None

        # Convert BGR to RGB
        frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

        # Detect the largest face
        bounding_boxes = detect_largest_face(frame, mtcnn)

        if bounding_boxes is not None and len(bounding_boxes) > 0:
            # Use the largest detected face
            bbox = bounding_boxes[0]
            box = bbox.astype(int)
            x1, y1, x2, y2 = box[0:4]

            # Ensure coordinates are within image bounds
            x1 = max(0, x1)
            y1 = max(0, y1)
            x2 = min(frame.shape[1], x2)
            y2 = min(frame.shape[0], y2)

            # Extract face region
            if x2 > x1 and y2 > y1:
                face_img = frame[y1:y2, x1:x2, :]

                # Request raw logits and normalize once with softmax below
                # (logits=False already applies softmax internally, so
                # normalizing again would distort the probabilities)
                emotion, scores = fer.predict_emotions(face_img, logits=True)

                # For MTL models, use only the emotion scores; the last two
                # entries are valence/arousal
                if fer.is_mtl:
                    scores = scores[:-2]

                # Apply softmax so the probabilities sum to 1
                probabilities = softmax(scores)

                # Convert to list for processing
                prob_list = probabilities.tolist() if isinstance(probabilities, np.ndarray) else list(probabilities)

                # Create emotion probabilities dictionary using idx_to_class from fer
                emotion_probabilities = {}
                for i in range(len(fer.idx_to_class)):
                    label = fer.idx_to_class[i]
                    emotion_probabilities[label] = round(prob_list[i], 4) if i < len(prob_list) else 0.0

                # Get the index of the dominant emotion
                dominant_emotion_index = -1
                for idx, label in fer.idx_to_class.items():
                    if label == emotion:
                        dominant_emotion_index = idx
                        break

                return {
                    "emotion_probabilities": emotion_probabilities,
                    "dominant_emotion": emotion,
                    "dominant_emotion_index": dominant_emotion_index
                }

        # If no usable face was detected
        print(f"No faces detected in image {image_path}")
        return {
            "emotion_probabilities": None,
            "dominant_emotion": None,
            "dominant_emotion_index": -1
        }
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None


def process_video_for_emotions(video_path, fer, mtcnn, skip_frames=5):
    """Process a single video file and extract average emotions"""
    # Open video file
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return None

    # Total frame count, used for progress reporting
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Store emotion data
    emotion_data = {
        "emotion_probabilities": None,
        "dominant_emotion": None,
        "dominant_emotion_index": None
    }

    # For averaging emotions across frames
    all_scores = []
    emotion_counts = {}

    frame_idx = 0

    while True:
        ret, frame_bgr = cap.read()
        if not ret:
            break

        # Sample every skip_frames-th frame
        if frame_idx % skip_frames != 0:
            frame_idx += 1
            continue

        # Convert BGR to RGB
        frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

        # Detect the largest face
        bounding_boxes = detect_largest_face(frame, mtcnn)

        if bounding_boxes is not None:
            # Use the largest detected face
            bbox = bounding_boxes[0]
            box = bbox.astype(int)
            x1, y1, x2, y2 = box[0:4]

            # Ensure coordinates are within frame bounds
            x1 = max(0, x1)
            y1 = max(0, y1)
            x2 = min(frame.shape[1], x2)
            y2 = min(frame.shape[0], y2)

            # Extract face region
            if x2 > x1 and y2 > y1:
                face_img = frame[y1:y2, x1:x2, :]

                # Request raw logits; they are averaged across frames and
                # normalized once with softmax after the loop
                try:
                    emotion, scores = fer.predict_emotions(face_img, logits=True)

                    # For MTL models, use only the emotion scores; the last
                    # two entries are valence/arousal
                    if fer.is_mtl:
                        scores = scores[:-2]

                    # Convert scores to a plain list
                    scores_list = scores.tolist() if isinstance(scores, np.ndarray) else list(scores)

                    # Store scores for averaging
                    all_scores.append(scores_list)

                    # Count per-frame dominant emotions
                    if emotion in emotion_counts:
                        emotion_counts[emotion] += 1
                    else:
                        emotion_counts[emotion] = 1

                except Exception as e:
                    print(f"Error predicting emotions for frame {frame_idx}: {e}")

        frame_idx += 1

        # Print progress
        if frame_idx % 30 == 0:
            print(f"Processed {frame_idx}/{frame_count} frames")

    cap.release()

    # Calculate average emotions if we have data
    if all_scores:
        # Average the raw scores across sampled frames
        avg_scores = np.mean(np.array(all_scores), axis=0)

        # Apply softmax so the probabilities sum to 1
        probabilities = softmax(avg_scores)

        # Convert to list for JSON serialization
        prob_list = probabilities.tolist() if isinstance(probabilities, np.ndarray) else list(probabilities)

        # The dominant emotion is the most frequent per-frame prediction
        dominant_emotion = max(emotion_counts, key=emotion_counts.get)

        # Create emotion probabilities dictionary using idx_to_class from fer
        emotion_probabilities = {}
        for i in range(len(fer.idx_to_class)):
            label = fer.idx_to_class[i]
            emotion_probabilities[label] = round(prob_list[i], 4) if i < len(prob_list) else 0.0

        # Get the index of the dominant emotion
        dominant_emotion_index = -1
        for idx, label in fer.idx_to_class.items():
            if label == dominant_emotion:
                dominant_emotion_index = idx
                break

        # Store in emotion_data
        emotion_data["emotion_probabilities"] = emotion_probabilities
        emotion_data["dominant_emotion"] = dominant_emotion
        emotion_data["dominant_emotion_index"] = dominant_emotion_index
        emotion_data["emotion_distribution"] = emotion_counts

        return emotion_data
    else:
        print("No faces detected in the video")
        return None


def process_metadata_file(metadata_path, output_path, model_name='enet_b0_8_best_afew', root_path=None, skip_frames=5):
    """Process files listed in metadata.json and add emotion information"""
    # Check if CUDA is available
    use_cuda = torch.cuda.is_available()
    device = 'cuda' if use_cuda else 'cpu'
    print(f"Using device: {device}")

    # Initialize MTCNN for face detection
    mtcnn = MTCNN(keep_all=False, post_process=False, min_face_size=40, device=device)

    # Initialize HSEmotionRecognizer
    fer = HSEmotionRecognizer(model_name=model_name, device=device)

    # Read metadata file
    with open(metadata_path, 'r') as f:
        metadata_list = json.load(f)

    # Process each file in metadata and add emotion information
    for metadata in metadata_list:
        file_path = metadata.get("file_path")
        file_type = metadata.get("type", "unknown")

        # If root_path is provided, join it with the file_path
        if root_path and file_path:
            file_path = os.path.join(root_path, file_path)

        if not file_path or not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            # Add empty emotion data
            metadata["emotion"] = {
                "emotion_probabilities": None,
                "dominant_emotion": None,
                "dominant_emotion_index": -1
            }
            continue

        print(f"Processing {file_type} file: {file_path}")

        try:
            if file_type == "video":
                result = process_video_for_emotions(file_path, fer, mtcnn, skip_frames)
            elif file_type == "image":
                result = process_image_for_emotions(file_path, fer, mtcnn)
            else:
                print(f"Unknown file type: {file_type}")
                # Add empty emotion data
                metadata["emotion"] = {
                    "emotion_probabilities": None,
                    "dominant_emotion": None,
                    "dominant_emotion_index": -1
                }
                continue

            # Add emotion data to metadata
            if result:
                metadata["emotion"] = result
            else:
                # Add empty emotion data if processing failed
                metadata["emotion"] = {
                    "emotion_probabilities": None,
                    "dominant_emotion": None,
                    "dominant_emotion_index": -1
                }
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            # Add empty emotion data if processing failed
            metadata["emotion"] = {
                "emotion_probabilities": None,
                "dominant_emotion": None,
                "dominant_emotion_index": -1
            }

    # Save updated metadata to output file
    with open(output_path, 'w') as f:
        json.dump(metadata_list, f, indent=2)

    print(f"Updated metadata with emotion data saved to {output_path}")
    print(f"Processed {len(metadata_list)} files")
    return output_path


def main():
    # Create argument parser
    parser = argparse.ArgumentParser(description='Extract emotions from files listed in metadata.json')
    parser.add_argument('--input', '-i', type=str, default='metadata.json',
                        help='Input metadata JSON file (default: metadata.json)')
    parser.add_argument('--output', '-o', type=str, default='metadata_with_emotions.json',
                        help='Output metadata JSON file with emotion data (default: metadata_with_emotions.json)')
    parser.add_argument('--model', '-m', type=str, default='enet_b0_8_best_afew',
                        choices=['enet_b0_8_best_afew', 'enet_b0_8_best_vgaf', 'enet_b0_8_va_mtl', 'enet_b2_8', 'enet_b2_7'],
                        help='Model to use for emotion recognition')
    parser.add_argument('--root', '-r', type=str, default=None,
                        help='Root path to prepend to file paths (default: None)')
    parser.add_argument('--skip-frames', '-s', type=int, default=5,
                        help='Number of frames to skip between emotion detections (default: 5)')

    # Parse arguments
    args = parser.parse_args()

    # Process files (the return value is the output path; unused here)
    process_metadata_file(args.input, args.output, args.model, args.root, args.skip_frames)


if __name__ == "__main__":
    main()
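
For reference, a minimal usage sketch. It assumes metadata.json is a JSON list of entries carrying "file_path" and "type" keys, which is the shape process_metadata_file reads; the file names below are hypothetical.

    # metadata.json (hypothetical input):
    # [
    #   {"file_path": "clips/0001.mp4", "type": "video"},
    #   {"file_path": "frames/0001.jpg", "type": "image"}
    # ]
    python extract_emotions_from_metadata.py \
        --input metadata.json \
        --output metadata_with_emotions.json \
        --root /path/to/data \
        --skip-frames 5

Each entry in the output JSON gains an "emotion" field with per-class probabilities, the dominant emotion label, and its class index (None and -1 when no face was found); video entries additionally get an "emotion_distribution" of per-frame dominant-emotion counts.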