VeuReu committed
Commit c705b37 · verified · 1 Parent(s): bc0dfdc

Upload 3 files

generacion_clusters_video_nuevo.py ADDED
@@ -0,0 +1,82 @@
+ from pathlib import Path
+ from sklearn.cluster import DBSCAN
+ import numpy as np
+ import json
+
+ class DataHub:
+     def __init__(self, video_analysis_json_path: str):
+         print("DataHub initializing with JSON:", video_analysis_json_path)
+         self.video = json.loads(Path(video_analysis_json_path).read_text(encoding='utf-8'))
+
+ class get_face_clusters:
+     def __init__(self, data: DataHub):
+         self.data = data
+
+     def get_clusters(self, eps: float, min_samples: int):
+         # "caras" is a list of {"embeddings": ..., "path": ...} records in the
+         # analysis JSON, so the default must be a list, not a dict
+         caras = self.data.video.get("caras", [])
+
+         embeddings_caras = []
+         for cara in caras:
+             embeddings_caras.append(cara['embeddings'])
+
+         X = np.array(embeddings_caras)
+
+         clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(X)
+         labels_caras = clustering.labels_
+         print(labels_caras)
+
+         return labels_caras
+
+ class get_voices_clusters:
+     def __init__(self, data: DataHub):
+         self.data = data
+
+     def get_clusters(self, eps: float, min_samples: int):
+         voices = self.data.video.get("voices", [])
+
+         embeddings_voices = []
+         for voice in voices:
+             embeddings_voices.append(voice['embeddings'])
+
+         X = np.array(embeddings_voices)
+
+         clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(X)
+         labels_voices = clustering.labels_
+         print(labels_voices)
+
+         return labels_voices
+
+ class get_scene_clusters:
+     def __init__(self, data: DataHub):
+         self.data = data
+
+     def get_clusters(self, eps: float, min_samples: int):
+         scenes = self.data.video.get("escenas", [])
+
+         embeddings_scenes = []
+         for scene in scenes:
+             embeddings_scenes.append(scene['embeddings'])
+
+         X = np.array(embeddings_scenes)
+
+         clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(X)
+         labels_scenes = clustering.labels_
+         print(labels_scenes)
+
+         return labels_scenes
+
+ video = "dif_catala_1_2"
+ analysis_path = f"/home/acasado/bsc/proyecto_bsc/{video}/analysis.json"
+
+ datahub = DataHub(analysis_path)
+
+ face_clusterer = get_face_clusters(datahub)
+ voice_clusterer = get_voices_clusters(datahub)
+ scene_clusterer = get_scene_clusters(datahub)
+
+ labels_caras = face_clusterer.get_clusters(eps=0.4, min_samples=2)
+ labels_voces = voice_clusterer.get_clusters(eps=1.3, min_samples=1)
+ labels_escenas = scene_clusterer.get_clusters(eps=1.3, min_samples=2)
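Since each record in the analysis JSON also stores a "path" alongside its embeddings, the label arrays above can be turned into concrete clusters by grouping paths per label. A minimal sketch, not part of the commit; group_by_label is a hypothetical helper:

from collections import defaultdict

def group_by_label(records, labels):
    # Map each DBSCAN label to the file paths assigned to it; -1 is noise.
    clusters = defaultdict(list)
    for record, label in zip(records, labels):
        clusters[int(label)].append(record["path"])
    return dict(clusters)

face_groups = group_by_label(datahub.video.get("caras", []), labels_caras)
for label, paths in sorted(face_groups.items()):
    print(f"cluster {label}: {len(paths)} faces")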
generacion_video_nuevo.py ADDED
@@ -0,0 +1,124 @@
+ import cv2
+ import os
+ import json
+ import logging
+ from pathlib import Path
+
+ from libs.vision_tools_salamandra_2 import FaceOfImageEmbedding_video_nuevo, ImageEmbedding, keyframe_conditional_extraction_ana
+ from libs.audio_tools_ana_2 import extract_audio_ffmpeg, diarize_audio, embed_voice_segments
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def faces_embedding_extraction(video_path: str, output_dir_caras: Path):
+     extract_every = 1.0  # sample one frame per second
+     embedder = FaceOfImageEmbedding_video_nuevo()
+     video = cv2.VideoCapture(video_path)
+     fps = int(video.get(cv2.CAP_PROP_FPS))
+     frame_interval = int(fps * extract_every)
+     frame_count = 0
+     saved_count = 0
+
+     embeddings_caras = []
+
+     while True:
+         ret, frame = video.read()
+         if not ret:
+             break
+
+         if frame_count % frame_interval == 0:
+             temp_path = output_dir_caras / "temp_frame.jpg"
+             cv2.imwrite(str(temp_path), frame)
+             resultados = embedder.encode_image(temp_path)
+
+             if resultados:
+                 # One embedding and one saved crop per detected face in the frame
+                 for i, r in enumerate(resultados):
+                     embedding = r['embedding']
+                     cara = r['face_crop']
+                     save_path = output_dir_caras / f"frame_{saved_count:04d}.jpg"
+                     cv2.imwrite(str(save_path), cv2.cvtColor(cara, cv2.COLOR_RGB2BGR))
+                     embeddings_caras.append({"embeddings": embedding, "path": str(save_path)})
+                     saved_count += 1
+
+             os.remove(temp_path)
+
+         frame_count += 1
+     video.release()
+
+     return embeddings_caras
+
+ def voices_embedding_extraction(video_path: str, output_dir_audio: Path):
+     sr = 16000
+     fmt = "wav"
+
+     wav_path = extract_audio_ffmpeg(
+         video_path,
+         output_dir_audio / f"{Path(video_path).stem}.{fmt}",
+         sr=sr
+     )
+     min_dur = 0.5
+     max_dur = 10.0
+
+     clip_paths, diar_segs = diarize_audio(
+         wav_path,
+         output_dir_audio,
+         "clips",
+         min_dur,
+         max_dur
+     )
+
+     embeddings_voices = []
+
+     embeddings = embed_voice_segments(clip_paths)
+
+     for i, emb in enumerate(embeddings):
+         embeddings_voices.append({"embeddings": emb, "path": str(clip_paths[i])})
+
+     return embeddings_voices
+
+ def scenes_embedding_extraction(video_path: str, output_dir_scenes: Path):
+     keyframes_final = keyframe_conditional_extraction_ana(
+         video_path=video_path,
+         output_dir=output_dir_scenes,
+         threshold=30.0,
+     )
+
+     image_embedder = ImageEmbedding()
+
+     embeddings_escenas = []
+
+     for keyframe in keyframes_final:
+         frame_path = keyframe["path"]
+         embedding = image_embedder.encode_image(frame_path)
+         embeddings_escenas.append({"embeddings": embedding, "path": str(frame_path)})
+
+     return embeddings_escenas
+
+ video_path = "/home/acasado/bsc/proyecto_bsc/base_datos_dif_catala_1/dif_catala_1.mp4"
+ video_concreto = "dif_catala_1_2"
+ output_dir_caras = Path(f"/home/acasado/bsc/proyecto_bsc/{video_concreto}/frames")
+ output_dir_caras.mkdir(parents=True, exist_ok=True)
+ output_dir_audio = Path(f"/home/acasado/bsc/proyecto_bsc/{video_concreto}/audio")
+ output_dir_audio.mkdir(parents=True, exist_ok=True)
+ output_dir_escenas = Path(f"/home/acasado/bsc/proyecto_bsc/{video_concreto}/escenas")
+ output_dir_escenas.mkdir(parents=True, exist_ok=True)
+
+ embeddings_caras = faces_embedding_extraction(video_path, output_dir_caras)
+ embeddings_voices = voices_embedding_extraction(video_path, output_dir_audio)
+ embeddings_escenas = scenes_embedding_extraction(video_path, output_dir_escenas)
+
+ embeddings_finales = {
+     "caras": embeddings_caras,
+     "voices": embeddings_voices,
+     "escenas": embeddings_escenas
+ }
+
+ analysis_path = f"/home/acasado/bsc/proyecto_bsc/{video_concreto}/analysis.json"
+
+ try:
+     with open(analysis_path, "w", encoding="utf-8") as f:
+         json.dump(embeddings_finales, f, indent=2, ensure_ascii=False)
+     logger.info("Analysis JSON saved: %s", analysis_path)
+ except Exception as e:
+     logger.warning("Failed to write analysis JSON: %s", e)
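One caveat in the save step: json.dump raises TypeError on numpy.ndarray, so if the embedders return arrays rather than plain lists the write above lands in the except branch. A minimal sketch, not part of the commit, assuming the embeddings may be numpy arrays; to_jsonable is a hypothetical helper:

import numpy as np

def to_jsonable(records):
    # Copy each record, converting ndarray embeddings to plain Python lists
    # so the dict survives json.dump unchanged otherwise.
    return [
        {**r, "embeddings": r["embeddings"].tolist()}
        if isinstance(r["embeddings"], np.ndarray) else r
        for r in records
    ]

embeddings_finales = {key: to_jsonable(recs) for key, recs in embeddings_finales.items()}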
vision_tools.py CHANGED
@@ -39,7 +39,7 @@ from scenedetect.detectors import ContentDetector
  
  import os, base64, requests, subprocess, contextlib, time
  
- from transformers import AutoProcessor, LlavaOneForConditionalGeneration
+ from transformers import AutoProcessor, LlavaForConditionalGeneration
  from PIL import Image
  
  from libs.audio_tools_ana_2 import process_audio_for_video
@@ -263,7 +263,7 @@ def describe_montage_sequence(
      processor = AutoProcessor.from_pretrained(path_model)
      device = "cuda" if torch.cuda.is_available() else "cpu"
      dtype = torch.float16 if device == "cuda" else torch.float32
-     model = LlavaOneForConditionalGeneration.from_pretrained(
+     model = LlavaForConditionalGeneration.from_pretrained(
          path_model,
          torch_dtype=dtype,
          low_cpu_mem_usage=True
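The rename matters because LlavaOneForConditionalGeneration is not a class in transformers, so the old import fails at module load; LlavaForConditionalGeneration is the actual class for LLaVA checkpoints, while LLaVA-OneVision checkpoints use LlavaOnevisionForConditionalGeneration instead. A minimal sketch, not part of the commit and assuming a recent transformers release, that picks the class from the checkpoint's config:

from transformers import (
    AutoConfig,
    LlavaForConditionalGeneration,
    LlavaOnevisionForConditionalGeneration,
)

# Choose the model class by the checkpoint's declared model_type.
cfg = AutoConfig.from_pretrained(path_model)
model_cls = (LlavaOnevisionForConditionalGeneration
             if cfg.model_type == "llava_onevision"
             else LlavaForConditionalGeneration)
model = model_cls.from_pretrained(path_model, torch_dtype=dtype, low_cpu_mem_usage=True)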