---
# ===========================
# Veureu Engine – config.yaml
# ===========================

# Core engine settings.
engine:
  output_root: "results"   # base directory for generated outputs

api:
  # NOTE(review): "*" allows any origin — tighten for production deployments.
  cors_allow_origins: ["*"]
  sync_timeout_seconds: 3600   # 1 hour

# Vector store (Chroma) used for face / voice indexing.
database:
  enabled: true
  persist_directory: "chroma_db"   # ChromaDB persistence directory
  enable_face_recognition: true
  enable_voice_recognition: true
  face_collection: "index_faces"
  voice_collection: "index_voices"

# Async job queue (disabled: requests are handled synchronously).
jobs:
  enabled: false    # if you enable the async queue, set this to true and add JobManager in main_api.py
  max_workers: 1
  result_ttl_seconds: 86400   # 24 hours

video_processing:
  keyframes:
    conditional_extraction:
      enable: true
      min_scene_length_seconds: 1.5
      difference_threshold: 28.0

  frames_per_second:
    enable: true
    fps: 1.0   # sampling frequency of analysis frames

  ocr:
    engine: "tesseract"   # "tesseract" | "easyocr"
    language_hint: "spa"
    tesseract_cmd: ""     # if tesseract is not on PATH, set its full path here

  faces:
    detector_model: "mtcnn"        # adjust to match your vision_tools
    embedding_model: "Facenet512"  # used by FaceOfImageEmbedding
    min_face_size: 32
    detection_confidence: 0.85

  ocr_clustering:
    method: "sequential_similarity"
    sentence_transformer: "all-MiniLM-L6-v2"
    similarity_threshold: 0.60     # higher ⇒ fewer clusters

audio_processing:
  sample_rate: 16000
  format: "wav"

  diarization:
    enabled: true
    force_silence_only: true       # Use silence-based segmentation (no pyannote)
    min_segment_duration: 0.5      # in seconds (short clips)
    max_segment_duration: 10.0
    silence_thresh: -40            # dBFS threshold for silence detection
    min_silence_len: 500           # milliseconds

  enable_voice_embeddings: true     # SpeechBrain ECAPA
  speaker_embedding:
    enabled: true

  # Speaker identification (clustering + Chroma)
  voice_processing:
    speaker_identification:
      enabled: true
      find_optimal_clusters: true
      min_speakers: 1
      max_speakers: 5
      distance_threshold: 0.40

asr:
  # Controls transcription of the full audio in addition to the clips (useful for global context)
  enable_full_transcription: true

background_descriptor:
  montage:
    enable: true
    max_frames: 12
    grid: "auto"

  description:
    model: "salamandra-vision"  # or "gpt-4o-mini"
    max_tokens: 512
    temperature: 0.2

identity:
  timeline_mapping:
    # Presumably refers to the video_processing.frames_per_second stanza — confirm in consumer.
    per_second_frames_source: "frames_per_second"
    attach_faces_to:
      - "keyframes"
      - "audio_segments"
    out_key: "persona"

narration:
  model: "salamandra-instruct"   # "salamandra-instruct" | "gpt-4o-mini"
  une_guidelines_path: "UNE_153010.txt"
  timing:
    max_ad_duration_ratio: 0.60
    min_gap_seconds: 1.20
    min_ad_seconds: 0.80
  llm:
    max_tokens: 1024
    temperature: 0.2

subtitles:
  max_chars_per_line: 42
  # NOTE(review): 10 lines per cue is unusually high for subtitles (2–3 is typical) — confirm intent.
  max_lines_per_cue: 10
  speaker_display: "brackets"  # "brackets" | "prefix" | "none"

models:
  # task alias → model
  instruct: "salamandra-instruct"
  vision: "salamandra-vision"
  tools: "salamandra-tools"
  asr: "whisper-catalan"  # points to the veureu/asr Space (Aina: faster-whisper-large-v3-ca-3catparla)

  routing:
    # Models listed here are routed to the remote Spaces defined below.
    use_remote_for:
      - "salamandra-instruct"
      - "salamandra-vision"
      - "salamandra-tools"
      - "whisper-catalan"

# Hugging Face Spaces that serve the remote model endpoints.
remote_spaces:
  user: "veureu"

  endpoints:
    salamandra-instruct:
      space: "schat"
      base_url: "https://veureu-schat.hf.space"
      client: "gradio"
      predict_route: "/predict"

    salamandra-vision:
      space: "svision"
      base_url: "https://veureu-svision.hf.space"
      client: "gradio"
      predict_route: "/predict"

    salamandra-tools:
      space: "stools"
      base_url: "https://veureu-stools.hf.space"
      client: "gradio"
      predict_route: "/predict"

    whisper-catalan:
      space: "asr"
      base_url: "https://veureu-asr.hf.space"
      client: "gradio"
      predict_route: "/predict"

  http:
    timeout_seconds: 180
    retries: 3
    backoff_seconds: 2.0   # base delay between retries

security:
  use_hf_token: true
  hf_token_env: "HF_TOKEN"   # env var holding the Hugging Face token (token itself not stored here)
  allow_insecure_tls: false

logging:
  level: "INFO"
  json: false   # presumably toggles JSON-structured log output — confirm in logger setup

# NOTE(review): stray top-level key — looks like an accidental leftover (a
# "salamandra-tools"/"stools" endpoint already exists under remote_spaces);
# confirm anything actually reads this before removing.
stools: false