---
# ===========================
# Veureu Engine – config.yaml
# ===========================
#
# NOTE(review): the block structure below was restored from a copy of this
# file whose line breaks and indentation had been collapsed. Keys and values
# are unchanged; nesting was rebuilt from key order and meaning. Places where
# the parent of a section could not be determined are marked NOTE(review) —
# confirm them against the code that loads this file.

# Root directory for engine output.
engine:
  output_root: "results"

# HTTP API settings.
api:
  # NOTE(review): "*" accepts requests from any origin — confirm this is
  # intended outside of development.
  cors_allow_origins: ["*"]
  sync_timeout_seconds: 3600

# Persistent store for face / voice collections.
database:
  enabled: true
  persist_directory: "chroma_db"
  enable_face_recognition: true
  enable_voice_recognition: true
  face_collection: "index_faces"
  voice_collection: "index_voices"

# Background job queue.
jobs:
  # If you enable the async queue, set this to true and add JobManager in
  # main_api.py.
  enabled: false
  max_workers: 1
  result_ttl_seconds: 86400

# Video analysis: keyframes, per-second frames, OCR, faces, OCR clustering.
video_processing:
  keyframes:
    conditional_extraction:
      enable: true
      min_scene_length_seconds: 1.5
      difference_threshold: 28.0
  frames_per_second:
    enable: true
    fps: 1.0  # Analysis frame rate
  ocr:
    engine: "tesseract"  # "tesseract" | "easyocr"
    language_hint: "spa"
    tesseract_cmd: ""  # if tesseract is not on PATH, put its path here
  faces:
    detector_model: "mtcnn"  # adjust to match your vision_tools
    embedding_model: "Facenet512"  # used by FaceOfImageEmbedding
    min_face_size: 32
    detection_confidence: 0.85
  ocr_clustering:
    method: "sequential_similarity"
    sentence_transformer: "all-MiniLM-L6-v2"
    similarity_threshold: 0.60  # higher ⇒ fewer clusters

# Audio extraction, segmentation and voice embeddings.
audio_processing:
  sample_rate: 16000
  format: "wav"
  diarization:
    enabled: true
    force_silence_only: true  # Use silence-based segmentation (no pyannote)
    min_segment_duration: 0.5  # in seconds (short clips)
    max_segment_duration: 10.0
    silence_thresh: -40  # dBFS threshold for silence detection
    min_silence_len: 500  # milliseconds
  enable_voice_embeddings: true  # SpeechBrain ECAPA
  speaker_embedding:
    enabled: true

# Speaker identification (clustering + Chroma)
# NOTE(review): placed at top level by analogy with video_processing /
# audio_processing; it may instead belong under audio_processing — confirm.
voice_processing:
  speaker_identification:
    enabled: true
    find_optimal_clusters: true
    min_speakers: 1
    max_speakers: 5
    distance_threshold: 0.40

# NOTE(review): placed at top level; it may instead belong under
# audio_processing or voice_processing — confirm.
asr:
  # Controls transcription of the full audio in addition to the clips
  # (useful for global context).
  enable_full_transcription: true

# Background description built from a montage of frames.
background_descriptor:
  montage:
    enable: true
    max_frames: 12
    grid: "auto"
  description:
    model: "salamandra-vision"  # or "gpt-4o-mini"
    max_tokens: 512
    temperature: 0.2

# Mapping of identified faces onto the timeline.
identity:
  timeline_mapping:
    per_second_frames_source: "frames_per_second"
    attach_faces_to:
      - "keyframes"
      - "audio_segments"
    out_key: "persona"

# Audio-description narration.
narration:
  model: "salamandra-instruct"  # "salamandra-instruct" | "gpt-4o-mini"
  une_guidelines_path: "UNE_153010.txt"
  timing:
    max_ad_duration_ratio: 0.60
    min_gap_seconds: 1.20
    min_ad_seconds: 0.80
  llm:
    max_tokens: 1024
    temperature: 0.2

# Subtitle layout.
subtitles:
  max_chars_per_line: 42
  max_lines_per_cue: 10
  speaker_display: "brackets"  # "brackets" | "prefix" | "none"

models:
  # task alias → model
  instruct: "salamandra-instruct"
  vision: "salamandra-vision"
  tools: "salamandra-tools"
  # Points to the veureu/asr Space
  # (Aina: faster-whisper-large-v3-ca-3catparla).
  asr: "whisper-catalan"

# Models listed here are served remotely.
routing:
  use_remote_for:
    - "salamandra-instruct"
    - "salamandra-vision"
    - "salamandra-tools"
    - "whisper-catalan"

# Remote Space endpoints, keyed by model name.
remote_spaces:
  user: "veureu"
  endpoints:
    salamandra-instruct:
      space: "schat"
      base_url: "https://veureu-schat.hf.space"
      client: "gradio"
      predict_route: "/predict"
    salamandra-vision:
      space: "svision"
      base_url: "https://veureu-svision.hf.space"
      client: "gradio"
      predict_route: "/predict"
    salamandra-tools:
      space: "stools"
      base_url: "https://veureu-stools.hf.space"
      client: "gradio"
      predict_route: "/predict"
    whisper-catalan:
      space: "asr"
      base_url: "https://veureu-asr.hf.space"
      client: "gradio"
      predict_route: "/predict"
  # NOTE(review): `http` is placed under remote_spaces because it follows the
  # endpoint list; it may instead be a top-level section — confirm.
  http:
    timeout_seconds: 180
    retries: 3
    backoff_seconds: 2.0

security:
  use_hf_token: true
  hf_token_env: "HF_TOKEN"  # name of the environment variable, not the token
  allow_insecure_tls: false

logging:
  level: "INFO"
  json: false

# NOTE(review): could not tell whether `stools` is a top-level flag or belongs
# under `logging` — confirm against the loader.
stools: false