#!/usr/bin/env python3
"""Evaluate Argus-Colqwen3.5-9B on ViDoRe V1 + V2 using the official
``vidore-benchmark`` library straight from the HuggingFace hub.

Why this wrapper exists
-----------------------
The reference evaluators live in https://github.com/illuin-tech/vidore-benchmark —
every ColPali / Nemotron / vidore leaderboard submission is scored against
``ViDoReEvaluatorQA`` / ``ViDoReEvaluatorBEIR``. By delegating to those
evaluators here (instead of re-implementing nDCG/Recall/MRR locally) we
guarantee:
  - ``None`` queries are filtered correctly (Shift, all SyntheticDocQA subsets).
  - The full image corpus is preserved (distractors stay in the retrieval pool).
  - MTEB-style metrics (ndcg/map/recall/precision/mrr at every k) match the
    canonical leaderboard numbers bit-for-bit.

Usage
-----
    pip install vidore-benchmark
    # or: pip install git+https://github.com/illuin-tech/vidore-benchmark
    python eval_vidore_v1_v2.py \\
        --model ./argus-colqwen3.5-9b-v0 \\
        --benchmarks v1 v2 \\
        --batch-query 4 \\
        --batch-passage 2

Use ``--model DataScience-UIBK/Argus-Colqwen3.5-9B-v0`` once uploaded.
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional, Sequence

import torch

# ---------------------- ViDoRe dataset catalog ---------------------- #

# ViDoRe V1 (QA format). Each HF dataset has a single ``test`` split with
# columns: query, image, image_filename. Some rows contain ``query=None``
# (distractors); the library handles this.
V1_DATASETS: Dict[str, str] = {
    "ArxivQ": "vidore/arxivqa_test_subsampled",
    "DocQ": "vidore/docvqa_test_subsampled",
    "InfoQ": "vidore/infovqa_test_subsampled",
    "TabF": "vidore/tabfquad_test_subsampled",
    "TATQ": "vidore/tatdqa_test",
    "Shift": "vidore/shiftproject_test",
    "AI": "vidore/syntheticDocQA_artificial_intelligence_test",
    "Energy": "vidore/syntheticDocQA_energy_test",
    "Gov": "vidore/syntheticDocQA_government_reports_test",
    "Health": "vidore/syntheticDocQA_healthcare_industry_test",
}

# ViDoRe V2 (BEIR format). Each HF repo exposes 3 dataset configs:
# ``corpus`` (images + corpus-id), ``queries`` (query text + query-id), and
# ``qrels`` (query-id, corpus-id, score). The library's ``ViDoReEvaluatorBEIR``
# expects that exact shape.
V2_DATASETS: Dict[str, str] = {
    "MIT_Biomedical_Multi": "vidore/biomedical_lectures_v2",
    "Economics_Macro_Multi": "vidore/economics_reports_v2",
    "ESG_Restaurant_Human_EN": "vidore/esg_reports_human_labeled_v2",
    "ESG_Restaurant_Synth_Multi": "vidore/esg_reports_v2",
}


# ---------------------- helpers ---------------------- #

def _load_model_and_processor(args: argparse.Namespace):
    """Load the retrieval model and its processor from ``args.model``.

    Reads ``args.model``, ``args.dtype``, ``args.attn_implementation`` and
    ``args.max_num_visual_tokens``.

    Returns:
        ``(model, processor)``; the model is switched to eval mode and moved
        to CUDA before it is returned.
    """
    from transformers import AutoModel, AutoProcessor

    dtype = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }[args.dtype]
    print(f"[eval] loading model: {args.model} ({args.dtype}, attn={args.attn_implementation})")

    # ``dtype`` on transformers >= 4.57; older builds still use ``torch_dtype``.
    load_kwargs = {
        "trust_remote_code": True,
        "attn_implementation": args.attn_implementation,
    }
    # Only the constructor call is guarded: a TypeError raised later by
    # ``.eval()`` / ``.cuda()`` must not trigger a second full model load.
    try:
        model = AutoModel.from_pretrained(args.model, dtype=dtype, **load_kwargs)
    except TypeError:
        model = AutoModel.from_pretrained(args.model, torch_dtype=dtype, **load_kwargs)
    model = model.eval().cuda()

    processor = AutoProcessor.from_pretrained(
        args.model,
        trust_remote_code=True,
        max_num_visual_tokens=args.max_num_visual_tokens,
    )
    return model, processor


class _EmbeddingOnlyWrapper(torch.nn.Module):
    """Adapter that exposes the plain embeddings tensor to vidore-benchmark.

    ``VisionRetriever.forward_queries`` / ``forward_passages`` call
    ``self.model(**batch).to("cpu")``, i.e. they assume the model returns a
    Tensor. ``ArgusForRetrieval.forward`` returns an ``ArgusOutput`` dataclass
    (embeddings + region_embeddings + routing info) to keep the MoE analysis
    surface. This wrapper unwraps ``.embeddings`` so the library sees the
    expected shape without us having to touch the model class.
    """

    def __init__(self, inner: torch.nn.Module):
        super().__init__()
        self.inner = inner

    def __getattr__(self, name):
        # Delegate .device / .dtype / .eval() / etc. to the wrapped model.
        try:
            return super().__getattr__(name)
        except AttributeError:
            # If ``inner`` itself is not registered (instance created without
            # ``__init__``), ``self.inner`` below would re-enter this method
            # forever. Surface a plain AttributeError instead.
            if name == "inner":
                raise
            return getattr(self.inner, name)

    def forward(self, **kwargs) -> torch.Tensor:
        """Run the wrapped model and return only its ``.embeddings``."""
        return self.inner(**kwargs).embeddings


def _build_retriever(model, processor):
    """Wrap ``model`` in ``_EmbeddingOnlyWrapper`` and build a ``VisionRetriever``."""
    from vidore_benchmark.retrievers import VisionRetriever

    wrapped = _EmbeddingOnlyWrapper(model).eval()
    # Older vidore-benchmark releases don't accept ``num_workers`` at all;
    # newer ones do. Try-with-kwarg for portability.
    try:
        return VisionRetriever(model=wrapped, processor=processor, num_workers=0)
    except TypeError:
        return VisionRetriever(model=wrapped, processor=processor)


def _iter_selected(catalog: Dict[str, str], args: argparse.Namespace):
    """Yield ``(short_key, repo_id)`` pairs of ``catalog`` kept by ``--datasets``.

    An empty or missing ``args.datasets`` selects the whole catalog.
    """
    wanted = set(args.datasets) if args.datasets else None
    for short, repo_id in catalog.items():
        if wanted is None or short in wanted:
            yield short, repo_id


def _score_dataset(evaluator, ds, args: argparse.Namespace) -> Dict[str, float]:
    """Evaluate one dataset with the CLI batch sizes and print its nDCG@5."""
    metrics = evaluator.evaluate_dataset(
        ds,
        batch_query=args.batch_query,
        batch_passage=args.batch_passage,
        batch_score=args.batch_score,
    )
    print(f" nDCG@5 = {metrics.get('ndcg_at_5', 0.0):.4f}")
    return metrics


def _eval_v1(retriever, args: argparse.Namespace) -> Dict[str, Dict[str, float]]:
    """Score the selected ViDoRe V1 datasets with ``ViDoReEvaluatorQA``.

    Returns:
        Mapping of dataset short key to the metrics returned by the evaluator.
    """
    from datasets import load_dataset
    from vidore_benchmark.evaluation.vidore_evaluators import ViDoReEvaluatorQA

    evaluator = ViDoReEvaluatorQA(retriever)
    results: Dict[str, Dict[str, float]] = {}
    print("\n========== V1 ==========")
    for short, repo_id in _iter_selected(V1_DATASETS, args):
        print(f"\n[V1:{short}] {repo_id}")
        ds = load_dataset(repo_id, split="test")
        results[short] = _score_dataset(evaluator, ds, args)
    return results


def _eval_v2(retriever, args: argparse.Namespace) -> Dict[str, Dict[str, float]]:
    """Score the selected ViDoRe V2 datasets with ``ViDoReEvaluatorBEIR``.

    Returns:
        Mapping of dataset short key to the metrics returned by the evaluator.
    """
    from datasets import load_dataset
    from vidore_benchmark.evaluation.vidore_evaluators import ViDoReEvaluatorBEIR

    evaluator = ViDoReEvaluatorBEIR(retriever)
    results: Dict[str, Dict[str, float]] = {}
    print("\n========== V2 ==========")
    for short, repo_id in _iter_selected(V2_DATASETS, args):
        print(f"\n[V2:{short}] {repo_id}")
        ds = {
            "corpus": load_dataset(repo_id, "corpus", split="test"),
            "queries": load_dataset(repo_id, "queries", split="test"),
            "qrels": load_dataset(repo_id, "qrels", split="test"),
        }
        results[short] = _score_dataset(evaluator, ds, args)
    return results


def _unknown_dataset_keys(args: argparse.Namespace) -> List[str]:
    """Return ``--datasets`` keys that match no dataset of the chosen benchmarks."""
    if not args.datasets:
        return []
    known = set()
    if "v1" in args.benchmarks:
        known.update(V1_DATASETS)
    if "v2" in args.benchmarks:
        known.update(V2_DATASETS)
    return [key for key in args.datasets if key not in known]


def _print_summary(all_results: Dict[str, Dict[str, Dict[str, float]]]) -> Dict[str, float]:
    """Print the per-benchmark mean nDCG@5 and return it keyed by benchmark.

    Datasets whose metrics lack ``ndcg_at_5`` are left out of the mean (with
    a warning on stderr) instead of being counted as 0.0.
    """
    averages: Dict[str, float] = {}
    print("\n========== summary ==========")
    for bench, per_ds in all_results.items():
        if not per_ds:
            continue
        scores = [m["ndcg_at_5"] for m in per_ds.values() if "ndcg_at_5" in m]
        missing = len(per_ds) - len(scores)
        if missing:
            print(
                f"[eval] warning: {missing} {bench} dataset(s) reported no ndcg_at_5; "
                "excluded from the average",
                file=sys.stderr,
            )
        if not scores:
            continue
        avg = sum(scores) / len(scores)
        averages[bench] = avg
        print(f"{bench.upper()} avg nDCG@5 = {avg:.4f} ({len(scores)} datasets)")
    return averages


def _save_results(all_results, output_json) -> None:
    """Write ``all_results`` as indented JSON to ``output_json``."""
    out_path = Path(output_json)
    # Create missing parent directories so a long evaluation is not lost to a
    # FileNotFoundError at the very last step.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        json.dumps(all_results, indent=2, default=float), encoding="utf-8"
    )
    print(f"[eval] saved: {output_json}")


# ---------------------- main ---------------------- #

def run(args: argparse.Namespace) -> None:
    """Load the model, evaluate the requested benchmarks, print and save results."""
    # Report typos before the (slow) model load; the keys are still ignored,
    # as before, but no longer silently.
    unknown = _unknown_dataset_keys(args)
    if unknown:
        print(
            f"[eval] warning: --datasets keys not found in selected benchmarks: "
            f"{', '.join(unknown)}",
            file=sys.stderr,
        )

    model, processor = _load_model_and_processor(args)
    retriever = _build_retriever(model, processor)

    all_results: Dict[str, Dict[str, Dict[str, float]]] = {"v1": {}, "v2": {}}
    if "v1" in args.benchmarks:
        all_results["v1"] = _eval_v1(retriever, args)
    if "v2" in args.benchmarks:
        all_results["v2"] = _eval_v2(retriever, args)

    # Summary
    _print_summary(all_results)

    if args.output_json:
        _save_results(all_results, args.output_json)


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--model", required=True, help="HF repo id or local release folder.")
    p.add_argument("--benchmarks", nargs="+", default=["v1", "v2"], choices=["v1", "v2"])
    p.add_argument("--datasets", nargs="*", default=None,
                   help="Optional subset by short key (e.g. ArxivQ DocQ Shift).")
    p.add_argument("--batch-query", type=int, default=4)
    p.add_argument("--batch-passage", type=int, default=2)
    p.add_argument("--batch-score", type=int, default=4)
    p.add_argument("--max-num-visual-tokens", type=int, default=2048)
    p.add_argument("--attn-implementation", default="flash_attention_2",
                   choices=["flash_attention_2", "sdpa", "eager"])
    p.add_argument("--dtype", default="bfloat16",
                   choices=["bfloat16", "float16", "float32"])
    p.add_argument("--output-json", default=None)
    return p.parse_args(argv)


if __name__ == "__main__":
    run(parse_args())