AtsuMiyai committed on
Commit
9f962ea
·
1 Parent(s): 3136a24
Files changed (3) hide show
  1. app.py +25 -238
  2. constants.py +21 -10
  3. result.csv +28 -0
app.py CHANGED
@@ -1,219 +1,30 @@
1
- __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
 
3
  import gradio as gr
4
  import pandas as pd
5
- import re
6
- import pandas as pd
7
- import numpy as np
8
- from collections import defaultdict
9
- from constants import *
10
- import os
11
- from huggingface_hub import Repository
12
- import json
13
-
14
-
15
- global data_component, filter_component
16
-
17
-
18
- TOKEN = os.environ.get("TOKEN")
19
- repo = Repository(local_dir="./download_from_dataset", clone_from="JMMMU/leaderboard_result", repo_type="dataset", use_auth_token=TOKEN)
20
-
21
- current_directory = os.getcwd()
22
-
23
-
24
- def validate_model_size(s):
25
- pattern = r'^\d+B$|^-$'
26
- if re.match(pattern, s):
27
- return s
28
- else:
29
- return '-'
30
-
31
-
32
- def upload_file(files):
33
- file_paths = [file.name for file in files]
34
- return file_paths
35
-
36
-
37
- def get_acc(data, subject_list):
38
- acc = 0
39
- for subject in subject_list:
40
- acc += data["results"][subject]['jmmmu_acc,none']
41
- acc = acc/len(subject_list)
42
- acc = acc * 100
43
- acc = round(acc, 1)
44
- return acc
45
-
46
-
47
- def calculate_score(input_file):
48
- json_string = input_file.decode('utf-8')
49
- data = json.loads(json_string)
50
- result_dict = {}
51
-
52
- overall = data["results"]["jmmmu"]['jmmmu_acc,none']*100
53
- ca = data["results"]["culture_agnostic"]['jmmmu_acc,none']*100
54
- cs = data["results"]["culture_specific"]['jmmmu_acc,none']*100
55
- overall = round(overall, 1)
56
- ca = round(ca, 1)
57
- cs = round(cs, 1)
58
- # Art_Psychology
59
- art_psychology_subject_list = ["jmmmu_design", "jmmmu_music", "jmmmu_psychology"]
60
- # Science
61
- science_subject_list = ["jmmmu_biology", "jmmmu_chemistry", "jmmmu_physics", "jmmmu_math"]
62
- # Business
63
- business_subject_list = ["jmmmu_accounting", "jmmmu_economics", "jmmmu_finance", "jmmmu_manage", "jmmmu_marketing"]
64
- # Medicine
65
- medicine_subject_list = ["jmmmu_basic_medical_science", "jmmmu_clinical_medicine", "jmmmu_diagnostics_and_laboratory_medicine", "jmmmu_pharmacy", "jmmmu_public_health"]
66
- # Tech_Eng.
67
- tech_eng_subject_list = ["jmmmu_agriculture", "jmmmu_architecture_and_engineering", "jmmmu_computer_science", "jmmmu_electronics", "jmmmu_energy_and_power", "jmmmu_materials", "jmmmu_mechanical_engineering"]
68
-
69
- jmmmu_japanese_art_subject_list = ["jmmmu_japanese_art"]
70
- jmmmu_japanese_heritage_subject_list = ["jmmmu_japanese_heritage"]
71
- jmmmu_japanese_history_subject_list = ["jmmmu_japanese_history"]
72
- jmmmu_world_history_subject_list = ["jmmmu_world_history"]
73
-
74
- art_psychology = get_acc(data, art_psychology_subject_list)
75
- science = get_acc(data, science_subject_list)
76
- business = get_acc(data, business_subject_list)
77
- medicine = get_acc(data, medicine_subject_list)
78
- tech_eng = get_acc(data, tech_eng_subject_list)
79
- japanese_art = get_acc(data, jmmmu_japanese_art_subject_list)
80
- japanese_heritage = get_acc(data, jmmmu_japanese_heritage_subject_list)
81
- japanese_history = get_acc(data, jmmmu_japanese_history_subject_list)
82
- world_history = get_acc(data, jmmmu_world_history_subject_list)
83
-
84
- result_dict =\
85
- {
86
- "overall": overall,
87
- "cultureSpecific": cs,
88
- "cultureAgnostic": ca,
89
- "japaneseArt": japanese_art,
90
- "japaneseHeritage": japanese_heritage,
91
- "japaneseHistory": japanese_history,
92
- "worldHistory": world_history,
93
- "artPsychology": art_psychology,
94
- "business": business,
95
- "science": science,
96
- "healthMedicine": medicine,
97
- "techEngineering": tech_eng
98
- }
99
- return result_dict
100
-
101
-
102
- def add_new_eval(
103
- input_file,
104
- model_type: str,
105
- model_name_textbox: str,
106
- revision_name_textbox: str,
107
- model_link: str,
108
- model_size: str,
109
- # upd_type: str,
110
- # question_type: str
111
-
112
- ):
113
-
114
- if input_file is None:
115
- warning_text = "Error! Empty file!"
116
- print(warning_text)
117
- return warning_text
118
- else:
119
- model_size = validate_model_size(model_size)
120
- # if upd_type == 'AAD':
121
- csv_path = CSV_RESULT_PATH
122
-
123
- # validity_check(input_file)
124
 
125
- csv_data = pd.read_csv(csv_path)
126
 
127
- result_dict = calculate_score(input_file)
128
-
129
- if revision_name_textbox == '':
130
- col = csv_data.shape[0]
131
- model_name = model_name_textbox
132
- else:
133
- model_name = revision_name_textbox
134
- model_name_list = csv_data['Model']
135
- name_list = [name.split(']')[0][1:] for name in model_name_list]
136
- if revision_name_textbox not in name_list:
137
- col = csv_data.shape[0]
138
- else:
139
- col = name_list.index(revision_name_textbox)
140
- model_name_wo_link = model_name
141
- if model_link == '':
142
- model_name = model_name # no url
143
- else:
144
- model_name = '[' + model_name + '](' + model_link + ')'
145
-
146
- # add new data
147
- new_data = [
148
- model_type,
149
- model_name,
150
- model_size,
151
- result_dict["overall"],
152
- result_dict["cultureSpecific"],
153
- result_dict["cultureAgnostic"],
154
- result_dict["japaneseArt"],
155
- result_dict["japaneseHeritage"],
156
- result_dict["japaneseHistory"],
157
- result_dict["worldHistory"],
158
- result_dict["artPsychology"],
159
- result_dict["business"],
160
- result_dict["science"],
161
- result_dict["healthMedicine"],
162
- result_dict["techEngineering"]
163
- ]
164
-
165
- # If the same data already exists, return an error.
166
- if new_data in csv_data.values.tolist():
167
- warning_text = "Error! The same data already exists!"
168
- print(warning_text)
169
- return warning_text
170
- # If the same model name already exists, return an error.
171
- elif new_data[:5] in csv_data.values.tolist():
172
- warning_text = "Error! The same data already exists! Please fill revision_name."
173
- print(warning_text)
174
- return warning_text
175
-
176
- csv_data.loc[col] = new_data
177
- csv_data = csv_data.to_csv(csv_path, index=False)
178
-
179
- absolute_result_path = os.path.abspath(csv_path)
180
- if not os.path.exists(absolute_result_path):
181
- raise FileNotFoundError(f"File {absolute_result_path} not found")
182
-
183
- repo.git_pull()
184
- repo.git_add(absolute_result_path)
185
-
186
- save_path = os.path.join(CSV_QUEUE_DIR, f"{model_name_wo_link}.json")
187
- with open(save_path, "wb") as f:
188
- f.write(input_file)
189
-
190
- absolute_queue_path = os.path.abspath(save_path)
191
-
192
- repo.git_add(absolute_queue_path)
193
- repo.git_commit(f"add {model_name_wo_link} results")
194
- repo.git_push()
195
- print(f"Success! Your {model_name_wo_link} has been added!")
196
-
197
- return 0
198
-
199
-
200
- def get_baseline_df():
201
- repo.git_pull()
202
  df = pd.read_csv(CSV_RESULT_PATH)
203
  df = df.sort_values(by="Overall", ascending=False)
204
- present_columns = MODEL_INFO + checkbox_group.value
205
  df = df[present_columns]
206
  return df
207
 
208
 
209
  def get_all_df():
210
- repo.git_pull()
211
  df = pd.read_csv(CSV_RESULT_PATH)
212
  df = df.sort_values(by="Overall", ascending=False)
213
  return df
214
 
215
 
216
-
217
  block = gr.Blocks()
218
 
219
 
@@ -230,18 +41,9 @@ with block:
230
  value=AVG_INFO,
231
  label="Evaluation Dimension",
232
  interactive=True,
233
- ) # user can select the evaluation dimension
234
-
235
- with gr.Row():
236
- # selection for model size part:
237
- model_size = gr.CheckboxGroup(
238
- choices=MODEL_SIZE,
239
- value=MODEL_SIZE,
240
- label="Model Size",
241
- interactive=True,
242
- )
243
 
244
- baseline_value = get_baseline_df()
245
  baseline_header = MODEL_INFO + checkbox_group.value
246
  baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)
247
 
@@ -254,35 +56,23 @@ with block:
254
  visible=True,
255
  )
256
 
257
- def on_filter_model_size_method_change(selected_model_size, selected_columns):
258
-
259
  updated_data = get_all_df()
260
- # model_size
261
-
262
- def custom_filter(row, model_size_filters):
263
- model_size = row['Model Size']
264
- model_size = model_size.upper()
265
-
266
- if model_size == '-':
267
- size_filter = '-' in model_size_filters
268
- elif 'B' in model_size:
269
- size = float(model_size.replace('B', ''))
270
- size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
271
- else:
272
- size_filter = False
273
-
274
- return size_filter
275
-
276
- mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size)
277
- updated_data = updated_data[mask]
278
 
279
  # columns:
280
- selected_columns = [item for item in TASK_INFO if item in selected_columns]
 
 
281
  present_columns = MODEL_INFO + selected_columns
282
  updated_data = updated_data[present_columns]
283
- updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
 
 
284
  updated_headers = present_columns
285
- update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
 
 
 
286
 
287
  filter_component = gr.components.Dataframe(
288
  value=updated_data,
@@ -294,13 +84,10 @@ with block:
294
  )
295
  return filter_component
296
 
297
-
298
- model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)
299
- checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)
300
 
301
  def refresh_data():
302
- value = get_baseline_df()
303
-
304
  return value
305
 
306
  with gr.Row():
 
1
+ __all__ = ['block']
2
 
3
  import gradio as gr
4
  import pandas as pd
5
+ from constants import (
6
+ MODEL_INFO, TASK_INFO, AVG_INFO, DATA_TITILE_TYPE,
7
+ COLUMN_NAMES, CSV_RESULT_PATH, LEADERBORAD_INTRODUCTION,
8
+ CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
9
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
 
11
 
12
def get_baseline_df(selected_columns=None):
    """Load the results CSV and return the table shown by default.

    Rows are ordered by the "Overall" column, highest first, and only the
    MODEL_INFO columns followed by ``selected_columns`` are kept.

    Args:
        selected_columns: Score columns to place after the MODEL_INFO
            columns. ``None`` falls back to AVG_INFO.

    Returns:
        The sorted DataFrame restricted to the requested columns.
    """
    score_columns = AVG_INFO if selected_columns is None else selected_columns
    ranked = pd.read_csv(CSV_RESULT_PATH).sort_values(
        by="Overall", ascending=False
    )
    return ranked[MODEL_INFO + score_columns]
20
 
21
 
22
  def get_all_df():
 
23
  df = pd.read_csv(CSV_RESULT_PATH)
24
  df = df.sort_values(by="Overall", ascending=False)
25
  return df
26
 
27
 
 
28
  block = gr.Blocks()
29
 
30
 
 
41
  value=AVG_INFO,
42
  label="Evaluation Dimension",
43
  interactive=True,
44
+ ) # user can select the evaluation dimension
 
 
 
 
 
 
 
 
 
45
 
46
+ baseline_value = get_baseline_df(checkbox_group.value)
47
  baseline_header = MODEL_INFO + checkbox_group.value
48
  baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)
49
 
 
56
  visible=True,
57
  )
58
 
59
+ def on_filter_method_change(selected_columns):
 
60
  updated_data = get_all_df()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  # columns:
63
+ selected_columns = [
64
+ item for item in TASK_INFO if item in selected_columns
65
+ ]
66
  present_columns = MODEL_INFO + selected_columns
67
  updated_data = updated_data[present_columns]
68
+ updated_data = updated_data.sort_values(
69
+ by=selected_columns[0], ascending=False
70
+ )
71
  updated_headers = present_columns
72
+ update_datatype = [
73
+ DATA_TITILE_TYPE[COLUMN_NAMES.index(x)]
74
+ for x in updated_headers
75
+ ]
76
 
77
  filter_component = gr.components.Dataframe(
78
  value=updated_data,
 
84
  )
85
  return filter_component
86
 
87
+ checkbox_group.change(fn=on_filter_method_change, inputs=[checkbox_group], outputs=data_component)
 
 
88
 
89
  def refresh_data():
90
+ value = get_baseline_df(checkbox_group.value)
 
91
  return value
92
 
93
  with gr.Row():
constants.py CHANGED
@@ -1,18 +1,29 @@
1
  # This module stores constants.
2
- MODEL_INFO = ["Model Type", "Model"]
3
- MODEL_SIZE = ["<10B", ">=10B", "-"]
4
- LEADERBOARD_VERSION = ["Version1"]
5
- TASK_INFO = ["Overall", "Culture-Specific", "Culture-Agnostic", "Japanese Art", "Japanese Heritage", "Japanese History", "World History", "Art & Psychology", "Business", "Science", "Health & Medicine", "Tech & Engineering"]
6
- # Overall, Culture-Specific, Culture-Agnostic, English Original, Japanese Art, Japanese Heritage, Japanese History, World History, Art & Psychology, Business, Science, Health & Medicine, Tech & Engineering
 
 
 
 
 
 
 
 
7
  AVG_INFO = ["Overall"]
8
 
 
 
 
 
 
9
 
10
- DATA_TITILE_TYPE = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
11
-
12
- CSV_RESULT_PATH = "./download_from_dataset/result.csv"
13
  COLUMN_NAMES = MODEL_INFO + TASK_INFO
14
 
15
- LEADERBORAD_VERSION = ["JMMMU"]
16
 
17
 
18
  LEADERBORAD_INTRODUCTION = """
@@ -34,7 +45,7 @@ Following the evolution from MMMU to MMMU-Pro, JMMMU-Pro extends JMMMU by compos
34
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
35
  CITATION_BUTTON_TEXT = r"""@article{miyai2025jmmmu-pro,
36
  author = {Miyai, Atsuyuki and Onohara, Shota and Baek, Jeonghun and Aizawa, Kiyoharu},
37
- title = {JMMMU-Pro: Vibe Benchmark Construction of Image-based Japanese Multi-discipline Multimodal Understanding Benchmark},
38
  journal = {TBD},
39
  year = {2025},
40
  }"""
 
1
  # This module stores constants.
2
# Identity columns placed first in every table.
MODEL_INFO = ["model_name", "prompt"]

# Score columns, listed in the same order as the header row of result.csv.
TASK_INFO = [
    "Overall",
    "culture-specific",
    "culture-agnostic",
    "Japanese_Art",
    "Japanese_Heritage",
    "Japanese_History",
    "World_History",
    "Accounting",
    "Agriculture",
    "Architecture_and_Engineering",
    "Basic_Medical_Science",
    "Biology",
    "Chemistry",
    "Clinical_Medicine",
    "Computer_Science",
    "Design",
    "Diagnostics_and_Laboratory_Medicine",
    "Economics",
    "Electronics",
    "Energy_and_Power",
    "Finance",
    "Manage",
    "Marketing",
    "Materials",
    "Math",
    "Mechanical_Engineering",
    "Music",
    "Pharmacy",
    "Physics",
    "Psychology",
    "Public_Health",
]

# Score columns selected by default.
AVG_INFO = ["Overall"]

# Display datatype for each entry of COLUMN_NAMES, position by position:
# "markdown" for every MODEL_INFO column, "number" for every TASK_INFO column.
# The counts are derived from the lists themselves so that adding or removing
# a column cannot leave this list misaligned with COLUMN_NAMES.
# (The misspelled name is kept because other modules import it.)
DATA_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_INFO)

# Location of the results table read by the app.
CSV_RESULT_PATH = "./result.csv"

# Every column of the results table, identity columns first.
COLUMN_NAMES = MODEL_INFO + TASK_INFO

# (Misspelled name kept for backward compatibility with existing imports.)
LEADERBORAD_VERSION = ["JMMMU_Pro"]
27
 
28
 
29
  LEADERBORAD_INTRODUCTION = """
 
45
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
46
  CITATION_BUTTON_TEXT = r"""@article{miyai2025jmmmu-pro,
47
  author = {Miyai, Atsuyuki and Onohara, Shota and Baek, Jeonghun and Aizawa, Kiyoharu},
48
+ title = {JMMMU-Pro: Image-based Japanese Multi-discipline Multimodal Understanding Benchmark via Vibe Benchmark Construction},
49
  journal = {TBD},
50
  year = {2025},
51
  }"""
result.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,prompt,Overall,culture-specific,culture-agnostic,Japanese_Art,Japanese_Heritage,Japanese_History,World_History,Accounting,Agriculture,Architecture_and_Engineering,Basic_Medical_Science,Biology,Chemistry,Clinical_Medicine,Computer_Science,Design,Diagnostics_and_Laboratory_Medicine,Economics,Electronics,Energy_and_Power,Finance,Manage,Marketing,Materials,Math,Mechanical_Engineering,Music,Pharmacy,Physics,Psychology,Public_Health
2
+ Aya-Vision-8B,Direct,22.424,23.833,21.25,26.0,26.667,20.0,22.667,30.0,20.0,20.0,30.0,16.667,20.0,26.667,13.333,23.333,23.333,26.667,10.0,23.333,26.667,30.0,16.667,10.0,20.0,20.0,16.667,20.0,16.667,20.0,30.0
3
+ Heron-NVILA-Lite-15B,Direct,26.97,26.667,27.222,26.0,27.333,31.333,22.0,23.333,30.0,30.0,33.333,26.667,20.0,23.333,33.333,40.0,30.0,30.0,10.0,30.0,26.667,26.667,16.667,26.667,30.0,23.333,13.333,26.667,30.0,23.333,50.0
4
+ Phi-4-multimodal,Direct,31.818,28.833,34.306,29.333,18.0,30.0,38.0,26.667,33.333,20.0,46.667,23.333,26.667,30.0,56.667,20.0,13.333,43.333,50.0,40.0,20.0,30.0,43.333,46.667,33.333,46.667,33.333,36.667,26.667,33.333,43.333
5
+ InternVL2.5-8B,Direct,25.076,23.833,26.111,26.0,18.0,24.667,26.667,30.0,13.333,30.0,30.0,23.333,23.333,20.0,30.0,23.333,33.333,26.667,26.667,36.667,26.667,30.0,33.333,16.667,23.333,36.667,26.667,13.333,30.0,23.333,20.0
6
+ Qwen2.5-VL-7B,Direct,44.697,50.167,40.139,42.667,44.0,38.667,75.333,30.0,53.333,30.0,60.0,40.0,36.667,36.667,50.0,53.333,40.0,30.0,43.333,33.333,46.667,23.333,36.667,33.333,36.667,33.333,36.667,53.333,33.333,50.0,43.333
7
+ Qwen3-VL-8B,Direct,45.833,47.0,44.861,39.333,37.333,40.667,70.667,50.0,53.333,46.667,73.333,40.0,26.667,40.0,43.333,53.333,40.0,53.333,43.333,33.333,36.667,43.333,50.0,40.0,33.333,53.333,40.0,46.667,43.333,53.333,40.0
8
+ Gemini3.0,Direct,87.045,95.0,80.417,91.333,96.667,95.333,96.667,80.0,66.667,93.333,90.0,70.0,90.0,83.333,86.667,80.0,56.667,83.333,83.333,86.667,90.0,73.333,86.667,73.333,90.0,80.0,46.667,83.333,86.667,83.333,86.667
9
+ GPT-5.2,Direct,78.409,87.167,71.111,79.333,94.0,87.333,88.0,73.333,56.667,40.0,83.333,56.667,73.333,80.0,80.0,76.667,56.667,86.667,66.667,56.667,83.333,66.667,90.0,60.0,66.667,56.667,30.0,96.667,96.667,80.0,93.333
10
+ gpt-5.2,Direct,83.333,88.333,79.167,80.0,92.667,90.0,90.667,80.0,63.333,86.667,93.333,70.0,90.0,60.0,83.333,83.333,53.333,80.0,83.333,73.333,90.0,76.667,90.0,76.667,93.333,50.0,60.0,86.667,93.333,86.667,96.667
11
+ LLaVA-OneVision-1.5-8B,Direct,29.924,26.333,32.917,34.0,22.0,20.667,28.667,33.333,23.333,26.667,43.333,16.667,33.333,23.333,33.333,23.333,23.333,36.667,43.333,33.333,26.667,36.667,43.333,46.667,33.333,43.333,43.333,23.333,40.0,26.667,33.333
12
+ LLaVA-OV-7B,Direct,27.348,26.5,28.056,30.667,26.667,26.0,22.667,46.667,26.667,30.0,33.333,26.667,30.0,26.667,23.333,26.667,23.333,30.0,33.333,36.667,16.667,23.333,16.667,33.333,23.333,33.333,36.667,33.333,13.333,20.0,30.0
13
+ Pangea-7B,Direct,19.545,23.0,16.667,28.0,24.667,20.0,19.333,13.333,16.667,23.333,16.667,10.0,3.333,3.333,13.333,10.0,10.0,20.0,13.333,16.667,20.0,33.333,26.667,13.333,20.0,20.0,30.0,23.333,13.333,16.667,13.333
14
+ Sarashina2-V-14B,Direct,30.682,32.333,29.306,32.0,27.333,31.333,38.667,33.333,33.333,23.333,30.0,20.0,23.333,20.0,26.667,26.667,36.667,36.667,33.333,43.333,30.0,20.0,30.0,20.0,43.333,20.0,20.0,40.0,26.667,23.333,43.333
15
+ Sarashina2-V-8B,Direct,27.879,27.0,28.611,29.333,24.0,28.0,26.667,40.0,30.0,26.667,36.667,26.667,23.333,16.667,23.333,36.667,26.667,33.333,40.0,23.333,20.0,26.667,13.333,26.667,40.0,20.0,20.0,40.0,30.0,26.667,40.0
16
+ Sarashina2.2-V-3B,Direct,38.03,40.167,36.25,42.0,29.333,36.0,53.333,36.667,50.0,36.667,53.333,33.333,33.333,30.0,50.0,43.333,40.0,43.333,46.667,33.333,30.0,20.0,30.0,33.333,36.667,26.667,26.667,36.667,23.333,33.333,43.333
17
+ Aya-Vision-8B,CoT,26.742,27.0,26.528,22.0,30.667,26.667,28.667,23.333,13.333,23.333,40.0,26.667,13.333,13.333,30.0,36.667,26.667,30.0,26.667,33.333,26.667,43.333,30.0,30.0,16.667,36.667,26.667,26.667,16.667,20.0,26.667
18
+ Heron-NVILA-Lite-15B,CoT,5.303,1.0,8.889,1.333,1.333,0.667,0.667,10.0,13.333,0.0,3.333,10.0,6.667,0.0,10.0,6.667,3.333,3.333,23.333,6.667,13.333,0.0,10.0,10.0,10.0,23.333,10.0,3.333,0.0,16.667,20.0
19
+ Phi-4-multimodal,CoT,24.167,22.0,25.972,19.333,18.667,17.333,32.667,33.333,20.0,26.667,36.667,20.0,13.333,16.667,30.0,10.0,20.0,43.333,40.0,33.333,26.667,23.333,26.667,26.667,20.0,23.333,23.333,33.333,20.0,23.333,33.333
20
+ InternVL2.5-8B,CoT,31.212,29.0,33.056,29.333,22.667,28.0,36.0,33.333,26.667,23.333,43.333,36.667,26.667,43.333,40.0,40.0,23.333,40.0,33.333,33.333,36.667,26.667,36.667,16.667,30.0,40.0,13.333,40.0,40.0,23.333,46.667
21
+ Qwen2.5-VL-7B,CoT,45.0,46.667,43.611,34.0,41.333,43.333,68.0,43.333,30.0,33.333,83.333,40.0,36.667,53.333,36.667,33.333,33.333,46.667,46.667,26.667,53.333,33.333,50.0,36.667,43.333,36.667,40.0,60.0,43.333,43.333,63.333
22
+ Qwen3-VL-8B,CoT,47.273,47.5,47.083,42.667,34.0,44.667,68.667,53.333,53.333,10.0,63.333,36.667,43.333,40.0,43.333,46.667,36.667,60.0,33.333,30.0,43.333,56.667,80.0,23.333,56.667,36.667,26.667,63.333,66.667,66.667,60.0
23
+ LLaVA-OneVision-1.5-8B,CoT,31.97,28.0,35.278,28.0,20.667,29.333,34.0,30.0,16.667,40.0,46.667,36.667,23.333,30.0,50.0,30.0,20.0,36.667,43.333,43.333,60.0,23.333,30.0,40.0,36.667,46.667,30.0,26.667,33.333,26.667,46.667
24
+ LLaVA-OV-7B,CoT,14.091,14.333,13.889,11.333,14.0,12.667,19.333,13.333,13.333,6.667,13.333,6.667,13.333,13.333,10.0,23.333,0.0,20.0,3.333,10.0,33.333,3.333,13.333,13.333,26.667,16.667,20.0,20.0,10.0,10.0,20.0
25
+ Pangea-7B,CoT,23.409,21.667,24.861,22.0,22.0,24.0,18.667,13.333,26.667,26.667,33.333,16.667,10.0,23.333,23.333,40.0,20.0,26.667,33.333,30.0,30.0,20.0,30.0,36.667,23.333,13.333,16.667,36.667,20.0,20.0,26.667
26
+ Sarashina2-V-14B,CoT,30.0,30.5,29.583,30.0,26.0,29.333,36.667,36.667,33.333,13.333,40.0,23.333,13.333,13.333,30.0,33.333,36.667,43.333,36.667,46.667,33.333,20.0,23.333,30.0,36.667,20.0,23.333,43.333,26.667,20.0,33.333
27
+ Sarashina2-V-8B,CoT,27.273,25.333,28.889,24.0,24.667,31.333,21.333,36.667,23.333,26.667,36.667,23.333,23.333,16.667,30.0,36.667,26.667,30.0,43.333,23.333,30.0,36.667,16.667,26.667,30.0,20.0,16.667,40.0,23.333,30.0,46.667
28
+ Sarashina2.2-V-3B,CoT,42.879,54.0,33.611,48.0,51.333,51.333,65.333,36.667,40.0,23.333,63.333,30.0,13.333,23.333,30.0,33.333,26.667,50.0,36.667,23.333,43.333,16.667,26.667,33.333,30.0,20.0,23.333,53.333,43.333,40.0,46.667