Spaces:
Running
Running
AtsuMiyai
committed on
Commit
·
9f962ea
1
Parent(s):
3136a24
update
Browse files
- app.py +25 -238
- constants.py +21 -10
- result.csv +28 -0
app.py
CHANGED
|
@@ -1,219 +1,30 @@
|
|
| 1 |
-
__all__ = ['block'
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
-
import
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
import os
|
| 11 |
-
from huggingface_hub import Repository
|
| 12 |
-
import json
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
global data_component, filter_component
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
TOKEN = os.environ.get("TOKEN")
|
| 19 |
-
repo = Repository(local_dir="./download_from_dataset", clone_from="JMMMU/leaderboard_result", repo_type="dataset", use_auth_token=TOKEN)
|
| 20 |
-
|
| 21 |
-
current_directory = os.getcwd()
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def validate_model_size(s):
|
| 25 |
-
pattern = r'^\d+B$|^-$'
|
| 26 |
-
if re.match(pattern, s):
|
| 27 |
-
return s
|
| 28 |
-
else:
|
| 29 |
-
return '-'
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def upload_file(files):
|
| 33 |
-
file_paths = [file.name for file in files]
|
| 34 |
-
return file_paths
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def get_acc(data, subject_list):
|
| 38 |
-
acc = 0
|
| 39 |
-
for subject in subject_list:
|
| 40 |
-
acc += data["results"][subject]['jmmmu_acc,none']
|
| 41 |
-
acc = acc/len(subject_list)
|
| 42 |
-
acc = acc * 100
|
| 43 |
-
acc = round(acc, 1)
|
| 44 |
-
return acc
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def calculate_score(input_file):
|
| 48 |
-
json_string = input_file.decode('utf-8')
|
| 49 |
-
data = json.loads(json_string)
|
| 50 |
-
result_dict = {}
|
| 51 |
-
|
| 52 |
-
overall = data["results"]["jmmmu"]['jmmmu_acc,none']*100
|
| 53 |
-
ca = data["results"]["culture_agnostic"]['jmmmu_acc,none']*100
|
| 54 |
-
cs = data["results"]["culture_specific"]['jmmmu_acc,none']*100
|
| 55 |
-
overall = round(overall, 1)
|
| 56 |
-
ca = round(ca, 1)
|
| 57 |
-
cs = round(cs, 1)
|
| 58 |
-
# Art_Psychology
|
| 59 |
-
art_psychology_subject_list = ["jmmmu_design", "jmmmu_music", "jmmmu_psychology"]
|
| 60 |
-
# Science
|
| 61 |
-
science_subject_list = ["jmmmu_biology", "jmmmu_chemistry", "jmmmu_physics", "jmmmu_math"]
|
| 62 |
-
# Business
|
| 63 |
-
business_subject_list = ["jmmmu_accounting", "jmmmu_economics", "jmmmu_finance", "jmmmu_manage", "jmmmu_marketing"]
|
| 64 |
-
# Medicine
|
| 65 |
-
medicine_subject_list = ["jmmmu_basic_medical_science", "jmmmu_clinical_medicine", "jmmmu_diagnostics_and_laboratory_medicine", "jmmmu_pharmacy", "jmmmu_public_health"]
|
| 66 |
-
# Tech_Eng.
|
| 67 |
-
tech_eng_subject_list = ["jmmmu_agriculture", "jmmmu_architecture_and_engineering", "jmmmu_computer_science", "jmmmu_electronics", "jmmmu_energy_and_power", "jmmmu_materials", "jmmmu_mechanical_engineering"]
|
| 68 |
-
|
| 69 |
-
jmmmu_japanese_art_subject_list = ["jmmmu_japanese_art"]
|
| 70 |
-
jmmmu_japanese_heritage_subject_list = ["jmmmu_japanese_heritage"]
|
| 71 |
-
jmmmu_japanese_history_subject_list = ["jmmmu_japanese_history"]
|
| 72 |
-
jmmmu_world_history_subject_list = ["jmmmu_world_history"]
|
| 73 |
-
|
| 74 |
-
art_psychology = get_acc(data, art_psychology_subject_list)
|
| 75 |
-
science = get_acc(data, science_subject_list)
|
| 76 |
-
business = get_acc(data, business_subject_list)
|
| 77 |
-
medicine = get_acc(data, medicine_subject_list)
|
| 78 |
-
tech_eng = get_acc(data, tech_eng_subject_list)
|
| 79 |
-
japanese_art = get_acc(data, jmmmu_japanese_art_subject_list)
|
| 80 |
-
japanese_heritage = get_acc(data, jmmmu_japanese_heritage_subject_list)
|
| 81 |
-
japanese_history = get_acc(data, jmmmu_japanese_history_subject_list)
|
| 82 |
-
world_history = get_acc(data, jmmmu_world_history_subject_list)
|
| 83 |
-
|
| 84 |
-
result_dict =\
|
| 85 |
-
{
|
| 86 |
-
"overall": overall,
|
| 87 |
-
"cultureSpecific": cs,
|
| 88 |
-
"cultureAgnostic": ca,
|
| 89 |
-
"japaneseArt": japanese_art,
|
| 90 |
-
"japaneseHeritage": japanese_heritage,
|
| 91 |
-
"japaneseHistory": japanese_history,
|
| 92 |
-
"worldHistory": world_history,
|
| 93 |
-
"artPsychology": art_psychology,
|
| 94 |
-
"business": business,
|
| 95 |
-
"science": science,
|
| 96 |
-
"healthMedicine": medicine,
|
| 97 |
-
"techEngineering": tech_eng
|
| 98 |
-
}
|
| 99 |
-
return result_dict
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
def add_new_eval(
|
| 103 |
-
input_file,
|
| 104 |
-
model_type: str,
|
| 105 |
-
model_name_textbox: str,
|
| 106 |
-
revision_name_textbox: str,
|
| 107 |
-
model_link: str,
|
| 108 |
-
model_size: str,
|
| 109 |
-
# upd_type: str,
|
| 110 |
-
# question_type: str
|
| 111 |
-
|
| 112 |
-
):
|
| 113 |
-
|
| 114 |
-
if input_file is None:
|
| 115 |
-
warning_text = "Error! Empty file!"
|
| 116 |
-
print(warning_text)
|
| 117 |
-
return warning_text
|
| 118 |
-
else:
|
| 119 |
-
model_size = validate_model_size(model_size)
|
| 120 |
-
# if upd_type == 'AAD':
|
| 121 |
-
csv_path = CSV_RESULT_PATH
|
| 122 |
-
|
| 123 |
-
# validity_check(input_file)
|
| 124 |
|
| 125 |
-
csv_data = pd.read_csv(csv_path)
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
col = csv_data.shape[0]
|
| 131 |
-
model_name = model_name_textbox
|
| 132 |
-
else:
|
| 133 |
-
model_name = revision_name_textbox
|
| 134 |
-
model_name_list = csv_data['Model']
|
| 135 |
-
name_list = [name.split(']')[0][1:] for name in model_name_list]
|
| 136 |
-
if revision_name_textbox not in name_list:
|
| 137 |
-
col = csv_data.shape[0]
|
| 138 |
-
else:
|
| 139 |
-
col = name_list.index(revision_name_textbox)
|
| 140 |
-
model_name_wo_link = model_name
|
| 141 |
-
if model_link == '':
|
| 142 |
-
model_name = model_name # no url
|
| 143 |
-
else:
|
| 144 |
-
model_name = '[' + model_name + '](' + model_link + ')'
|
| 145 |
-
|
| 146 |
-
# add new data
|
| 147 |
-
new_data = [
|
| 148 |
-
model_type,
|
| 149 |
-
model_name,
|
| 150 |
-
model_size,
|
| 151 |
-
result_dict["overall"],
|
| 152 |
-
result_dict["cultureSpecific"],
|
| 153 |
-
result_dict["cultureAgnostic"],
|
| 154 |
-
result_dict["japaneseArt"],
|
| 155 |
-
result_dict["japaneseHeritage"],
|
| 156 |
-
result_dict["japaneseHistory"],
|
| 157 |
-
result_dict["worldHistory"],
|
| 158 |
-
result_dict["artPsychology"],
|
| 159 |
-
result_dict["business"],
|
| 160 |
-
result_dict["science"],
|
| 161 |
-
result_dict["healthMedicine"],
|
| 162 |
-
result_dict["techEngineering"]
|
| 163 |
-
]
|
| 164 |
-
|
| 165 |
-
# If the same data already exists, return an error.
|
| 166 |
-
if new_data in csv_data.values.tolist():
|
| 167 |
-
warning_text = "Error! The same data already exists!"
|
| 168 |
-
print(warning_text)
|
| 169 |
-
return warning_text
|
| 170 |
-
# If the same model name already exists, return an error.
|
| 171 |
-
elif new_data[:5] in csv_data.values.tolist():
|
| 172 |
-
warning_text = "Error! The same data already exists! Please fill revision_name."
|
| 173 |
-
print(warning_text)
|
| 174 |
-
return warning_text
|
| 175 |
-
|
| 176 |
-
csv_data.loc[col] = new_data
|
| 177 |
-
csv_data = csv_data.to_csv(csv_path, index=False)
|
| 178 |
-
|
| 179 |
-
absolute_result_path = os.path.abspath(csv_path)
|
| 180 |
-
if not os.path.exists(absolute_result_path):
|
| 181 |
-
raise FileNotFoundError(f"File {absolute_result_path} not found")
|
| 182 |
-
|
| 183 |
-
repo.git_pull()
|
| 184 |
-
repo.git_add(absolute_result_path)
|
| 185 |
-
|
| 186 |
-
save_path = os.path.join(CSV_QUEUE_DIR, f"{model_name_wo_link}.json")
|
| 187 |
-
with open(save_path, "wb") as f:
|
| 188 |
-
f.write(input_file)
|
| 189 |
-
|
| 190 |
-
absolute_queue_path = os.path.abspath(save_path)
|
| 191 |
-
|
| 192 |
-
repo.git_add(absolute_queue_path)
|
| 193 |
-
repo.git_commit(f"add {model_name_wo_link} results")
|
| 194 |
-
repo.git_push()
|
| 195 |
-
print(f"Success! Your {model_name_wo_link} has been added!")
|
| 196 |
-
|
| 197 |
-
return 0
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
def get_baseline_df():
|
| 201 |
-
repo.git_pull()
|
| 202 |
df = pd.read_csv(CSV_RESULT_PATH)
|
| 203 |
df = df.sort_values(by="Overall", ascending=False)
|
| 204 |
-
present_columns = MODEL_INFO +
|
| 205 |
df = df[present_columns]
|
| 206 |
return df
|
| 207 |
|
| 208 |
|
| 209 |
def get_all_df():
|
| 210 |
-
repo.git_pull()
|
| 211 |
df = pd.read_csv(CSV_RESULT_PATH)
|
| 212 |
df = df.sort_values(by="Overall", ascending=False)
|
| 213 |
return df
|
| 214 |
|
| 215 |
|
| 216 |
-
|
| 217 |
block = gr.Blocks()
|
| 218 |
|
| 219 |
|
|
@@ -230,18 +41,9 @@ with block:
|
|
| 230 |
value=AVG_INFO,
|
| 231 |
label="Evaluation Dimension",
|
| 232 |
interactive=True,
|
| 233 |
-
)
|
| 234 |
-
|
| 235 |
-
with gr.Row():
|
| 236 |
-
# selection for model size part:
|
| 237 |
-
model_size = gr.CheckboxGroup(
|
| 238 |
-
choices=MODEL_SIZE,
|
| 239 |
-
value=MODEL_SIZE,
|
| 240 |
-
label="Model Size",
|
| 241 |
-
interactive=True,
|
| 242 |
-
)
|
| 243 |
|
| 244 |
-
baseline_value = get_baseline_df()
|
| 245 |
baseline_header = MODEL_INFO + checkbox_group.value
|
| 246 |
baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)
|
| 247 |
|
|
@@ -254,35 +56,23 @@ with block:
|
|
| 254 |
visible=True,
|
| 255 |
)
|
| 256 |
|
| 257 |
-
def
|
| 258 |
-
|
| 259 |
updated_data = get_all_df()
|
| 260 |
-
# model_size
|
| 261 |
-
|
| 262 |
-
def custom_filter(row, model_size_filters):
|
| 263 |
-
model_size = row['Model Size']
|
| 264 |
-
model_size = model_size.upper()
|
| 265 |
-
|
| 266 |
-
if model_size == '-':
|
| 267 |
-
size_filter = '-' in model_size_filters
|
| 268 |
-
elif 'B' in model_size:
|
| 269 |
-
size = float(model_size.replace('B', ''))
|
| 270 |
-
size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
|
| 271 |
-
else:
|
| 272 |
-
size_filter = False
|
| 273 |
-
|
| 274 |
-
return size_filter
|
| 275 |
-
|
| 276 |
-
mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size)
|
| 277 |
-
updated_data = updated_data[mask]
|
| 278 |
|
| 279 |
# columns:
|
| 280 |
-
selected_columns = [
|
|
|
|
|
|
|
| 281 |
present_columns = MODEL_INFO + selected_columns
|
| 282 |
updated_data = updated_data[present_columns]
|
| 283 |
-
updated_data = updated_data.sort_values(
|
|
|
|
|
|
|
| 284 |
updated_headers = present_columns
|
| 285 |
-
update_datatype = [
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
filter_component = gr.components.Dataframe(
|
| 288 |
value=updated_data,
|
|
@@ -294,13 +84,10 @@ with block:
|
|
| 294 |
)
|
| 295 |
return filter_component
|
| 296 |
|
| 297 |
-
|
| 298 |
-
model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)
|
| 299 |
-
checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)
|
| 300 |
|
| 301 |
def refresh_data():
|
| 302 |
-
value = get_baseline_df()
|
| 303 |
-
|
| 304 |
return value
|
| 305 |
|
| 306 |
with gr.Row():
|
|
|
|
| 1 |
+
__all__ = ['block']
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
+
from constants import (
|
| 6 |
+
MODEL_INFO, TASK_INFO, AVG_INFO, DATA_TITILE_TYPE,
|
| 7 |
+
COLUMN_NAMES, CSV_RESULT_PATH, LEADERBORAD_INTRODUCTION,
|
| 8 |
+
CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
|
| 9 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
|
|
|
| 11 |
|
| 12 |
+
def get_baseline_df(selected_columns=None):
|
| 13 |
+
if selected_columns is None:
|
| 14 |
+
selected_columns = AVG_INFO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
df = pd.read_csv(CSV_RESULT_PATH)
|
| 16 |
df = df.sort_values(by="Overall", ascending=False)
|
| 17 |
+
present_columns = MODEL_INFO + selected_columns
|
| 18 |
df = df[present_columns]
|
| 19 |
return df
|
| 20 |
|
| 21 |
|
| 22 |
def get_all_df():
|
|
|
|
| 23 |
df = pd.read_csv(CSV_RESULT_PATH)
|
| 24 |
df = df.sort_values(by="Overall", ascending=False)
|
| 25 |
return df
|
| 26 |
|
| 27 |
|
|
|
|
| 28 |
block = gr.Blocks()
|
| 29 |
|
| 30 |
|
|
|
|
| 41 |
value=AVG_INFO,
|
| 42 |
label="Evaluation Dimension",
|
| 43 |
interactive=True,
|
| 44 |
+
) # user can select the evaluation dimension
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
baseline_value = get_baseline_df(checkbox_group.value)
|
| 47 |
baseline_header = MODEL_INFO + checkbox_group.value
|
| 48 |
baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)
|
| 49 |
|
|
|
|
| 56 |
visible=True,
|
| 57 |
)
|
| 58 |
|
| 59 |
+
def on_filter_method_change(selected_columns):
|
|
|
|
| 60 |
updated_data = get_all_df()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
# columns:
|
| 63 |
+
selected_columns = [
|
| 64 |
+
item for item in TASK_INFO if item in selected_columns
|
| 65 |
+
]
|
| 66 |
present_columns = MODEL_INFO + selected_columns
|
| 67 |
updated_data = updated_data[present_columns]
|
| 68 |
+
updated_data = updated_data.sort_values(
|
| 69 |
+
by=selected_columns[0], ascending=False
|
| 70 |
+
)
|
| 71 |
updated_headers = present_columns
|
| 72 |
+
update_datatype = [
|
| 73 |
+
DATA_TITILE_TYPE[COLUMN_NAMES.index(x)]
|
| 74 |
+
for x in updated_headers
|
| 75 |
+
]
|
| 76 |
|
| 77 |
filter_component = gr.components.Dataframe(
|
| 78 |
value=updated_data,
|
|
|
|
| 84 |
)
|
| 85 |
return filter_component
|
| 86 |
|
| 87 |
+
checkbox_group.change(fn=on_filter_method_change, inputs=[checkbox_group], outputs=data_component)
|
|
|
|
|
|
|
| 88 |
|
| 89 |
def refresh_data():
|
| 90 |
+
value = get_baseline_df(checkbox_group.value)
|
|
|
|
| 91 |
return value
|
| 92 |
|
| 93 |
with gr.Row():
|
constants.py
CHANGED
|
@@ -1,18 +1,29 @@
|
|
| 1 |
# this is .py for store constants
|
| 2 |
-
MODEL_INFO = ["
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
AVG_INFO = ["Overall"]
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
CSV_RESULT_PATH = "./download_from_dataset/result.csv"
|
| 13 |
COLUMN_NAMES = MODEL_INFO + TASK_INFO
|
| 14 |
|
| 15 |
-
LEADERBORAD_VERSION = ["
|
| 16 |
|
| 17 |
|
| 18 |
LEADERBORAD_INTRODUCTION = """
|
|
@@ -34,7 +45,7 @@ Following the evolution from MMMU to MMMU-Pro, JMMMU-Pro extends JMMMU by compos
|
|
| 34 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 35 |
CITATION_BUTTON_TEXT = r"""@article{miyai2025jmmmu-pro,
|
| 36 |
author = {Miyai, Atsuyuki and Onohara, Shota and Baek, Jeonghun and Aizawa, Kiyoharu},
|
| 37 |
-
title = {JMMMU-Pro:
|
| 38 |
journal = {TBD},
|
| 39 |
year = {2025},
|
| 40 |
}"""
|
|
|
|
| 1 |
# this is .py for store constants
|
| 2 |
+
MODEL_INFO = ["model_name", "prompt"]
|
| 3 |
+
TASK_INFO = [
|
| 4 |
+
"Overall", "culture-specific", "culture-agnostic",
|
| 5 |
+
"Japanese_Art", "Japanese_Heritage", "Japanese_History",
|
| 6 |
+
"World_History", "Accounting", "Agriculture",
|
| 7 |
+
"Architecture_and_Engineering", "Basic_Medical_Science",
|
| 8 |
+
"Biology", "Chemistry", "Clinical_Medicine",
|
| 9 |
+
"Computer_Science", "Design",
|
| 10 |
+
"Diagnostics_and_Laboratory_Medicine", "Economics",
|
| 11 |
+
"Electronics", "Energy_and_Power", "Finance", "Manage",
|
| 12 |
+
"Marketing", "Materials", "Math", "Mechanical_Engineering",
|
| 13 |
+
"Music", "Pharmacy", "Physics", "Psychology", "Public_Health"
|
| 14 |
+
]
|
| 15 |
AVG_INFO = ["Overall"]
|
| 16 |
|
| 17 |
+
# Data types for each column:
|
| 18 |
+
# model_name (markdown), prompt (markdown), then all numbers
|
| 19 |
+
DATA_TITILE_TYPE = (
|
| 20 |
+
["markdown", "markdown"] + ["number"] * (len(TASK_INFO))
|
| 21 |
+
)
|
| 22 |
|
| 23 |
+
CSV_RESULT_PATH = "./result.csv"
|
|
|
|
|
|
|
| 24 |
COLUMN_NAMES = MODEL_INFO + TASK_INFO
|
| 25 |
|
| 26 |
+
LEADERBORAD_VERSION = ["JMMMU_Pro"]
|
| 27 |
|
| 28 |
|
| 29 |
LEADERBORAD_INTRODUCTION = """
|
|
|
|
| 45 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 46 |
CITATION_BUTTON_TEXT = r"""@article{miyai2025jmmmu-pro,
|
| 47 |
author = {Miyai, Atsuyuki and Onohara, Shota and Baek, Jeonghun and Aizawa, Kiyoharu},
|
| 48 |
+
title = {JMMMU-Pro: Image-based Japanese Multi-discipline Multimodal Understanding Benchmark via Vibe Benchmark Construction},
|
| 49 |
journal = {TBD},
|
| 50 |
year = {2025},
|
| 51 |
}"""
|
result.csv
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_name,prompt,Overall,culture-specific,culture-agnostic,Japanese_Art,Japanese_Heritage,Japanese_History,World_History,Accounting,Agriculture,Architecture_and_Engineering,Basic_Medical_Science,Biology,Chemistry,Clinical_Medicine,Computer_Science,Design,Diagnostics_and_Laboratory_Medicine,Economics,Electronics,Energy_and_Power,Finance,Manage,Marketing,Materials,Math,Mechanical_Engineering,Music,Pharmacy,Physics,Psychology,Public_Health
|
| 2 |
+
Aya-Vision-8B,Direct,22.424,23.833,21.25,26.0,26.667,20.0,22.667,30.0,20.0,20.0,30.0,16.667,20.0,26.667,13.333,23.333,23.333,26.667,10.0,23.333,26.667,30.0,16.667,10.0,20.0,20.0,16.667,20.0,16.667,20.0,30.0
|
| 3 |
+
Heron-NVILA-Lite-15B,Direct,26.97,26.667,27.222,26.0,27.333,31.333,22.0,23.333,30.0,30.0,33.333,26.667,20.0,23.333,33.333,40.0,30.0,30.0,10.0,30.0,26.667,26.667,16.667,26.667,30.0,23.333,13.333,26.667,30.0,23.333,50.0
|
| 4 |
+
Phi-4-multimodal,Direct,31.818,28.833,34.306,29.333,18.0,30.0,38.0,26.667,33.333,20.0,46.667,23.333,26.667,30.0,56.667,20.0,13.333,43.333,50.0,40.0,20.0,30.0,43.333,46.667,33.333,46.667,33.333,36.667,26.667,33.333,43.333
|
| 5 |
+
InternVL2.5-8B,Direct,25.076,23.833,26.111,26.0,18.0,24.667,26.667,30.0,13.333,30.0,30.0,23.333,23.333,20.0,30.0,23.333,33.333,26.667,26.667,36.667,26.667,30.0,33.333,16.667,23.333,36.667,26.667,13.333,30.0,23.333,20.0
|
| 6 |
+
Qwen2.5-VL-7B,Direct,44.697,50.167,40.139,42.667,44.0,38.667,75.333,30.0,53.333,30.0,60.0,40.0,36.667,36.667,50.0,53.333,40.0,30.0,43.333,33.333,46.667,23.333,36.667,33.333,36.667,33.333,36.667,53.333,33.333,50.0,43.333
|
| 7 |
+
Qwen3-VL-8B,Direct,45.833,47.0,44.861,39.333,37.333,40.667,70.667,50.0,53.333,46.667,73.333,40.0,26.667,40.0,43.333,53.333,40.0,53.333,43.333,33.333,36.667,43.333,50.0,40.0,33.333,53.333,40.0,46.667,43.333,53.333,40.0
|
| 8 |
+
Gemini3.0,Direct,87.045,95.0,80.417,91.333,96.667,95.333,96.667,80.0,66.667,93.333,90.0,70.0,90.0,83.333,86.667,80.0,56.667,83.333,83.333,86.667,90.0,73.333,86.667,73.333,90.0,80.0,46.667,83.333,86.667,83.333,86.667
|
| 9 |
+
GPT-5.2,Direct,78.409,87.167,71.111,79.333,94.0,87.333,88.0,73.333,56.667,40.0,83.333,56.667,73.333,80.0,80.0,76.667,56.667,86.667,66.667,56.667,83.333,66.667,90.0,60.0,66.667,56.667,30.0,96.667,96.667,80.0,93.333
|
| 10 |
+
gpt-5.2,Direct,83.333,88.333,79.167,80.0,92.667,90.0,90.667,80.0,63.333,86.667,93.333,70.0,90.0,60.0,83.333,83.333,53.333,80.0,83.333,73.333,90.0,76.667,90.0,76.667,93.333,50.0,60.0,86.667,93.333,86.667,96.667
|
| 11 |
+
LLaVA-OneVision-1.5-8B,Direct,29.924,26.333,32.917,34.0,22.0,20.667,28.667,33.333,23.333,26.667,43.333,16.667,33.333,23.333,33.333,23.333,23.333,36.667,43.333,33.333,26.667,36.667,43.333,46.667,33.333,43.333,43.333,23.333,40.0,26.667,33.333
|
| 12 |
+
LLaVA-OV-7B,Direct,27.348,26.5,28.056,30.667,26.667,26.0,22.667,46.667,26.667,30.0,33.333,26.667,30.0,26.667,23.333,26.667,23.333,30.0,33.333,36.667,16.667,23.333,16.667,33.333,23.333,33.333,36.667,33.333,13.333,20.0,30.0
|
| 13 |
+
Pangea-7B,Direct,19.545,23.0,16.667,28.0,24.667,20.0,19.333,13.333,16.667,23.333,16.667,10.0,3.333,3.333,13.333,10.0,10.0,20.0,13.333,16.667,20.0,33.333,26.667,13.333,20.0,20.0,30.0,23.333,13.333,16.667,13.333
|
| 14 |
+
Sarashina2-V-14B,Direct,30.682,32.333,29.306,32.0,27.333,31.333,38.667,33.333,33.333,23.333,30.0,20.0,23.333,20.0,26.667,26.667,36.667,36.667,33.333,43.333,30.0,20.0,30.0,20.0,43.333,20.0,20.0,40.0,26.667,23.333,43.333
|
| 15 |
+
Sarashina2-V-8B,Direct,27.879,27.0,28.611,29.333,24.0,28.0,26.667,40.0,30.0,26.667,36.667,26.667,23.333,16.667,23.333,36.667,26.667,33.333,40.0,23.333,20.0,26.667,13.333,26.667,40.0,20.0,20.0,40.0,30.0,26.667,40.0
|
| 16 |
+
Sarashina2.2-V-3B,Direct,38.03,40.167,36.25,42.0,29.333,36.0,53.333,36.667,50.0,36.667,53.333,33.333,33.333,30.0,50.0,43.333,40.0,43.333,46.667,33.333,30.0,20.0,30.0,33.333,36.667,26.667,26.667,36.667,23.333,33.333,43.333
|
| 17 |
+
Aya-Vision-8B,CoT,26.742,27.0,26.528,22.0,30.667,26.667,28.667,23.333,13.333,23.333,40.0,26.667,13.333,13.333,30.0,36.667,26.667,30.0,26.667,33.333,26.667,43.333,30.0,30.0,16.667,36.667,26.667,26.667,16.667,20.0,26.667
|
| 18 |
+
Heron-NVILA-Lite-15B,CoT,5.303,1.0,8.889,1.333,1.333,0.667,0.667,10.0,13.333,0.0,3.333,10.0,6.667,0.0,10.0,6.667,3.333,3.333,23.333,6.667,13.333,0.0,10.0,10.0,10.0,23.333,10.0,3.333,0.0,16.667,20.0
|
| 19 |
+
Phi-4-multimodal,CoT,24.167,22.0,25.972,19.333,18.667,17.333,32.667,33.333,20.0,26.667,36.667,20.0,13.333,16.667,30.0,10.0,20.0,43.333,40.0,33.333,26.667,23.333,26.667,26.667,20.0,23.333,23.333,33.333,20.0,23.333,33.333
|
| 20 |
+
InternVL2.5-8B,CoT,31.212,29.0,33.056,29.333,22.667,28.0,36.0,33.333,26.667,23.333,43.333,36.667,26.667,43.333,40.0,40.0,23.333,40.0,33.333,33.333,36.667,26.667,36.667,16.667,30.0,40.0,13.333,40.0,40.0,23.333,46.667
|
| 21 |
+
Qwen2.5-VL-7B,CoT,45.0,46.667,43.611,34.0,41.333,43.333,68.0,43.333,30.0,33.333,83.333,40.0,36.667,53.333,36.667,33.333,33.333,46.667,46.667,26.667,53.333,33.333,50.0,36.667,43.333,36.667,40.0,60.0,43.333,43.333,63.333
|
| 22 |
+
Qwen3-VL-8B,CoT,47.273,47.5,47.083,42.667,34.0,44.667,68.667,53.333,53.333,10.0,63.333,36.667,43.333,40.0,43.333,46.667,36.667,60.0,33.333,30.0,43.333,56.667,80.0,23.333,56.667,36.667,26.667,63.333,66.667,66.667,60.0
|
| 23 |
+
LLaVA-OneVision-1.5-8B,CoT,31.97,28.0,35.278,28.0,20.667,29.333,34.0,30.0,16.667,40.0,46.667,36.667,23.333,30.0,50.0,30.0,20.0,36.667,43.333,43.333,60.0,23.333,30.0,40.0,36.667,46.667,30.0,26.667,33.333,26.667,46.667
|
| 24 |
+
LLaVA-OV-7B,CoT,14.091,14.333,13.889,11.333,14.0,12.667,19.333,13.333,13.333,6.667,13.333,6.667,13.333,13.333,10.0,23.333,0.0,20.0,3.333,10.0,33.333,3.333,13.333,13.333,26.667,16.667,20.0,20.0,10.0,10.0,20.0
|
| 25 |
+
Pangea-7B,CoT,23.409,21.667,24.861,22.0,22.0,24.0,18.667,13.333,26.667,26.667,33.333,16.667,10.0,23.333,23.333,40.0,20.0,26.667,33.333,30.0,30.0,20.0,30.0,36.667,23.333,13.333,16.667,36.667,20.0,20.0,26.667
|
| 26 |
+
Sarashina2-V-14B,CoT,30.0,30.5,29.583,30.0,26.0,29.333,36.667,36.667,33.333,13.333,40.0,23.333,13.333,13.333,30.0,33.333,36.667,43.333,36.667,46.667,33.333,20.0,23.333,30.0,36.667,20.0,23.333,43.333,26.667,20.0,33.333
|
| 27 |
+
Sarashina2-V-8B,CoT,27.273,25.333,28.889,24.0,24.667,31.333,21.333,36.667,23.333,26.667,36.667,23.333,23.333,16.667,30.0,36.667,26.667,30.0,43.333,23.333,30.0,36.667,16.667,26.667,30.0,20.0,16.667,40.0,23.333,30.0,46.667
|
| 28 |
+
Sarashina2.2-V-3B,CoT,42.879,54.0,33.611,48.0,51.333,51.333,65.333,36.667,40.0,23.333,63.333,30.0,13.333,23.333,30.0,33.333,26.667,50.0,36.667,23.333,43.333,16.667,26.667,33.333,30.0,20.0,23.333,53.333,43.333,40.0,46.667
|