Spaces:

JMMMU
/

JMMMU-Pro_Leaderboard

Running

App Files Community

AtsuMiyai committed on Dec 16, 2025

Commit

9f962ea

1 Parent(s): 3136a24

update

Browse files

Files changed (3) hide show

app.py +25 -238
constants.py +21 -10
result.csv +28 -0

app.py CHANGED Viewed

@@ -1,219 +1,30 @@
-__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
 import gradio as gr
 import pandas as pd
-import re
-import pandas as pd
-import numpy as np
-from collections import defaultdict
-from constants import *
-import os
-from huggingface_hub import Repository
-import json
-global data_component, filter_component
-TOKEN = os.environ.get("TOKEN")
-repo = Repository(local_dir="./download_from_dataset", clone_from="JMMMU/leaderboard_result", repo_type="dataset", use_auth_token=TOKEN)
-current_directory = os.getcwd()
-def validate_model_size(s):
-    pattern = r'^\d+B$|^-$'
-    if re.match(pattern, s):
-        return s
-    else:
-        return '-'
-def upload_file(files):
-    file_paths = [file.name for file in files]
-    return file_paths
-def get_acc(data, subject_list):
-    acc = 0
-    for subject in subject_list:
-        acc += data["results"][subject]['jmmmu_acc,none']
-    acc = acc/len(subject_list)
-    acc = acc * 100
-    acc = round(acc, 1)
-    return acc
-def calculate_score(input_file):
-    json_string = input_file.decode('utf-8')
-    data = json.loads(json_string)
-    result_dict = {}
-    overall = data["results"]["jmmmu"]['jmmmu_acc,none']*100
-    ca = data["results"]["culture_agnostic"]['jmmmu_acc,none']*100
-    cs = data["results"]["culture_specific"]['jmmmu_acc,none']*100
-    overall = round(overall, 1)
-    ca = round(ca, 1)
-    cs = round(cs, 1)
-    # Art_Psychology
-    art_psychology_subject_list = ["jmmmu_design", "jmmmu_music", "jmmmu_psychology"]
-    # Science
-    science_subject_list = ["jmmmu_biology", "jmmmu_chemistry", "jmmmu_physics", "jmmmu_math"]
-    # Business
-    business_subject_list = ["jmmmu_accounting", "jmmmu_economics", "jmmmu_finance", "jmmmu_manage", "jmmmu_marketing"]
-    # Medicine
-    medicine_subject_list = ["jmmmu_basic_medical_science", "jmmmu_clinical_medicine", "jmmmu_diagnostics_and_laboratory_medicine", "jmmmu_pharmacy", "jmmmu_public_health"]
-    # Tech_Eng.
-    tech_eng_subject_list = ["jmmmu_agriculture", "jmmmu_architecture_and_engineering", "jmmmu_computer_science", "jmmmu_electronics", "jmmmu_energy_and_power", "jmmmu_materials", "jmmmu_mechanical_engineering"]
-    jmmmu_japanese_art_subject_list = ["jmmmu_japanese_art"]
-    jmmmu_japanese_heritage_subject_list = ["jmmmu_japanese_heritage"]
-    jmmmu_japanese_history_subject_list = ["jmmmu_japanese_history"]
-    jmmmu_world_history_subject_list = ["jmmmu_world_history"]
-    art_psychology = get_acc(data, art_psychology_subject_list)
-    science = get_acc(data, science_subject_list)
-    business = get_acc(data, business_subject_list)
-    medicine = get_acc(data, medicine_subject_list)
-    tech_eng = get_acc(data, tech_eng_subject_list)
-    japanese_art = get_acc(data, jmmmu_japanese_art_subject_list)
-    japanese_heritage = get_acc(data, jmmmu_japanese_heritage_subject_list)
-    japanese_history = get_acc(data, jmmmu_japanese_history_subject_list)
-    world_history = get_acc(data, jmmmu_world_history_subject_list)
-    result_dict =\
-        {
-            "overall": overall,
-            "cultureSpecific": cs,
-            "cultureAgnostic": ca,
-            "japaneseArt": japanese_art,
-            "japaneseHeritage": japanese_heritage,
-            "japaneseHistory": japanese_history,
-            "worldHistory": world_history,
-            "artPsychology": art_psychology,
-            "business": business,
-            "science": science,
-            "healthMedicine": medicine,
-            "techEngineering": tech_eng
-        }
-    return result_dict
-def add_new_eval(
-    input_file,
-    model_type: str,
-    model_name_textbox: str,
-    revision_name_textbox: str,
-    model_link: str,
-    model_size: str,
-    # upd_type: str,
-    # question_type: str
-):
-    if input_file is None:
-        warning_text = "Error! Empty file!"
-        print(warning_text)
-        return warning_text
-    else:
-        model_size = validate_model_size(model_size)
-        # if upd_type == 'AAD':
-        csv_path = CSV_RESULT_PATH
-        # validity_check(input_file)
-        csv_data = pd.read_csv(csv_path)
-        result_dict = calculate_score(input_file)
-        if revision_name_textbox == '':
-            col = csv_data.shape[0]
-            model_name = model_name_textbox
-        else:
-            model_name = revision_name_textbox
-            model_name_list = csv_data['Model']
-            name_list = [name.split(']')[0][1:] for name in model_name_list]
-            if revision_name_textbox not in name_list:
-                col = csv_data.shape[0]
-            else:
-                col = name_list.index(revision_name_textbox)
-        model_name_wo_link = model_name
-        if model_link == '':
-            model_name = model_name  # no url
-        else:
-            model_name = '[' + model_name + '](' + model_link + ')'
-        # add new data
-        new_data = [
-            model_type,
-            model_name,
-            model_size,
-            result_dict["overall"],
-            result_dict["cultureSpecific"],
-            result_dict["cultureAgnostic"],
-            result_dict["japaneseArt"],
-            result_dict["japaneseHeritage"],
-            result_dict["japaneseHistory"],
-            result_dict["worldHistory"],
-            result_dict["artPsychology"],
-            result_dict["business"],
-            result_dict["science"],
-            result_dict["healthMedicine"],
-            result_dict["techEngineering"]
-            ]
-        # If the same data already exists, return an error.
-        if new_data in csv_data.values.tolist():
-            warning_text = "Error! The same data already exists!"
-            print(warning_text)
-            return warning_text
-        # If the same model name already exists, return an error.
-        elif new_data[:5] in csv_data.values.tolist():
-            warning_text = "Error! The same data already exists! Please fill revision_name."
-            print(warning_text)
-            return warning_text
-        csv_data.loc[col] = new_data
-        csv_data = csv_data.to_csv(csv_path, index=False)
-        absolute_result_path = os.path.abspath(csv_path)
-        if not os.path.exists(absolute_result_path):
-            raise FileNotFoundError(f"File {absolute_result_path} not found")
-        repo.git_pull()
-        repo.git_add(absolute_result_path)
-        save_path = os.path.join(CSV_QUEUE_DIR, f"{model_name_wo_link}.json")
-        with open(save_path, "wb") as f:
-            f.write(input_file)
-        absolute_queue_path = os.path.abspath(save_path)
-        repo.git_add(absolute_queue_path)
-        repo.git_commit(f"add {model_name_wo_link} results")
-        repo.git_push()
-        print(f"Success! Your {model_name_wo_link} has been added!")
-    return 0
-def get_baseline_df():
-    repo.git_pull()
     df = pd.read_csv(CSV_RESULT_PATH)
     df = df.sort_values(by="Overall", ascending=False)
-    present_columns = MODEL_INFO + checkbox_group.value
     df = df[present_columns]
     return df
 def get_all_df():
-    repo.git_pull()
     df = pd.read_csv(CSV_RESULT_PATH)
     df = df.sort_values(by="Overall", ascending=False)
     return df
 block = gr.Blocks()
@@ -230,18 +41,9 @@ with block:
                 value=AVG_INFO,
                 label="Evaluation Dimension",
                 interactive=True,
-            ) # user can select the evaluation dimension
-            with gr.Row():
-                # selection for model size part:
-                model_size = gr.CheckboxGroup(
-                    choices=MODEL_SIZE,
-                    value=MODEL_SIZE,
-                    label="Model Size",
-                    interactive=True,
-                )
-            baseline_value = get_baseline_df()
             baseline_header = MODEL_INFO + checkbox_group.value
             baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)
@@ -254,35 +56,23 @@ with block:
                 visible=True,
                 )
-            def on_filter_model_size_method_change(selected_model_size, selected_columns):
                 updated_data = get_all_df()
-                # model_size
-                def custom_filter(row, model_size_filters):
-                    model_size = row['Model Size']
-                    model_size = model_size.upper()
-                    if model_size == '-':
-                        size_filter = '-' in model_size_filters
-                    elif 'B' in model_size:
-                        size = float(model_size.replace('B', ''))
-                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
-                    else:
-                        size_filter = False
-                    return size_filter
-                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size)
-                updated_data = updated_data[mask]
                 # columns:
-                selected_columns = [item for item in TASK_INFO if item in selected_columns]
                 present_columns = MODEL_INFO + selected_columns
                 updated_data = updated_data[present_columns]
-                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                 updated_headers = present_columns
-                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
                 filter_component = gr.components.Dataframe(
                     value=updated_data,
@@ -294,13 +84,10 @@ with block:
                     )
                 return filter_component
-            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)
-            checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, checkbox_group], outputs=data_component)
     def refresh_data():
-        value = get_baseline_df()
         return value
     with gr.Row():

+__all__ = ['block']
 import gradio as gr
 import pandas as pd
+from constants import (
+    MODEL_INFO, TASK_INFO, AVG_INFO, DATA_TITILE_TYPE,
+    COLUMN_NAMES, CSV_RESULT_PATH, LEADERBORAD_INTRODUCTION,
+    CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
+)
+def get_baseline_df(selected_columns=None):
+    if selected_columns is None:
+        selected_columns = AVG_INFO
     df = pd.read_csv(CSV_RESULT_PATH)
     df = df.sort_values(by="Overall", ascending=False)
+    present_columns = MODEL_INFO + selected_columns
     df = df[present_columns]
     return df
 def get_all_df():
     df = pd.read_csv(CSV_RESULT_PATH)
     df = df.sort_values(by="Overall", ascending=False)
     return df
 block = gr.Blocks()
                 value=AVG_INFO,
                 label="Evaluation Dimension",
                 interactive=True,
+            )  # user can select the evaluation dimension
+            baseline_value = get_baseline_df(checkbox_group.value)
             baseline_header = MODEL_INFO + checkbox_group.value
             baseline_datatype = ['markdown'] * 2 + ['number'] * len(checkbox_group.value)
                 visible=True,
                 )
+            def on_filter_method_change(selected_columns):
                 updated_data = get_all_df()
                 # columns:
+                selected_columns = [
+                    item for item in TASK_INFO if item in selected_columns
+                ]
                 present_columns = MODEL_INFO + selected_columns
                 updated_data = updated_data[present_columns]
+                updated_data = updated_data.sort_values(
+                    by=selected_columns[0], ascending=False
+                )
                 updated_headers = present_columns
+                update_datatype = [
+                    DATA_TITILE_TYPE[COLUMN_NAMES.index(x)]
+                    for x in updated_headers
+                ]
                 filter_component = gr.components.Dataframe(
                     value=updated_data,
                     )
                 return filter_component
+            checkbox_group.change(fn=on_filter_method_change, inputs=[checkbox_group], outputs=data_component)
     def refresh_data():
+        value = get_baseline_df(checkbox_group.value)
         return value
     with gr.Row():

constants.py CHANGED Viewed

@@ -1,18 +1,29 @@
 # this is .py for store constants
-MODEL_INFO = ["Model Type", "Model"]
-MODEL_SIZE = ["<10B", ">=10B", "-"]
-LEADERBOARD_VERSION = ["Version1"]
-TASK_INFO = ["Overall", "Culture-Specific", "Culture-Agnostic", "Japanese Art", "Japanese Heritage", "Japanese History", "World History", "Art & Psychology", "Business", "Science", "Health & Medicine", "Tech & Engineering"]
-# Overall, Culture-Specific, Culture-Agnostic, English Original, Japanese Art, Japanese Heritage, Japanese History, World History, Art & Psychology, Business, Science, Health & Medicine, Tech & Engineering
 AVG_INFO = ["Overall"]
-DATA_TITILE_TYPE = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
-CSV_RESULT_PATH = "./download_from_dataset/result.csv"
 COLUMN_NAMES = MODEL_INFO + TASK_INFO
-LEADERBORAD_VERSION = ["JMMMU"]
 LEADERBORAD_INTRODUCTION = """
@@ -34,7 +45,7 @@ Following the evolution from MMMU to MMMU-Pro, JMMMU-Pro extends JMMMU by compos
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""@article{miyai2025jmmmu-pro,
   author    = {Miyai, Atsuyuki and Onohara, Shota and Baek, Jeonghun and Aizawa, Kiyoharu},
-  title     = {JMMMU-Pro: Vibe Benchmark Construction of Image-based Japanese Multi-discipline Multimodal Understanding Benchmark},
   journal   = {TBD},
   year      = {2025},
 }"""

 # this is .py for store constants
+MODEL_INFO = ["model_name", "prompt"]
+TASK_INFO = [
+    "Overall", "culture-specific", "culture-agnostic",
+    "Japanese_Art", "Japanese_Heritage", "Japanese_History",
+    "World_History", "Accounting", "Agriculture",
+    "Architecture_and_Engineering", "Basic_Medical_Science",
+    "Biology", "Chemistry", "Clinical_Medicine",
+    "Computer_Science", "Design",
+    "Diagnostics_and_Laboratory_Medicine", "Economics",
+    "Electronics", "Energy_and_Power", "Finance", "Manage",
+    "Marketing", "Materials", "Math", "Mechanical_Engineering",
+    "Music", "Pharmacy", "Physics", "Psychology", "Public_Health"
+]
 AVG_INFO = ["Overall"]
+# Data types for each column:
+# model_name (markdown), prompt (markdown), then all numbers
+DATA_TITILE_TYPE = (
+    ["markdown", "markdown"] + ["number"] * (len(TASK_INFO))
+)
+CSV_RESULT_PATH = "./result.csv"
 COLUMN_NAMES = MODEL_INFO + TASK_INFO
+LEADERBORAD_VERSION = ["JMMMU_Pro"]
 LEADERBORAD_INTRODUCTION = """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""@article{miyai2025jmmmu-pro,
   author    = {Miyai, Atsuyuki and Onohara, Shota and Baek, Jeonghun and Aizawa, Kiyoharu},
+  title     = {JMMMU-Pro: Image-based Japanese Multi-discipline Multimodal Understanding Benchmark via Vibe Benchmark Construction},
   journal   = {TBD},
   year      = {2025},
 }"""

result.csv ADDED Viewed

	@@ -0,0 +1,28 @@

+model_name,prompt,Overall,culture-specific,culture-agnostic,Japanese_Art,Japanese_Heritage,Japanese_History,World_History,Accounting,Agriculture,Architecture_and_Engineering,Basic_Medical_Science,Biology,Chemistry,Clinical_Medicine,Computer_Science,Design,Diagnostics_and_Laboratory_Medicine,Economics,Electronics,Energy_and_Power,Finance,Manage,Marketing,Materials,Math,Mechanical_Engineering,Music,Pharmacy,Physics,Psychology,Public_Health
+Aya-Vision-8B,Direct,22.424,23.833,21.25,26.0,26.667,20.0,22.667,30.0,20.0,20.0,30.0,16.667,20.0,26.667,13.333,23.333,23.333,26.667,10.0,23.333,26.667,30.0,16.667,10.0,20.0,20.0,16.667,20.0,16.667,20.0,30.0
+Heron-NVILA-Lite-15B,Direct,26.97,26.667,27.222,26.0,27.333,31.333,22.0,23.333,30.0,30.0,33.333,26.667,20.0,23.333,33.333,40.0,30.0,30.0,10.0,30.0,26.667,26.667,16.667,26.667,30.0,23.333,13.333,26.667,30.0,23.333,50.0
+Phi-4-multimodal,Direct,31.818,28.833,34.306,29.333,18.0,30.0,38.0,26.667,33.333,20.0,46.667,23.333,26.667,30.0,56.667,20.0,13.333,43.333,50.0,40.0,20.0,30.0,43.333,46.667,33.333,46.667,33.333,36.667,26.667,33.333,43.333
+InternVL2.5-8B,Direct,25.076,23.833,26.111,26.0,18.0,24.667,26.667,30.0,13.333,30.0,30.0,23.333,23.333,20.0,30.0,23.333,33.333,26.667,26.667,36.667,26.667,30.0,33.333,16.667,23.333,36.667,26.667,13.333,30.0,23.333,20.0
+Qwen2.5-VL-7B,Direct,44.697,50.167,40.139,42.667,44.0,38.667,75.333,30.0,53.333,30.0,60.0,40.0,36.667,36.667,50.0,53.333,40.0,30.0,43.333,33.333,46.667,23.333,36.667,33.333,36.667,33.333,36.667,53.333,33.333,50.0,43.333
+Qwen3-VL-8B,Direct,45.833,47.0,44.861,39.333,37.333,40.667,70.667,50.0,53.333,46.667,73.333,40.0,26.667,40.0,43.333,53.333,40.0,53.333,43.333,33.333,36.667,43.333,50.0,40.0,33.333,53.333,40.0,46.667,43.333,53.333,40.0
+Gemini3.0,Direct,87.045,95.0,80.417,91.333,96.667,95.333,96.667,80.0,66.667,93.333,90.0,70.0,90.0,83.333,86.667,80.0,56.667,83.333,83.333,86.667,90.0,73.333,86.667,73.333,90.0,80.0,46.667,83.333,86.667,83.333,86.667
+GPT-5.2,Direct,78.409,87.167,71.111,79.333,94.0,87.333,88.0,73.333,56.667,40.0,83.333,56.667,73.333,80.0,80.0,76.667,56.667,86.667,66.667,56.667,83.333,66.667,90.0,60.0,66.667,56.667,30.0,96.667,96.667,80.0,93.333
+gpt-5.2,Direct,83.333,88.333,79.167,80.0,92.667,90.0,90.667,80.0,63.333,86.667,93.333,70.0,90.0,60.0,83.333,83.333,53.333,80.0,83.333,73.333,90.0,76.667,90.0,76.667,93.333,50.0,60.0,86.667,93.333,86.667,96.667
+LLaVA-OneVision-1.5-8B,Direct,29.924,26.333,32.917,34.0,22.0,20.667,28.667,33.333,23.333,26.667,43.333,16.667,33.333,23.333,33.333,23.333,23.333,36.667,43.333,33.333,26.667,36.667,43.333,46.667,33.333,43.333,43.333,23.333,40.0,26.667,33.333
+LLaVA-OV-7B,Direct,27.348,26.5,28.056,30.667,26.667,26.0,22.667,46.667,26.667,30.0,33.333,26.667,30.0,26.667,23.333,26.667,23.333,30.0,33.333,36.667,16.667,23.333,16.667,33.333,23.333,33.333,36.667,33.333,13.333,20.0,30.0
+Pangea-7B,Direct,19.545,23.0,16.667,28.0,24.667,20.0,19.333,13.333,16.667,23.333,16.667,10.0,3.333,3.333,13.333,10.0,10.0,20.0,13.333,16.667,20.0,33.333,26.667,13.333,20.0,20.0,30.0,23.333,13.333,16.667,13.333
+Sarashina2-V-14B,Direct,30.682,32.333,29.306,32.0,27.333,31.333,38.667,33.333,33.333,23.333,30.0,20.0,23.333,20.0,26.667,26.667,36.667,36.667,33.333,43.333,30.0,20.0,30.0,20.0,43.333,20.0,20.0,40.0,26.667,23.333,43.333
+Sarashina2-V-8B,Direct,27.879,27.0,28.611,29.333,24.0,28.0,26.667,40.0,30.0,26.667,36.667,26.667,23.333,16.667,23.333,36.667,26.667,33.333,40.0,23.333,20.0,26.667,13.333,26.667,40.0,20.0,20.0,40.0,30.0,26.667,40.0
+Sarashina2.2-V-3B,Direct,38.03,40.167,36.25,42.0,29.333,36.0,53.333,36.667,50.0,36.667,53.333,33.333,33.333,30.0,50.0,43.333,40.0,43.333,46.667,33.333,30.0,20.0,30.0,33.333,36.667,26.667,26.667,36.667,23.333,33.333,43.333
+Aya-Vision-8B,CoT,26.742,27.0,26.528,22.0,30.667,26.667,28.667,23.333,13.333,23.333,40.0,26.667,13.333,13.333,30.0,36.667,26.667,30.0,26.667,33.333,26.667,43.333,30.0,30.0,16.667,36.667,26.667,26.667,16.667,20.0,26.667
+Heron-NVILA-Lite-15B,CoT,5.303,1.0,8.889,1.333,1.333,0.667,0.667,10.0,13.333,0.0,3.333,10.0,6.667,0.0,10.0,6.667,3.333,3.333,23.333,6.667,13.333,0.0,10.0,10.0,10.0,23.333,10.0,3.333,0.0,16.667,20.0
+Phi-4-multimodal,CoT,24.167,22.0,25.972,19.333,18.667,17.333,32.667,33.333,20.0,26.667,36.667,20.0,13.333,16.667,30.0,10.0,20.0,43.333,40.0,33.333,26.667,23.333,26.667,26.667,20.0,23.333,23.333,33.333,20.0,23.333,33.333
+InternVL2.5-8B,CoT,31.212,29.0,33.056,29.333,22.667,28.0,36.0,33.333,26.667,23.333,43.333,36.667,26.667,43.333,40.0,40.0,23.333,40.0,33.333,33.333,36.667,26.667,36.667,16.667,30.0,40.0,13.333,40.0,40.0,23.333,46.667
+Qwen2.5-VL-7B,CoT,45.0,46.667,43.611,34.0,41.333,43.333,68.0,43.333,30.0,33.333,83.333,40.0,36.667,53.333,36.667,33.333,33.333,46.667,46.667,26.667,53.333,33.333,50.0,36.667,43.333,36.667,40.0,60.0,43.333,43.333,63.333
+Qwen3-VL-8B,CoT,47.273,47.5,47.083,42.667,34.0,44.667,68.667,53.333,53.333,10.0,63.333,36.667,43.333,40.0,43.333,46.667,36.667,60.0,33.333,30.0,43.333,56.667,80.0,23.333,56.667,36.667,26.667,63.333,66.667,66.667,60.0
+LLaVA-OneVision-1.5-8B,CoT,31.97,28.0,35.278,28.0,20.667,29.333,34.0,30.0,16.667,40.0,46.667,36.667,23.333,30.0,50.0,30.0,20.0,36.667,43.333,43.333,60.0,23.333,30.0,40.0,36.667,46.667,30.0,26.667,33.333,26.667,46.667
+LLaVA-OV-7B,CoT,14.091,14.333,13.889,11.333,14.0,12.667,19.333,13.333,13.333,6.667,13.333,6.667,13.333,13.333,10.0,23.333,0.0,20.0,3.333,10.0,33.333,3.333,13.333,13.333,26.667,16.667,20.0,20.0,10.0,10.0,20.0
+Pangea-7B,CoT,23.409,21.667,24.861,22.0,22.0,24.0,18.667,13.333,26.667,26.667,33.333,16.667,10.0,23.333,23.333,40.0,20.0,26.667,33.333,30.0,30.0,20.0,30.0,36.667,23.333,13.333,16.667,36.667,20.0,20.0,26.667
+Sarashina2-V-14B,CoT,30.0,30.5,29.583,30.0,26.0,29.333,36.667,36.667,33.333,13.333,40.0,23.333,13.333,13.333,30.0,33.333,36.667,43.333,36.667,46.667,33.333,20.0,23.333,30.0,36.667,20.0,23.333,43.333,26.667,20.0,33.333
+Sarashina2-V-8B,CoT,27.273,25.333,28.889,24.0,24.667,31.333,21.333,36.667,23.333,26.667,36.667,23.333,23.333,16.667,30.0,36.667,26.667,30.0,43.333,23.333,30.0,36.667,16.667,26.667,30.0,20.0,16.667,40.0,23.333,30.0,46.667
+Sarashina2.2-V-3B,CoT,42.879,54.0,33.611,48.0,51.333,51.333,65.333,36.667,40.0,23.333,63.333,30.0,13.333,23.333,30.0,33.333,26.667,50.0,36.667,23.333,43.333,16.667,26.667,33.333,30.0,20.0,23.333,53.333,43.333,40.0,46.667