Possible corruption of weights
#2
by
boydcheung
- opened
I used swift to load your model and evaluate it on VSI-Bench, and got the following result:
{
"results": {
"vsibench": {
"alias": "vsibench",
"vsibench_score,none": {
"obj_appearance_order_accuracy": 0.2022653721682848,
"object_abs_distance_MRA:.5:.95:.05": 0.03501199040767387,
"object_counting_MRA:.5:.95:.05": 0.0895575221238938,
"object_rel_distance_accuracy": 0.10845070422535211,
"object_size_estimation_MRA:.5:.95:.05": 0.003987408184679958,
"room_size_estimation_MRA:.5:.95:.05": 0.003125,
"route_planning_accuracy": 0.07731958762886598,
"object_rel_direction_accuracy": 0.086771403831567,
"overall": 0.07581112357128969
},
"vsibench_score_stderr,none": "N/A"
}
},
"group_subtasks": {
"vsibench": []
},
"configs": {
"vsibench": {
"task": "vsibench",
"dataset_path": "nyu-visionx/VSI-Bench",
"dataset_kwargs": {
"token": true
},
"test_split": "test",
"full_docs": false,
"process_results_use_image": false,
"process_docs": "<function process_docs at 0x7f9ce9f61300>",
"doc_to_visual": "<function vsibench_doc_to_visual at 0x7f9ce9f61c60>",
"doc_to_text": "<function vsibench_doc_to_text at 0x7f9ce9f622a0>",
"doc_to_target": "ground_truth",
"process_results": "<function vsibench_process_results at 0x7f9ce9f62de0>",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "vsibench_score",
"aggregation": "<function vsibench_aggregate_results at 0x7f9ce9f63560>",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"max_new_tokens": 16,
"temperature": 0.0,
"top_p": 1.0,
"num_beams": 1,
"do_sample": false,
"until": [
"\n\n"
]
},
"repeats": 1,
"should_decontaminate": false,
"metadata": [
{
"version": 0.0
}
],
"lmms_eval_specific_kwargs": {
"default": {
"pre_prompt": "",
"mca_post_prompt": "Answer with the option's letter from the given choices directly.",
"na_post_prompt": "Please answer the question using a single word or phrase."
},
"gemini_api": {
"pre_prompt": "",
"mca_post_prompt": "Answer with the option's letter from the given choices directly.",
"na_post_prompt": "Do not response anything other than a single number!"
},
"gpt4v": {
"pre_prompt": "",
"mca_post_prompt": "Answer with the option's letter from the given choices directly.",
"na_post_prompt": "Do not response anything other than a single number!"
},
"pre_prompt": "",
"mca_post_prompt": "Answer with the option's letter from the given choices directly.",
"na_post_prompt": "Please answer the question using a single word or phrase."
}
}
},
"versions": {
"vsibench": "Yaml"
},
"n-shot": {
"vsibench": 0
},
"higher_is_better": {
"vsibench": {
"vsibench_score": true
}
},
"n-samples": {
"vsibench": {
"original": 5130,
"effective": 5130
}
},
"config": {
"model": "qwen2_5_vl",
"model_args": "pretrained=/mnt/vlm/common/models/X-Humanoid/Pelican1.0-VL-7B/",
"batch_size": 1,
"batch_sizes": [],
"device": null,
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": "",
"random_seed": 0,
"numpy_seed": 1234,
"torch_seed": 1234,
"fewshot_seed": 1234
},
"git_hash": "c894e588",
"date": "20251202_120508",
"task_hashes": {
"vsibench": "60e3948f7b6e8072d0fce80ade35b2adc712bfb3bb628c15b145701c3a947076"
},
"model_source": "qwen2_5_vl",
"model_name": "/mnt/vlm/common/models/X-Humanoid/Pelican1.0-VL-7B/",
"model_name_sanitized": "Pelican1.0-VL-7B__",
"system_instruction": null,
"system_instruction_sha": null,
"fewshot_as_multiturn": false,
"chat_template": null,
"chat_template_sha": null,
"start_time": 1651138.314116108,
"end_time": 1693089.558756093,
"total_evaluation_time_seconds": "41951.24463998503"
}
The model output also looks unexpected; for example:
{"doc_id": 0, "target": "4", "filtered_resps": ["3chtsvalidators Catalyst zupełnesses לעבור whale까요? readOnlychts נכПравหร"], "doc_hash": "74234e98afe7498fb5daf1f36ac2d78acc339464f950703b8c019892f982b90b", "vsibench_score": {"id": 0, "dataset": "arkitscenes", "scene_name": "41069025", "question_type": "object_counting", "question": "How many table(s) are in this room?", "ground_truth": "4", "options": null, "pruned": false, "prediction": "3chtsvalidators Catalyst zupełnesses לעבור whale까요? readOnlychts נכПравหร", "MRA:.5:.95:.05": 0.0}, "input": "These are frames of a video.\nHow many table(s) are in this room?\nPlease answer the question using a single word or phrase."}
Could you confirm that the weights are correct?