Possible corruption of weights

#2
by boydcheung - opened

I used swift to load your model and ran an evaluation on VSI-Bench. I got the following result:

{
  "results": {
    "vsibench": {
      "alias": "vsibench",
      "vsibench_score,none": {
        "obj_appearance_order_accuracy": 0.2022653721682848,
        "object_abs_distance_MRA:.5:.95:.05": 0.03501199040767387,
        "object_counting_MRA:.5:.95:.05": 0.0895575221238938,
        "object_rel_distance_accuracy": 0.10845070422535211,
        "object_size_estimation_MRA:.5:.95:.05": 0.003987408184679958,
        "room_size_estimation_MRA:.5:.95:.05": 0.003125,
        "route_planning_accuracy": 0.07731958762886598,
        "object_rel_direction_accuracy": 0.086771403831567,
        "overall": 0.07581112357128969
      },
      "vsibench_score_stderr,none": "N/A"
    }
  },
  "group_subtasks": {
    "vsibench": []
  },
  "configs": {
    "vsibench": {
      "task": "vsibench",
      "dataset_path": "nyu-visionx/VSI-Bench",
      "dataset_kwargs": {
        "token": true
      },
      "test_split": "test",
      "full_docs": false,
      "process_results_use_image": false,
      "process_docs": "<function process_docs at 0x7f9ce9f61300>",
      "doc_to_visual": "<function vsibench_doc_to_visual at 0x7f9ce9f61c60>",
      "doc_to_text": "<function vsibench_doc_to_text at 0x7f9ce9f622a0>",
      "doc_to_target": "ground_truth",
      "process_results": "<function vsibench_process_results at 0x7f9ce9f62de0>",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "vsibench_score",
          "aggregation": "<function vsibench_aggregate_results at 0x7f9ce9f63560>",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "max_new_tokens": 16,
        "temperature": 0.0,
        "top_p": 1.0,
        "num_beams": 1,
        "do_sample": false,
        "until": [
          "\n\n"
        ]
      },
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": [
        {
          "version": 0.0
        }
      ],
      "lmms_eval_specific_kwargs": {
        "default": {
          "pre_prompt": "",
          "mca_post_prompt": "Answer with the option's letter from the given choices directly.",
          "na_post_prompt": "Please answer the question using a single word or phrase."
        },
        "gemini_api": {
          "pre_prompt": "",
          "mca_post_prompt": "Answer with the option's letter from the given choices directly.",
          "na_post_prompt": "Do not response anything other than a single number!"
        },
        "gpt4v": {
          "pre_prompt": "",
          "mca_post_prompt": "Answer with the option's letter from the given choices directly.",
          "na_post_prompt": "Do not response anything other than a single number!"
        },
        "pre_prompt": "",
        "mca_post_prompt": "Answer with the option's letter from the given choices directly.",
        "na_post_prompt": "Please answer the question using a single word or phrase."
      }
    }
  },
  "versions": {
    "vsibench": "Yaml"
  },
  "n-shot": {
    "vsibench": 0
  },
  "higher_is_better": {
    "vsibench": {
      "vsibench_score": true
    }
  },
  "n-samples": {
    "vsibench": {
      "original": 5130,
      "effective": 5130
    }
  },
  "config": {
    "model": "qwen2_5_vl",
    "model_args": "pretrained=/mnt/vlm/common/models/X-Humanoid/Pelican1.0-VL-7B/",
    "batch_size": 1,
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": "",
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "c894e588",
  "date": "20251202_120508",
  "task_hashes": {
    "vsibench": "60e3948f7b6e8072d0fce80ade35b2adc712bfb3bb628c15b145701c3a947076"
  },
  "model_source": "qwen2_5_vl",
  "model_name": "/mnt/vlm/common/models/X-Humanoid/Pelican1.0-VL-7B/",
  "model_name_sanitized": "Pelican1.0-VL-7B__",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 1651138.314116108,
  "end_time": 1693089.558756093,
  "total_evaluation_time_seconds": "41951.24463998503"
}

The model output also looks unexpected:

{"doc_id": 0, "target": "4", "filtered_resps": ["3chtsvalidators Catalyst zupełnesses לעבור whale까요? readOnlychts נכПравหร"], "doc_hash": "74234e98afe7498fb5daf1f36ac2d78acc339464f950703b8c019892f982b90b", "vsibench_score": {"id": 0, "dataset": "arkitscenes", "scene_name": "41069025", "question_type": "object_counting", "question": "How many table(s) are in this room?", "ground_truth": "4", "options": null, "pruned": false, "prediction": "3chtsvalidators Catalyst zupełnesses לעבור whale까요? readOnlychts נכПравหร", "MRA:.5:.95:.05": 0.0}, "input": "These are frames of a video.\nHow many table(s) are in this room?\nPlease answer the question using a single word or phrase."}

Could you confirm that the weights are correct?

Sign up or log in to comment