Commit ca9ece0
1 Parent(s): a88d51c

update

Files changed:
- backend-cli.py +1 -1
- completed-cli.py +45 -1
- src/display/utils.py +2 -0
- src/leaderboard/read_evals.py +74 -18
backend-cli.py
CHANGED
@@ -103,7 +103,7 @@ def process_finished_requests() -> bool:
     random.shuffle(eval_requests)
 
     from src.leaderboard.read_evals import get_raw_eval_results
-    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
+    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
 
     result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
     result_name_to_result = {r.eval_name: r for r in eval_results}
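Editor's note on the one-line change above: the third positional argument True corresponds to the new is_backend flag that this commit adds to get_raw_eval_results in src/leaderboard/read_evals.py (see the last file in this diff). Below is a minimal, self-contained sketch of that kind of flag-based dispatch; load_results and the two toy parsers are hypothetical stand-ins, not code from this repository.

from typing import Callable

def load_results(path: str,
                 parse_default: Callable[[str], dict],
                 parse_backend: Callable[[str], dict],
                 is_backend: bool = False) -> dict:
    # Pick one of two parsers based on the boolean flag, mirroring how
    # get_raw_eval_results(..., True) is expected to behave after this commit.
    parser = parse_backend if is_backend else parse_default
    return parser(path)

if __name__ == "__main__":
    default = lambda p: {"parser": "default", "path": p}
    backend = lambda p: {"parser": "backend", "path": p}
    print(load_results("results.json", default, backend, True))  # selects the backend parser

Passing the flag positionally works, but a keyword call (is_backend=True) would make the intent clearer at the call site.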
completed-cli.py
CHANGED
@@ -45,16 +45,59 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_finished_requests() -> bool:
     current_finished_status = [FINISHED_STATUS]
 
+    if False:
+        import os
+        import dateutil
+        model_result_filepaths = []
+        results_path = f'{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B'
+        requests_path = f'{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json'
+
+        for root, _, files in os.walk(results_path):
+            # We should only have json files in model results
+            if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+                continue
+
+            # Sort the files by date
+            try:
+                files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+            except dateutil.parser._parser.ParserError:
+                files = [files[-1]]
+
+            for file in files:
+                model_result_filepaths.append(os.path.join(root, file))
+
+        eval_results = {}
+        for model_result_filepath in model_result_filepaths:
+            # Creation of result
+            eval_result = EvalResult.init_from_json_file(model_result_filepath)
+            eval_result.update_with_request_file(requests_path)
+
+            print('XXX', eval_result)
+
+            # Store results of same eval together
+            eval_name = eval_result.eval_name
+            if eval_name in eval_results.keys():
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
+
+        print(eval_results)
+
+        return True
+
     # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     # Sort the evals by priority (first submitted first run)
     eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
 
+    # XXX
+    # eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
+
     import random
     random.shuffle(eval_requests)
 
     from src.leaderboard.read_evals import get_raw_eval_results
-    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
+    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
 
     result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
     result_name_to_result = {r.eval_name: r for r in eval_results}
@@ -73,6 +116,7 @@ def process_finished_requests() -> bool:
         if eval_result is None or task_name not in eval_result.results:
             eval_request: EvalRequest = result_name_to_request[result_name]
 
+            # print(eval_result)
             print(result_name, 'is incomplete -- missing task:', task_name)
 
 
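Editor's note: the disabled if False: block above sorts per-model result files by the timestamp embedded in their names before parsing them. A small standalone sketch of that sort key follows; the filenames are invented for illustration and assume the results_<timestamp>.json naming implied by the diff's lambda (Python 3.9+ for removeprefix/removesuffix).

# Hypothetical result filenames; the real ones live under EVAL_RESULTS_PATH_BACKEND.
files = [
    "results_2023-11-21T10-15-30.123456.json",
    "results_2023-11-20T08-00-00.000000.json",
]

# Same key as in the diff: drop the ".json" suffix and the "results_" prefix,
# then trim the last 7 characters (the ".123456" microseconds part), leaving a
# lexicographically sortable "YYYY-MM-DDTHH-MM-SS" string.
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
print(files)  # the 2023-11-20 file now comes first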
src/display/utils.py
CHANGED
@@ -25,6 +25,8 @@ class Tasks(Enum):
     # drop = Task("drop", "f1", "DROP")
     nqopen = Task("nq_open", "em", "NQ Open")
     triviaqa = Task("triviaqa", "em", "TriviaQA")
+    #truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1")
+    #truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
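Editor's note: the two commented-out lines keep TruthfulQA MC1/MC2 out of the task list for now. For readers unfamiliar with this file, here is a standalone sketch of the Task/Tasks pattern it relies on; the field names (benchmark, metric, col_name) are assumed to match the upstream leaderboard code and may differ slightly in this repository.

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str   # key expected in the results JSON, e.g. "nq_open" (assumed field name)
    metric: str      # metric to read for that benchmark, e.g. "em" (assumed field name)
    col_name: str    # user-facing column name on the leaderboard (assumed field name)

class Tasks(Enum):
    nqopen = Task("nq_open", "em", "NQ Open")
    triviaqa = Task("triviaqa", "em", "TriviaQA")
    # truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1")  # disabled, as in the diff
    # truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")

for task in Tasks:
    t = task.value
    print(f"{t.col_name}: read metric '{t.metric}' from results['{t.benchmark}']")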
src/leaderboard/read_evals.py
CHANGED
@@ -69,23 +69,78 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-            [8 deleted lines; content not captured in this view]
+
+            def post_process_results(results: dict) -> dict:
+                # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
+                res_copy = results.copy()
+
+                for task_name in res_copy.keys():
+                    entry_copy = results[task_name].copy()
+
+                    for k, v in entry_copy.items():
+                        if "exact_match" in k:
+                            results[task_name][k.replace("exact_match", "em")] = v
+
+                    entry_copy = results[task_name].copy()
+
+                    for k, v in entry_copy.items():
+                        if "," in k:
+                            tokens = k.split(",")
+                            results[task_name][tokens[0]] = v
+
+                return results
+
+            accs = np.array([v.get(task.metric, None) for k, v in post_process_results(data["results"]).items() if task.benchmark in k])
+
+            if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            [5 deleted lines; content not captured in this view]
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
+                          precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
+                          architecture=architecture)
+
+    @staticmethod
+    def init_from_json_file_backend(json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        # We manage the legacy config format
+        config = data.get("config", data.get("config_general", None))
+
+        # Precision
+        precision = Precision.from_str(config.get("model_dtype"))
+
+        # Get model and org
+        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = org_and_model.split("/", 1)
+
+        if len(org_and_model) == 1:
+            org = None
+            model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
+        else:
+            org = org_and_model[0]
+            model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
+        full_model = "/".join(org_and_model)
+
+        still_on_hub, error, model_config = \
+            is_model_on_hub(full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False)
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
 
-            [1 deleted line; content not captured in this view]
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        from src.backend.envs import Tasks as BackendTasks
+        for task in BackendTasks:
+            task = task.value
 
             def post_process_results(results: dict) -> dict:
                 # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
@@ -191,7 +246,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -212,7 +267,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        if is_backend:
+            eval_result = EvalResult.init_from_json_file_backend(model_result_filepath)
+        else:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
@@ -222,8 +280,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         else:
             eval_results[eval_name] = eval_result
 
-        # breakpoint()
-
     results = []
     for v in eval_results.values():
         results.append(v)
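Editor's note: to make the effect of the nested post_process_results helper added in init_from_json_file (and init_from_json_file_backend) concrete, here is a usage sketch. The function body is copied from the diff (with added comments); the sample results dict and the interpretation in the comments (normalizing harness-style metric keys such as "exact_match,strict-match" into plain "em") are illustrative assumptions.

def post_process_results(results: dict) -> dict:
    res_copy = results.copy()
    for task_name in res_copy.keys():
        entry_copy = results[task_name].copy()
        # Add "em" aliases for every key containing "exact_match".
        for k, v in entry_copy.items():
            if "exact_match" in k:
                results[task_name][k.replace("exact_match", "em")] = v
        entry_copy = results[task_name].copy()
        # Add comma-free aliases, e.g. "em,strict-match" -> "em".
        for k, v in entry_copy.items():
            if "," in k:
                tokens = k.split(",")
                results[task_name][tokens[0]] = v
    return results

# Hypothetical harness-style entry for one task.
sample = {"nq_open": {"exact_match,strict-match": 0.018, "exact_match_stderr,strict-match": 0.002}}
processed = post_process_results(sample)
# The original keys are kept; "em,strict-match", "em_stderr,strict-match" and the
# comma-free "exact_match", "exact_match_stderr", "em", "em_stderr" are added,
# so a later lookup of task.metric == "em" can succeed.
print(sorted(processed["nq_open"].keys()))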