Spaces:
Sleeping
Sleeping
Patrick Haller
committed on
Commit
·
983ff7e
1
Parent(s):
51a9af1
Init leaderboard
Browse files- app.py +73 -0
- current_results.json +53 -0
- text.py +26 -0
app.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from text import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, TITLE_TEXT, TASK_DESCRIPTION
|
| 8 |
+
|
| 9 |
+
# with open("app.css") as f:
|
| 10 |
+
# css_code = f.read()
|
| 11 |
+
|
| 12 |
+
demo = gr.Blocks()
|
| 13 |
+
|
| 14 |
+
with open("current_results.json") as f:
|
| 15 |
+
result_list = json.load(f)
|
| 16 |
+
|
| 17 |
+
df = pd.DataFrame(result_list)
|
| 18 |
+
|
| 19 |
+
df["Model"] = df.apply(lambda x: f"<a style='text-decoration: underline' href='{x['link']}'>{x['Model']}</a>" if isinstance(x["link"], str) else x["Model"], axis=1)
|
| 20 |
+
|
| 21 |
+
# Sort columns by aoc_original, aoc_leet, euler_original, euler_story
|
| 22 |
+
df = df[["Model", "instruction_only", "aoc_original", "aoc_leet", "euler_original", "euler_story"]]
|
| 23 |
+
|
| 24 |
+
df["instruction_only"] = df["instruction_only"].map({True: 1, False: 0})
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
average_scores = df.iloc[:, 2:].mean(axis=1).round(2)
|
| 28 |
+
|
| 29 |
+
# Replace Column names
|
| 30 |
+
df.columns = ["Model", "Evaluation", "AOC Original",
|
| 31 |
+
"AOC Leet", "Euler Original", "Euler Story"]
|
| 32 |
+
|
| 33 |
+
average_scores = df.iloc[:, 2:].mean(axis=1).round(2)
|
| 34 |
+
df.insert(loc=2, column="⬆️ Average", value=average_scores)
|
| 35 |
+
df = df.sort_values(by=["Evaluation", "⬆️ Average"], ascending=[True, False])
|
| 36 |
+
df["Evaluation"] = df["Evaluation"].map({1: "🔶", 0: "🟩"})
|
| 37 |
+
|
| 38 |
+
with demo:
|
| 39 |
+
gr.HTML(f"<h2 style='text-align: center'>{TITLE_TEXT}</h2>")
|
| 40 |
+
# gr.HTML('<hr>')
|
| 41 |
+
gr.HTML(f"<h3>{INTRODUCTION_TEXT}<h3>")
|
| 42 |
+
gr.HTML('<hr style="border-top: 3px dotted #bbb" class="dotted">')
|
| 43 |
+
|
| 44 |
+
gr.HTML("<h3>📊 Results</h3>")
|
| 45 |
+
gr.components.Dataframe(
|
| 46 |
+
value=df,
|
| 47 |
+
datatype=["html"]
|
| 48 |
+
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
gr.HTML("<h3>Legend</h3>")
|
| 52 |
+
gr.HTML("<p>🔶: Evaluated only on the first part of each AoC day</p>")
|
| 53 |
+
gr.HTML("<p>🟩: Complete Evaluation</p>")
|
| 54 |
+
|
| 55 |
+
# with gr.Row():
|
| 56 |
+
# with gr.Accordion("Task", open=True):
|
| 57 |
+
# with gr.Row():
|
| 58 |
+
# with gr.Column(scale=1):
|
| 59 |
+
# gr.Image("assets/front.png")
|
| 60 |
+
# with gr.Column(scale=4):
|
| 61 |
+
# gr.Markdown(TASK_DESCRIPTION)
|
| 62 |
+
|
| 63 |
+
with gr.Row():
|
| 64 |
+
with gr.Accordion("📙 Citation", open=False):
|
| 65 |
+
citation_button = gr.Textbox(
|
| 66 |
+
value=CITATION_BUTTON_TEXT,
|
| 67 |
+
label=CITATION_BUTTON_LABEL,
|
| 68 |
+
lines=20,
|
| 69 |
+
elem_id="citation-button",
|
| 70 |
+
show_copy_button=True,
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
demo.launch()
|
current_results.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"Model": "Mistral-7B-Instruct-v0.1",
|
| 4 |
+
"euler_original": 0.37,
|
| 5 |
+
"euler_story": 0.12,
|
| 6 |
+
"aoc_original": 3.0,
|
| 7 |
+
"aoc_leet": 3.0,
|
| 8 |
+
"instruction_only": true,
|
| 9 |
+
"link": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1"
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"Model": "Mixtral-8x7B-Instruct-v0.1",
|
| 13 |
+
"euler_original": 2.86,
|
| 14 |
+
"euler_story": 2.23,
|
| 15 |
+
"aoc_original": 8.67,
|
| 16 |
+
"aoc_leet": 8.42,
|
| 17 |
+
"instruction_only": false,
|
| 18 |
+
"link": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"Model": "chat_bison",
|
| 22 |
+
"euler_story": 0.62,
|
| 23 |
+
"euler_original": 2.44,
|
| 24 |
+
"aoc_leet": 13.78,
|
| 25 |
+
"aoc_original": 17.09,
|
| 26 |
+
"instruction_only": false
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"Model": "codechat_bison",
|
| 30 |
+
"euler_story": 2.61,
|
| 31 |
+
"euler_original": 4.59,
|
| 32 |
+
"aoc_original": 21.17,
|
| 33 |
+
"aoc_leet": 17.6,
|
| 34 |
+
"instruction_only": false
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"Model": "WizardCoder-Python-34B-V1.0",
|
| 38 |
+
"aoc_leet": 22.5,
|
| 39 |
+
"aoc_original": 24.0,
|
| 40 |
+
"euler_original": 2.61,
|
| 41 |
+
"euler_story": 2.48,
|
| 42 |
+
"instruction_only": true,
|
| 43 |
+
"link": "https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"Model": "gpt3.5",
|
| 47 |
+
"euler_original": 8.19,
|
| 48 |
+
"euler_story": 6.95,
|
| 49 |
+
"aoc_leet": 29.85,
|
| 50 |
+
"aoc_original": 50.0,
|
| 51 |
+
"instruction_only": false
|
| 52 |
+
}
|
| 53 |
+
]
|
text.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Static UI text for the PECC leaderboard app (imported by app.py).

# Page headline rendered at the top of the leaderboard.
TITLE_TEXT = "PECC - Problem Extraction and Coding Challenges Evaluation Benchmark"

# Short benchmark description rendered under the title.
INTRODUCTION_TEXT = """📄 PECC: An extensive benchmark centered on code generation from narrative-embedded problem descriptions. Unlike prior benchmarks that evaluate code generation using specific instructions, our dataset requires models to comprehend, extract requirements, and produce the essential code for problem-solving. This approach necessitates syntactically accurate programs and demands reading comprehension skills to derive the desired solution."""

# Markdown task description; currently referenced only from commented-out UI code in app.py.
TASK_DESCRIPTION = """## Task Description
The task for the model is to generate directly executable python code.

### Instruction
The model is first prompted with a system prompt, which is a short description of the problem. The model is then asked to generate the python code that solves the problem.

### Task
The model receives the task itself.
"""

# Label shown above the citation textbox.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite our paper"

# BibTeX snippet offered in the citation box; raw string keeps braces/backslashes literal.
CITATION_BUTTON_TEXT = r"""
@misc{pecc,
author = {Patrick Haller and Jonas Golde and Alan Akbik},
title = {PECC - Problem Extraction and Coding Challenges},
year = {2024},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {}
}
"""
|