Spaces:
Sleeping
Sleeping
Patrick Haller
committed on
Commit
·
983ff7e
1
Parent(s):
51a9af1
Init leaderboard
Browse files- app.py +73 -0
- current_results.json +53 -0
- text.py +26 -0
app.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from text import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, TITLE_TEXT, TASK_DESCRIPTION
|
| 8 |
+
|
| 9 |
+
# with open("app.css") as f:
|
| 10 |
+
# css_code = f.read()
|
| 11 |
+
|
| 12 |
+
demo = gr.Blocks()
|
| 13 |
+
|
| 14 |
+
with open("current_results.json") as f:
|
| 15 |
+
result_list = json.load(f)
|
| 16 |
+
|
| 17 |
+
df = pd.DataFrame(result_list)
|
| 18 |
+
|
| 19 |
+
df["Model"] = df.apply(lambda x: f"<a style='text-decoration: underline' href='{x['link']}'>{x['Model']}</a>" if isinstance(x["link"], str) else x["Model"], axis=1)
|
| 20 |
+
|
| 21 |
+
# Sort columns by aoc_original, aoc_leet, euler_original, euler_story
|
| 22 |
+
df = df[["Model", "instruction_only", "aoc_original", "aoc_leet", "euler_original", "euler_story"]]
|
| 23 |
+
|
| 24 |
+
df["instruction_only"] = df["instruction_only"].map({True: 1, False: 0})
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
average_scores = df.iloc[:, 2:].mean(axis=1).round(2)
|
| 28 |
+
|
| 29 |
+
# Replace Column names
|
| 30 |
+
df.columns = ["Model", "Evaluation", "AOC Original",
|
| 31 |
+
"AOC Leet", "Euler Original", "Euler Story"]
|
| 32 |
+
|
| 33 |
+
average_scores = df.iloc[:, 2:].mean(axis=1).round(2)
|
| 34 |
+
df.insert(loc=2, column="⬆️ Average", value=average_scores)
|
| 35 |
+
df = df.sort_values(by=["Evaluation", "⬆️ Average"], ascending=[True, False])
|
| 36 |
+
df["Evaluation"] = df["Evaluation"].map({1: "🔶", 0: "🟩"})
|
| 37 |
+
|
| 38 |
+
with demo:
|
| 39 |
+
gr.HTML(f"<h2 style='text-align: center'>{TITLE_TEXT}</h2>")
|
| 40 |
+
# gr.HTML('<hr>')
|
| 41 |
+
gr.HTML(f"<h3>{INTRODUCTION_TEXT}<h3>")
|
| 42 |
+
gr.HTML('<hr style="border-top: 3px dotted #bbb" class="dotted">')
|
| 43 |
+
|
| 44 |
+
gr.HTML("<h3>📊 Results</h3>")
|
| 45 |
+
gr.components.Dataframe(
|
| 46 |
+
value=df,
|
| 47 |
+
datatype=["html"]
|
| 48 |
+
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
gr.HTML("<h3>Legend</h3>")
|
| 52 |
+
gr.HTML("<p>🔶: Evaluated only on the first part of each AoC day</p>")
|
| 53 |
+
gr.HTML("<p>🟩: Complete Evaluation</p>")
|
| 54 |
+
|
| 55 |
+
# with gr.Row():
|
| 56 |
+
# with gr.Accordion("Task", open=True):
|
| 57 |
+
# with gr.Row():
|
| 58 |
+
# with gr.Column(scale=1):
|
| 59 |
+
# gr.Image("assets/front.png")
|
| 60 |
+
# with gr.Column(scale=4):
|
| 61 |
+
# gr.Markdown(TASK_DESCRIPTION)
|
| 62 |
+
|
| 63 |
+
with gr.Row():
|
| 64 |
+
with gr.Accordion("📙 Citation", open=False):
|
| 65 |
+
citation_button = gr.Textbox(
|
| 66 |
+
value=CITATION_BUTTON_TEXT,
|
| 67 |
+
label=CITATION_BUTTON_LABEL,
|
| 68 |
+
lines=20,
|
| 69 |
+
elem_id="citation-button",
|
| 70 |
+
show_copy_button=True,
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
demo.launch()
|
current_results.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"Model": "Mistral-7B-Instruct-v0.1",
|
| 4 |
+
"euler_original": 0.37,
|
| 5 |
+
"euler_story": 0.12,
|
| 6 |
+
"aoc_original": 3.0,
|
| 7 |
+
"aoc_leet": 3.0,
|
| 8 |
+
"instruction_only": true,
|
| 9 |
+
"link": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1"
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"Model": "Mixtral-8x7B-Instruct-v0.1",
|
| 13 |
+
"euler_original": 2.86,
|
| 14 |
+
"euler_story": 2.23,
|
| 15 |
+
"aoc_original": 8.67,
|
| 16 |
+
"aoc_leet": 8.42,
|
| 17 |
+
"instruction_only": false,
|
| 18 |
+
"link": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"Model": "chat_bison",
|
| 22 |
+
"euler_story": 0.62,
|
| 23 |
+
"euler_original": 2.44,
|
| 24 |
+
"aoc_leet": 13.78,
|
| 25 |
+
"aoc_original": 17.09,
|
| 26 |
+
"instruction_only": false
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"Model": "codechat_bison",
|
| 30 |
+
"euler_story": 2.61,
|
| 31 |
+
"euler_original": 4.59,
|
| 32 |
+
"aoc_original": 21.17,
|
| 33 |
+
"aoc_leet": 17.6,
|
| 34 |
+
"instruction_only": false
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"Model": "WizardCoder-Python-34B-V1.0",
|
| 38 |
+
"aoc_leet": 22.5,
|
| 39 |
+
"aoc_original": 24.0,
|
| 40 |
+
"euler_original": 2.61,
|
| 41 |
+
"euler_story": 2.48,
|
| 42 |
+
"instruction_only": true,
|
| 43 |
+
"link": "https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"Model": "gpt3.5",
|
| 47 |
+
"euler_original": 8.19,
|
| 48 |
+
"euler_story": 6.95,
|
| 49 |
+
"aoc_leet": 29.85,
|
| 50 |
+
"aoc_original": 50.0,
|
| 51 |
+
"instruction_only": false
|
| 52 |
+
}
|
| 53 |
+
]
|
text.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Static UI text for the PECC leaderboard app (imported by app.py).

# Page headline rendered at the top of the leaderboard.
TITLE_TEXT = "PECC - Problem Extraction and Coding Challenges Evaluation Benchmark"

# Short benchmark description rendered under the title.
INTRODUCTION_TEXT = """📄 PECC: An extensive benchmark centered on code generation from narrative-embedded problem descriptions. Unlike prior benchmarks that evaluate code generation using specific instructions, our dataset requires models to comprehend, extract requirements, and produce the essential code for problem-solving. This approach necessitates syntactically accurate programs and demands reading comprehension skills to derive the desired solution."""

# Markdown task description; currently referenced only from commented-out UI code in app.py.
TASK_DESCRIPTION = """## Task Description
The task for the model is to generate directly executable python code.

### Instruction
The model is first prompted with a system prompt, which is a short description of the problem. The model is then asked to generate the python code that solves the problem.

### Task
The model receives the task itself.
"""

# Label shown above the citation textbox.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite our paper"

# BibTeX snippet offered in the citation box; raw string keeps braces/backslashes literal.
CITATION_BUTTON_TEXT = r"""
@misc{pecc,
author = {Patrick Haller and Jonas Golde and Alan Akbik},
title = {PECC - Problem Extraction and Coding Challenges},
year = {2024},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {}
}
"""
|