Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Rename GAIA2 to Gaia2
Browse files- README.md +2 -2
- app.py +13 -12
- content.py +4 -4
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: π
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: blue
|
|
@@ -13,4 +13,4 @@ hf_oauth_scopes:
|
|
| 13 |
- read-repos
|
| 14 |
---
|
| 15 |
|
| 16 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Gaia2 Agents Evaluation Leaderboard
|
| 3 |
emoji: π
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: blue
|
|
|
|
| 13 |
- read-repos
|
| 14 |
---
|
| 15 |
|
| 16 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -200,7 +200,7 @@ def add_new_eval(
|
|
| 200 |
if datetime.datetime.now() - datetime.datetime.strptime(
|
| 201 |
creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
|
| 202 |
) < datetime.timedelta(days=60):
|
| 203 |
-
raise Exception("This account is not authorized to submit on
|
| 204 |
|
| 205 |
# Can't submit several times per day
|
| 206 |
contact_infos = datasets.load_dataset(
|
|
@@ -405,7 +405,7 @@ custom_css = """
|
|
| 405 |
"""
|
| 406 |
|
| 407 |
demo = gr.Blocks(
|
| 408 |
-
#css=custom_css,
|
| 409 |
theme=gr.themes.Soft(
|
| 410 |
font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
|
| 411 |
),
|
|
@@ -418,18 +418,18 @@ with demo:
|
|
| 418 |
|
| 419 |
# Enhanced leaderboard with custom styling
|
| 420 |
with gr.Column(elem_classes="leaderboard-container"):
|
| 421 |
-
#gr.HTML(
|
| 422 |
# """
|
| 423 |
-
|
| 424 |
# <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
|
| 425 |
-
# π
|
| 426 |
# </h2>
|
| 427 |
# <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
|
| 428 |
# Click on column headers to sort • Use filters to narrow results
|
| 429 |
# </p>
|
| 430 |
-
|
| 431 |
-
#"""
|
| 432 |
-
#)
|
| 433 |
|
| 434 |
leaderboard_table_val = Leaderboard(
|
| 435 |
value=eval_dataframe_val,
|
|
@@ -447,12 +447,12 @@ with demo:
|
|
| 447 |
"A2A (%)",
|
| 448 |
"Submission date",
|
| 449 |
],
|
| 450 |
-
cant_deselect=[
|
| 451 |
"Model",
|
| 452 |
"Provider",
|
| 453 |
"Total score (%)",
|
| 454 |
"Submission date",
|
| 455 |
-
]
|
| 456 |
),
|
| 457 |
search_columns=["Model", "Provider", "Submitter"],
|
| 458 |
filter_columns=[
|
|
@@ -492,11 +492,12 @@ with demo:
|
|
| 492 |
with gr.Column(scale=1):
|
| 493 |
submit_button = gr.Button("Submit", variant="primary", size="lg")
|
| 494 |
with gr.Column(scale=1):
|
| 495 |
-
refresh_button = gr.Button(
|
|
|
|
|
|
|
| 496 |
|
| 497 |
submission_result = gr.Markdown()
|
| 498 |
|
| 499 |
-
|
| 500 |
with gr.Column():
|
| 501 |
gr.HTML(
|
| 502 |
"""
|
|
|
|
| 200 |
if datetime.datetime.now() - datetime.datetime.strptime(
|
| 201 |
creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
|
| 202 |
) < datetime.timedelta(days=60):
|
| 203 |
+
raise Exception("This account is not authorized to submit on Gaia2.")
|
| 204 |
|
| 205 |
# Can't submit several times per day
|
| 206 |
contact_infos = datasets.load_dataset(
|
|
|
|
| 405 |
"""
|
| 406 |
|
| 407 |
demo = gr.Blocks(
|
| 408 |
+
# css=custom_css,
|
| 409 |
theme=gr.themes.Soft(
|
| 410 |
font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
|
| 411 |
),
|
|
|
|
| 418 |
|
| 419 |
# Enhanced leaderboard with custom styling
|
| 420 |
with gr.Column(elem_classes="leaderboard-container"):
|
| 421 |
+
# gr.HTML(
|
| 422 |
# """
|
| 423 |
+
# <div style="padding: 20px 20px 0 20px;">
|
| 424 |
# <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
|
| 425 |
+
# π Gaia2 Leaderboard Rankings
|
| 426 |
# </h2>
|
| 427 |
# <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
|
| 428 |
# Click on column headers to sort • Use filters to narrow results
|
| 429 |
# </p>
|
| 430 |
+
# </div>
|
| 431 |
+
# """
|
| 432 |
+
# )
|
| 433 |
|
| 434 |
leaderboard_table_val = Leaderboard(
|
| 435 |
value=eval_dataframe_val,
|
|
|
|
| 447 |
"A2A (%)",
|
| 448 |
"Submission date",
|
| 449 |
],
|
| 450 |
+
cant_deselect=[
|
| 451 |
"Model",
|
| 452 |
"Provider",
|
| 453 |
"Total score (%)",
|
| 454 |
"Submission date",
|
| 455 |
+
],
|
| 456 |
),
|
| 457 |
search_columns=["Model", "Provider", "Submitter"],
|
| 458 |
filter_columns=[
|
|
|
|
| 492 |
with gr.Column(scale=1):
|
| 493 |
submit_button = gr.Button("Submit", variant="primary", size="lg")
|
| 494 |
with gr.Column(scale=1):
|
| 495 |
+
refresh_button = gr.Button(
|
| 496 |
+
"π Refresh the display", variant="secondary", size="lg"
|
| 497 |
+
)
|
| 498 |
|
| 499 |
submission_result = gr.Markdown()
|
| 500 |
|
|
|
|
| 501 |
with gr.Column():
|
| 502 |
gr.HTML(
|
| 503 |
"""
|
content.py
CHANGED
|
@@ -7,7 +7,7 @@ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
|
|
| 7 |
TITLE = """
|
| 8 |
<div style="text-align: center; padding: 20px 0; background: linear-gradient(135deg, #1877f2 0%, #42a5f5 100%); border-radius: 15px; margin-bottom: 30px;">
|
| 9 |
<h1 style="color: white; font-size: 2em; margin: 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">
|
| 10 |
-
|
| 11 |
</h1>
|
| 12 |
</div>
|
| 13 |
"""
|
|
@@ -25,11 +25,11 @@ SCENARIO_LIST = [
|
|
| 25 |
MAX_PARALLELISM = 10
|
| 26 |
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
-
[**
|
| 29 |
|
| 30 |
-
|
| 31 |
|
| 32 |
-
⚠️ All scores on this page are self reported. Associated traces are made available to the open-source community in order to enable deeper study of the tradeoffs between model behavior vs performance on
|
| 33 |
"""
|
| 34 |
|
| 35 |
SUBMISSION_TEXT = """
|
|
|
|
| 7 |
TITLE = """
|
| 8 |
<div style="text-align: center; padding: 20px 0; background: linear-gradient(135deg, #1877f2 0%, #42a5f5 100%); border-radius: 15px; margin-bottom: 30px;">
|
| 9 |
<h1 style="color: white; font-size: 2em; margin: 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">
|
| 10 |
+
Gaia2 Leaderboard π
|
| 11 |
</h1>
|
| 12 |
</div>
|
| 13 |
"""
|
|
|
|
| 25 |
MAX_PARALLELISM = 10
|
| 26 |
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
+
[**Gaia2**](https://huggingface.co/datasets/meta-agents-research-environments/gaia2) is a benchmark designed to measure general agent capabilities. Beyond traditional search and execution tasks, Gaia2 runs asynchronously, requiring agents to handle ambiguities and noise, adapt to dynamic environments, collaborate with other agents, and operate under temporal constraints. As of publication, no system dominates across the task spectrum: stronger reasoning often comes at the cost of efficiency & the ability to complete sensitive tasks in due time.
|
| 29 |
|
| 30 |
+
Gaia2 evaluates agents across the following dimensions: **Execution** (instruction following, multi-step tool-use), **Search** (information retrieval), **Ambiguity** (handling unclear or incomplete instructions), **Adaptability** (responding to dynamic environment changes), **Time** (managing temporal constraints and scheduling), **Noise** (operating effectively despite irrelevant information and random tool failures) and **Agent-to-Agent** (collaboration and coordination with other agents).
|
| 31 |
|
| 32 |
+
⚠️ All scores on this page are self reported. Associated traces are made available to the open-source community in order to enable deeper study of the tradeoffs between model behavior vs performance on Gaia2.
|
| 33 |
"""
|
| 34 |
|
| 35 |
SUBMISSION_TEXT = """
|