Spaces:

meta-agents-research-environments
/

leaderboard

Running on CPU Upgrade

App Files Community

mlcu committed on Sep 22

Commit

5200fdc

1 Parent(s): d97ec7b

Rename GAIA2 to Gaia2

Browse files

Files changed (3) hide show

README.md +2 -2
app.py +13 -12
content.py +4 -4

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Gaia 2 Agents Evaluation Leaderboard
 emoji: 🐠
 colorFrom: red
 colorTo: blue
@@ -13,4 +13,4 @@ hf_oauth_scopes:
 - read-repos
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Gaia2 Agents Evaluation Leaderboard
 emoji: 🐠
 colorFrom: red
 colorTo: blue
 - read-repos
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -200,7 +200,7 @@ def add_new_eval(
     if datetime.datetime.now() - datetime.datetime.strptime(
         creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
     ) < datetime.timedelta(days=60):
-        raise Exception("This account is not authorized to submit on GAIA2.")
     # Can't submit several times per day
     contact_infos = datasets.load_dataset(
@@ -405,7 +405,7 @@ custom_css = """
 """
 demo = gr.Blocks(
-    #css=custom_css,
     theme=gr.themes.Soft(
         font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
     ),
@@ -418,18 +418,18 @@ with demo:
     # Enhanced leaderboard with custom styling
     with gr.Column(elem_classes="leaderboard-container"):
-        #gr.HTML(
         #    """
-        #<div style="padding: 20px 20px 0 20px;">
         #    <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
-        #        🏆 GAIA2 Leaderboard Rankings
         #    </h2>
         #    <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
         #        Click on column headers to sort • Use filters to narrow results
         #    </p>
-        #</div>
-        #"""
-        #)
         leaderboard_table_val = Leaderboard(
             value=eval_dataframe_val,
@@ -447,12 +447,12 @@ with demo:
                     "A2A (%)",
                     "Submission date",
                 ],
-                cant_deselect=[
                     "Model",
                     "Provider",
                     "Total score (%)",
                     "Submission date",
-                ]
             ),
             search_columns=["Model", "Provider", "Submitter"],
             filter_columns=[
@@ -492,11 +492,12 @@ with demo:
             with gr.Column(scale=1):
                 submit_button = gr.Button("Submit", variant="primary", size="lg")
             with gr.Column(scale=1):
-                refresh_button = gr.Button("🔄 Refresh the display", variant="secondary", size="lg")
         submission_result = gr.Markdown()
     with gr.Column():
         gr.HTML(
             """

     if datetime.datetime.now() - datetime.datetime.strptime(
         creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
     ) < datetime.timedelta(days=60):
+        raise Exception("This account is not authorized to submit on Gaia2.")
     # Can't submit several times per day
     contact_infos = datasets.load_dataset(
 """
 demo = gr.Blocks(
+    # css=custom_css,
     theme=gr.themes.Soft(
         font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
     ),
     # Enhanced leaderboard with custom styling
     with gr.Column(elem_classes="leaderboard-container"):
+        # gr.HTML(
         #    """
+        # <div style="padding: 20px 20px 0 20px;">
         #    <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
+        #        🏆 Gaia2 Leaderboard Rankings
         #    </h2>
         #    <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
         #        Click on column headers to sort • Use filters to narrow results
         #    </p>
+        # </div>
+        # """
+        # )
         leaderboard_table_val = Leaderboard(
             value=eval_dataframe_val,
                     "A2A (%)",
                     "Submission date",
                 ],
+                cant_deselect=[
                     "Model",
                     "Provider",
                     "Total score (%)",
                     "Submission date",
+                ],
             ),
             search_columns=["Model", "Provider", "Submitter"],
             filter_columns=[
             with gr.Column(scale=1):
                 submit_button = gr.Button("Submit", variant="primary", size="lg")
             with gr.Column(scale=1):
+                refresh_button = gr.Button(
+                    "🔄 Refresh the display", variant="secondary", size="lg"
+                )
         submission_result = gr.Markdown()
     with gr.Column():
         gr.HTML(
             """

content.py CHANGED Viewed

@@ -7,7 +7,7 @@ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
 TITLE = """
 <div style="text-align: center; padding: 20px 0; background: linear-gradient(135deg, #1877f2 0%, #42a5f5 100%); border-radius: 15px; margin-bottom: 30px;">
     <h1 style="color: white; font-size: 2em; margin: 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">
-        GAIA2 Leaderboard 🏆
     </h1>
 </div>
 """
@@ -25,11 +25,11 @@ SCENARIO_LIST = [
 MAX_PARALLELISM = 10
 INTRODUCTION_TEXT = """
-[**GAIA2**](https://huggingface.co/datasets/meta-agents-research-environments/gaia2) is a benchmark designed to measure general agent capabilities. Beyond traditional search and execution tasks, GAIA2 runs asynchronously, requiring agents to handle ambiguities and noise, adapt to dynamic environments, collaborate with other agents, and operate under temporal constraints. As of publication, no system dominates across the task spectrum: stronger reasoning often comes at the cost of efficiency & the ability to complete sensitive tasks in due time.
-GAIA2 evaluates agents across the following dimensions: **Execution** (instruction following, multi-step tool-use), **Search** (information retrieval), **Ambiguity** (handling unclear or incomplete instructions), **Adaptability** (responding to dynamic environment changes), **Time** (managing temporal constraints and scheduling), **Noise** (operating effectively despite irrelevant information and random tool failures) and **Agent-to-Agent** (collaboration and coordination with other agents).
-⚠️ All scores on this page are self reported. Associated traces are made available to the open-source community in order to enable deeper study of the tradeoffs between model behavior vs performance on GAIA2.
 """
 SUBMISSION_TEXT = """

 TITLE = """
 <div style="text-align: center; padding: 20px 0; background: linear-gradient(135deg, #1877f2 0%, #42a5f5 100%); border-radius: 15px; margin-bottom: 30px;">
     <h1 style="color: white; font-size: 2em; margin: 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">
+        Gaia2 Leaderboard 🏆
     </h1>
 </div>
 """
 MAX_PARALLELISM = 10
 INTRODUCTION_TEXT = """
+[**Gaia2**](https://huggingface.co/datasets/meta-agents-research-environments/gaia2) is a benchmark designed to measure general agent capabilities. Beyond traditional search and execution tasks, Gaia2 runs asynchronously, requiring agents to handle ambiguities and noise, adapt to dynamic environments, collaborate with other agents, and operate under temporal constraints. As of publication, no system dominates across the task spectrum: stronger reasoning often comes at the cost of efficiency & the ability to complete sensitive tasks in due time.
+Gaia2 evaluates agents across the following dimensions: **Execution** (instruction following, multi-step tool-use), **Search** (information retrieval), **Ambiguity** (handling unclear or incomplete instructions), **Adaptability** (responding to dynamic environment changes), **Time** (managing temporal constraints and scheduling), **Noise** (operating effectively despite irrelevant information and random tool failures) and **Agent-to-Agent** (collaboration and coordination with other agents).
+⚠️ All scores on this page are self reported. Associated traces are made available to the open-source community in order to enable deeper study of the tradeoffs between model behavior vs performance on Gaia2.
 """
 SUBMISSION_TEXT = """