mlcu committed
Commit 5200fdc · 1 Parent(s): d97ec7b

Rename GAIA2 to Gaia2

Files changed (3):
  1. README.md +2 -2
  2. app.py +13 -12
  3. content.py +4 -4
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Gaia 2 Agents Evaluation Leaderboard
+ title: Gaia2 Agents Evaluation Leaderboard
  emoji: 🐠
  colorFrom: red
  colorTo: blue
@@ -13,4 +13,4 @@ hf_oauth_scopes:
  - read-repos
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
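The `hf_oauth_scopes: - read-repos` entry in this front matter is what lets the Space identify the submitting Hugging Face account before accepting a leaderboard entry. Below is a minimal sketch of how a Gradio Space typically consumes that scope, assuming the Space also enables Gradio's built-in "Sign in with Hugging Face" flow (`hf_oauth: true`); the handler name and messages are illustrative and are not code from app.py.

```python
# Sketch only: reading the logged-in user in a Space declared with
# `hf_oauth: true` and `hf_oauth_scopes: [read-repos]`.
# `greet_submitter` and its messages are hypothetical, not from app.py.
import gradio as gr


def greet_submitter(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the OAuth profile automatically once the user has
    # signed in via gr.LoginButton; it is None for anonymous visitors.
    if profile is None:
        return "Please sign in with your Hugging Face account to submit."
    return f"Logged in as {profile.username} — ready to submit."


with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    demo.load(greet_submitter, inputs=None, outputs=status)

if __name__ == "__main__":
    demo.launch()
```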
app.py CHANGED
@@ -200,7 +200,7 @@ def add_new_eval(
      if datetime.datetime.now() - datetime.datetime.strptime(
          creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
      ) < datetime.timedelta(days=60):
-         raise Exception("This account is not authorized to submit on GAIA2.")
+         raise Exception("This account is not authorized to submit on Gaia2.")

      # Can't submit several times per day
      contact_infos = datasets.load_dataset(
@@ -405,7 +405,7 @@ custom_css = """
  """

  demo = gr.Blocks(
-     #css=custom_css,
+     # css=custom_css,
      theme=gr.themes.Soft(
          font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
      ),
@@ -418,18 +418,18 @@ with demo:

      # Enhanced leaderboard with custom styling
      with gr.Column(elem_classes="leaderboard-container"):
-         #gr.HTML(
+         # gr.HTML(
          #    """
-         #<div style="padding: 20px 20px 0 20px;">
+         #    <div style="padding: 20px 20px 0 20px;">
          #    <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
-         #        🏆 GAIA2 Leaderboard Rankings
+         #        🏆 Gaia2 Leaderboard Rankings
          #    </h2>
          #    <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
          #        Click on column headers to sort • Use filters to narrow results
          #    </p>
-         #</div>
-         #"""
-         #)
+         #    </div>
+         #    """
+         # )

          leaderboard_table_val = Leaderboard(
              value=eval_dataframe_val,
@@ -447,12 +447,12 @@
                  "A2A (%)",
                  "Submission date",
              ],
-             cant_deselect=[
+             cant_deselect=[
                  "Model",
                  "Provider",
                  "Total score (%)",
                  "Submission date",
-             ]
+             ],
          ),
          search_columns=["Model", "Provider", "Submitter"],
          filter_columns=[
@@ -492,11 +492,12 @@
      with gr.Column(scale=1):
          submit_button = gr.Button("Submit", variant="primary", size="lg")
      with gr.Column(scale=1):
-         refresh_button = gr.Button("🔄 Refresh the display", variant="secondary", size="lg")
+         refresh_button = gr.Button(
+             "🔄 Refresh the display", variant="secondary", size="lg"
+         )

      submission_result = gr.Markdown()

-
      with gr.Column():
          gr.HTML(
              """
content.py CHANGED
@@ -7,7 +7,7 @@ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
  TITLE = """
  <div style="text-align: center; padding: 20px 0; background: linear-gradient(135deg, #1877f2 0%, #42a5f5 100%); border-radius: 15px; margin-bottom: 30px;">
      <h1 style="color: white; font-size: 2em; margin: 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">
-         GAIA2 Leaderboard 🏆
+         Gaia2 Leaderboard 🏆
      </h1>
  </div>
  """
@@ -25,11 +25,11 @@ SCENARIO_LIST = [
  MAX_PARALLELISM = 10

  INTRODUCTION_TEXT = """
- [**GAIA2**](https://huggingface.co/datasets/meta-agents-research-environments/gaia2) is a benchmark designed to measure general agent capabilities. Beyond traditional search and execution tasks, GAIA2 runs asynchronously, requiring agents to handle ambiguities and noise, adapt to dynamic environments, collaborate with other agents, and operate under temporal constraints. As of publication, no system dominates across the task spectrum: stronger reasoning often comes at the cost of efficiency and the ability to complete sensitive tasks in due time.
+ [**Gaia2**](https://huggingface.co/datasets/meta-agents-research-environments/gaia2) is a benchmark designed to measure general agent capabilities. Beyond traditional search and execution tasks, Gaia2 runs asynchronously, requiring agents to handle ambiguities and noise, adapt to dynamic environments, collaborate with other agents, and operate under temporal constraints. As of publication, no system dominates across the task spectrum: stronger reasoning often comes at the cost of efficiency and the ability to complete sensitive tasks in due time.

- GAIA2 evaluates agents across the following dimensions: **Execution** (instruction following, multi-step tool use), **Search** (information retrieval), **Ambiguity** (handling unclear or incomplete instructions), **Adaptability** (responding to dynamic environment changes), **Time** (managing temporal constraints and scheduling), **Noise** (operating effectively despite irrelevant information and random tool failures) and **Agent-to-Agent** (collaboration and coordination with other agents).
+ Gaia2 evaluates agents across the following dimensions: **Execution** (instruction following, multi-step tool use), **Search** (information retrieval), **Ambiguity** (handling unclear or incomplete instructions), **Adaptability** (responding to dynamic environment changes), **Time** (managing temporal constraints and scheduling), **Noise** (operating effectively despite irrelevant information and random tool failures) and **Agent-to-Agent** (collaboration and coordination with other agents).

- ⚠️ All scores on this page are self-reported. Associated traces are made available to the open-source community in order to enable deeper study of the trade-offs between model behavior and performance on GAIA2.
+ ⚠️ All scores on this page are self-reported. Associated traces are made available to the open-source community in order to enable deeper study of the trade-offs between model behavior and performance on Gaia2.
  """

  SUBMISSION_TEXT = """
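INTRODUCTION_TEXT links the Gaia2 dataset on the Hugging Face Hub. A minimal sketch of pulling it with the `datasets` library follows; the config and split names are not stated anywhere in this diff, so they are discovered at runtime rather than assumed.

```python
# Sketch: inspect and load the Gaia2 dataset referenced in INTRODUCTION_TEXT.
# Config/split names are not given in this page, so list them first.
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

repo_id = "meta-agents-research-environments/gaia2"

configs = get_dataset_config_names(repo_id)
print("configs:", configs)

# Load the first advertised config/split and peek at its schema.
splits = get_dataset_split_names(repo_id, configs[0])
print("splits for", configs[0], ":", splits)

ds = load_dataset(repo_id, configs[0], split=splits[0])
print(ds)           # number of rows and column names
print(ds.features)  # field types for one scenario record
```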