Rename prompts for LS, SU, NER, and REL
- app.py +102 -25
- src/tasks.py +8 -8
app.py
CHANGED
@@ -47,6 +47,9 @@ def mean_of_max_per_field(df):
 
 
 def boxplot_per_task(dataframe=None, baselines=None):
+
+    print(dataframe.columns)
+
     tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
 
     if dataframe is None:
@@ -56,7 +59,6 @@ def boxplot_per_task(dataframe=None, baselines=None):
         for task in tasks
     })
 
-    # baseline for each task (if none is passed, use a random value between 50 and 70)
     if baselines is None:
         baselines = {task: np.random.randint(50, 70) for task in tasks}
 
@@ -73,27 +75,26 @@ def boxplot_per_task(dataframe=None, baselines=None):
         fig.add_trace(go.Box(
             y=y_data,
             name=task,
-
-
-            line=dict(color=
+            marker=dict(color=colors[i]),
+            # Change: draw the box outline in a color different from the fill
+            line=dict(color="black", width=2),
             fillcolor=colors[i],
             opacity=0.7,
-            hovertemplate=
-            width=0.6
+            hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
+            width=0.6,
+            whiskerwidth=0.2,
+            quartilemethod="linear"
         ))
 
-        # baseline
+        # baseline
         if task in baselines and baselines[task] is not None:
-            # baseline as a horizontal line
             fig.add_shape(
                 type="line",
-                x0=i-0.3, x1=i+0.3,
+                x0=i-0.3, x1=i+0.3,
                 y0=baselines[task], y1=baselines[task],
                 line=dict(color="black", width=2, dash="dash"),
                 xref="x", yref="y"
             )
-
-            # label with the baseline value
             fig.add_annotation(
                 x=i, y=baselines[task],
                 text=f"{baselines[task]}%",
@@ -103,19 +104,19 @@ def boxplot_per_task(dataframe=None, baselines=None):
             )
 
     fig.update_layout(
-        title="Distribution of Model Accuracy by Task
+        title="Distribution of Model Accuracy by Task",
         xaxis_title="Task",
         yaxis_title="Accuracy (%)",
         template="plotly_white",
        boxmode="group",
        dragmode=False,
        font=dict(family="Arial", size=13),
-        margin=dict(b=
-        annotations
+        margin=dict(b=140),
+        annotations=[
            dict(
                text=(
-                    "Boxplots show LLM accuracy in zero/few-shot settings. <br>"
-                    "
+                    "Boxplots show LLM accuracy in zero/few-shot settings. Black dashed lines<br>"
+                    "indicate best-performing supervised models evaluated on EVALITA."
                ),
                xref="paper", yref="paper",
                x=0.5, y=-0.33,
@@ -124,7 +125,6 @@ def boxplot_per_task(dataframe=None, baselines=None):
            )
        ]
    )
-    #fig.update_yaxes(fixedrange=True)
     fig.update_yaxes(range=[0, 100], fixedrange=True)
 
     return fig
@@ -137,6 +137,74 @@ BASELINES = {
 }
 
 
+def boxplot_prompts_per_task(dataframe, tasks=None):
+    if tasks is None:
+        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+
+    fig = go.Figure()
+
+    # Lists so that Average and Best each get a single legend entry
+    avg_x, avg_y = [], []
+    best_x, best_y, best_text = [], [], []
+
+    for task in tasks:
+        avg_col = f"{task} Prompt Average"
+        best_col = f"{task} Best Prompt"
+        best_id_col = f"{task} Best Prompt Id"
+
+        if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]):
+            avg_value = dataframe[avg_col].mean()
+            avg_x.append(task)
+            avg_y.append(avg_value)
+
+            best_value = dataframe[best_col].mean()
+            best_x.append(task)
+            best_y.append(best_value)
+            best_id = dataframe[best_id_col].mode()[0]  # most frequent best prompt id
+            best_text.append(f"P:{best_id}")
+
+    # Average Accuracy bars (blue)
+    fig.add_trace(go.Bar(
+        x=avg_x,
+        y=avg_y,
+        name="Average Accuracy",
+        marker_color="#1f77b4",
+        #hovertemplate="%{y:.2f}%<extra></extra>"
+        #hovertemplate="<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
+    ))
+
+    # Best Prompt bars (red)
+    fig.add_trace(go.Bar(
+        x=best_x,
+        y=best_y,
+        name="Best Prompt",
+        marker_color="#d62728",
+        #hovertemplate="%{y:.2f}%<extra></extra>"
+        #hovertemplate = "<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
+    ))
+
+    # Text above the Best Prompt bars with the prompt ID
+    for x, y, text in zip(best_x, best_y, best_text):
+        fig.add_annotation(
+            x=x,
+            y=y + 1,  # slightly above the bar
+            text=text,
+            showarrow=False,
+            font=dict(size=12, color="black")
+        )
+
+    fig.update_layout(
+        title="Comparison of Average Prompt Accuracy vs Best Prompt Accuracy per Task",
+        xaxis_title="Task",
+        yaxis_title="Accuracy (%)",
+        barmode='group',
+        template="plotly_white",
+        font=dict(family="Arial", size=13),
+        yaxis=dict(range=[0, 100], fixedrange=True)
+    )
+
+    return fig
+
 
 
 def line_chart(dataframe):
@@ -255,11 +323,11 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
 
     for _, row in sorted_dataframe.iterrows():
         if row['IS_FS']: # 5-Few-Shot
-            if row["#Params (B)"] >
-                new_model_column.append(f"{row['Model']}
+            if row["#Params (B)"] > 50 and not large_medal_fs_assigned:
+                new_model_column.append(f"{row['Model']} 1️⃣0️⃣0️⃣🅱️🏆")
                 large_medal_fs_assigned = True
-            elif 10 < row["#Params (B)"] <=
-                new_model_column.append(f"{row['Model']}
+            elif 10 < row["#Params (B)"] <= 50 and not medium_medal_fs_assigned:
+                new_model_column.append(f"{row['Model']} 5️⃣0️⃣🅱️🏆")
                 medium_medal_fs_assigned = True
             elif row["#Params (B)"] <= 10 and not small_medal_fs_assigned:
                 new_model_column.append(f"{row['Model']} 1️⃣0️⃣🅱️🏆")
@@ -267,11 +335,11 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
             else:
                 new_model_column.append(row["Model"])
         else: # 0-Shot
-            if row["#Params (B)"] >
-                new_model_column.append(f"{row['Model']}
+            if row["#Params (B)"] > 50 and not large_medal_0shot_assigned:
+                new_model_column.append(f"{row['Model']} 1️⃣0️⃣0️⃣🅱️🎖️")
                 large_medal_0shot_assigned = True
-            elif 10 < row["#Params (B)"] <=
-                new_model_column.append(f"{row['Model']}
+            elif 10 < row["#Params (B)"] <= 50 and not medium_medal_0shot_assigned:
+                new_model_column.append(f"{row['Model']} 5️⃣0️⃣🅱️🎖️")
                 medium_medal_0shot_assigned = True
             elif row["#Params (B)"] <= 10 and not small_medal_0shot_assigned:
                 new_model_column.append(f"{row['Model']} 1️⃣0️⃣🅱️🎖️")
@@ -279,6 +347,14 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
             else:
                 new_model_column.append(row["Model"])
 
+
+    # List of columns to update
+    cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
+    # Apply the transformation
+    for col in cols_to_update:
+        dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
+
+
     # Update the Model column
     sorted_dataframe["Model"] = new_model_column
 
@@ -503,6 +579,7 @@ with demo:
            #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
            gr.Plot(value=line_chart(LEADERBOARD_DF))
            gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
+            gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
 
        # About tab
        with gr.TabItem("📝 About"):
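For readers who want to sanity-check the new chart without running the Space, here is a minimal, self-contained sketch of the two data-handling steps this file adds. The column names (`<TASK> Prompt Average`, `<TASK> Best Prompt`, `<TASK> Best Prompt Id`) are taken from the diff above; the numeric values are invented toy data, not leaderboard results.

```python
import pandas as pd

# Toy leaderboard slice: one row per model (values invented for illustration only).
toy = pd.DataFrame({
    "LS Prompt Average": [41.2, 38.7, 40.0],   # F1 averaged over the two LS prompts
    "LS Best Prompt":    [45.0, 42.3, 44.1],   # F1 of the better LS prompt
    "LS Best Prompt Id": [1, 2, 1],            # legacy prompt ids before the rename
})

# 1) Id shift added in init_leaderboard: legacy ids 1/2 become 7/8,
#    matching the renamed prompts documented in src/tasks.py below.
toy["LS Best Prompt Id"] = toy["LS Best Prompt Id"].replace({1: 7, 2: 8})

# 2) Per-task aggregation performed by boxplot_prompts_per_task:
avg_bar = toy["LS Prompt Average"].mean()      # height of the blue "Average Accuracy" bar
best_bar = toy["LS Best Prompt"].mean()        # height of the red "Best Prompt" bar
best_id = toy["LS Best Prompt Id"].mode()[0]   # most frequent winner -> "P:7" annotation

print(f"LS: avg={avg_bar:.2f}, best={best_bar:.2f}, label=P:{best_id}")
```

In the app itself these aggregates feed the grouped bar chart registered via `gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))` in the hunk above.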
src/tasks.py
CHANGED
@@ -125,8 +125,8 @@ LS_DESCRIPTION = """### Lexical Substitution (LS) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
+| 8 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 
@@ -137,8 +137,8 @@ SU_DESCRIPTION = """### Summarization (SUM) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
+| 8 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 
@@ -149,8 +149,8 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
+| 8 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 
@@ -161,8 +161,8 @@ REL_DESCRIPTION = """### Relation Extraction (REL) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
+| 8 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 
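As a quick sanity check on the Combined Performance formula repeated in each task description above, here is a small worked example with hypothetical F1 scores (not taken from the leaderboard):

```python
# Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt
best_prompt = 45.0      # hypothetical F1 of the better of the two prompts
prompt_average = 41.0   # hypothetical F1 averaged over the two prompts

combined = (1 - (best_prompt - prompt_average) / 100) * best_prompt
print(round(combined, 2))  # 43.2: a 4-point gap between prompts trims 4% off the best score
```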