
Commit

feat: Update leaderboards
saattrupdan committed Jan 11, 2025
1 parent a8a4dbb commit 6f423fe
Showing 8 changed files with 1,103 additions and 1,103 deletions.
94 changes: 47 additions & 47 deletions dutch-nlg.csv

Large diffs are not rendered by default.

572 changes: 286 additions & 286 deletions dutch-nlg.md

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions dutch-nlu.csv
@@ -50,7 +50,7 @@ meta-llama/Meta-Llama-3-8B (few-shot),8030,128,8192,True,False,1477,2.67,62.26,1
meta-llama/Meta-Llama-3-8B-Instruct (few-shot),8030,128,8192,True,False,1483,2.68,68.72,14.67,32.91,45.36
mistralai/Mixtral-8x7B-v0.1 (few-shot),46703,32,32768,True,False,2363,2.68,64.81,12.99,39.38,49.08
mistralai/Ministral-8B-Instruct-2410 (few-shot),8020,131,32768,True,False,1302,2.69,63.3,11.82,32.2,59.45
Nexusflow/Starling-LM-7B-beta (few-shot),7242,32,4096,False,False,4136,2.7,62.86,15.11,39.11,36.48
Nexusflow/Starling-LM-7B-beta (few-shot),7242,32,4096,False,False,4136,2.7,63.01,15.13,39.2,36.46
mistralai/Mixtral-8x7B-Instruct-v0.1 (few-shot),46703,32,32768,True,False,5535,2.7,58.8,12.5,45.22,47.03
nvidia/mistral-nemo-minitron-8b-base (few-shot),8414,131,8192,True,False,2470,2.72,66.29,12.71,31.39,48.33
senseable/WestLake-7B-v2 (few-shot),7242,32,32768,False,False,5993,2.74,64.25,13.66,28.59,49.64
@@ -59,8 +59,8 @@ robinsmits/Qwen1.5-7B-Dutch-Chat (few-shot),7719,152,32768,False,False,4686,2.77
yhavinga/Boreas-7B-chat (few-shot),7242,32,32768,False,False,2913,2.77,60.22,11.97,30.94,52.19
robinsmits/Qwen1.5-7B-Dutch-Chat-Sft-Bf16 (few-shot),7719,152,32768,False,False,2413,2.79,56.83,14.79,23.58,55.9
CohereForAI/aya-expanse-8b (few-shot),8028,256,8192,False,False,2686,2.8,53.02,13.68,29.97,53.4
CohereForAI/aya-23-8B (few-shot),8028,256,8192,False,False,2707,2.81,60.81,7.9,31.12,63.0
skole-gpt-mixtral (few-shot),-1,32,32768,False,False,3583,2.81,62.16,8.92,32.76,56.87
CohereForAI/aya-23-8B (few-shot),8028,256,8192,False,False,2707,2.82,60.81,7.9,31.12,63.0
ibm-granite/granite-3.0-8b-instruct (few-shot),8171,49,4096,True,False,1118,2.83,53.62,13.37,23.47,61.2
sentence-transformers/quora-distilbert-multilingual,135,120,512,True,False,26458,2.83,67.89,23.25,21.36,4.5
"claude-3-5-haiku-20241022 (zero-shot, val)",-1,-1,200000,True,False,277,2.84,61.15,12.71,35.26,41.27
Expand All @@ -80,7 +80,7 @@ sentence-transformers/stsb-xlm-r-multilingual,278,250,512,True,False,15040,2.95,
mistralai/Mistral-7B-Instruct-v0.2 (few-shot),7242,32,32768,False,False,2370,2.97,55.56,12.37,21.5,50.77
alpindale/Mistral-7B-v0.2-hf (few-shot),7242,32,32768,True,False,1841,2.98,56.76,7.11,23.55,61.89
mgoin/Nemotron-4-340B-Instruct-hf-FP8 (few-shot),341029,256,4096,True,False,1904,2.98,47.6,10.62,61.64,24.02
ibm-granite/granite-8b-code-instruct-4k (few-shot),8055,49,4096,True,False,5617,3.0,60.72,12.38,10.96,51.2
ibm-granite/granite-8b-code-instruct-4k (few-shot),8055,49,4096,True,False,5617,2.99,60.72,12.38,10.96,51.2
mistralai/Mistral-7B-v0.3 (few-shot),7248,33,32768,True,False,1364,3.0,56.52,7.02,23.41,61.9
NorwAI/NorwAI-Mixtral-8x7B-instruct (few-shot),46998,68,32768,True,False,9015,3.01,62.81,11.28,28.57,38.75
RuterNorway/Llama-2-13b-chat-norwegian (few-shot),-1,32,4096,False,False,3254,3.04,57.66,8.41,16.93,56.29
Expand All @@ -89,8 +89,8 @@ meta-llama/Llama-2-13b-chat-hf (few-shot),13016,32,4096,True,False,2849,3.04,57.
meta-llama/Llama-3.1-8B-Instruct (few-shot),8030,128,131072,True,False,1473,3.04,61.68,8.97,36.57,33.88
Geotrend/distilbert-base-25lang-cased,108,85,512,True,False,26099,3.05,75.02,7.45,45.28,20.18
microsoft/Phi-3-mini-4k-instruct (few-shot),3821,32,4096,True,False,8681,3.05,50.31,12.58,14.72,56.19
NorwAI/NorwAI-Mixtral-8x7B (few-shot),46998,68,32768,True,False,2368,3.07,62.76,13.83,24.44,26.17
google/gemma-7b (few-shot),8538,256,8192,True,False,1378,3.07,47.75,7.68,28.28,61.49
NorwAI/NorwAI-Mixtral-8x7B (few-shot),46998,68,32768,True,False,2368,3.08,62.76,13.83,24.44,26.17
Rijgersberg/GEITje-7B-chat-v2 (few-shot),7242,32,32768,False,False,5908,3.08,42.12,11.06,19.71,59.19
google/gemma-7b-it (few-shot),8538,256,8317,False,False,1792,3.09,53.93,12.83,6.58,53.45
occiglot/occiglot-7b-eu5-instruct (few-shot),7242,32,32768,False,False,2088,3.09,53.78,7.78,16.23,63.09
Expand All @@ -105,10 +105,10 @@ EuropeanParliament/EUBERT,93,66,512,True,False,20070,3.18,49.54,14.86,27.9,20.65
occiglot/occiglot-7b-eu5 (few-shot),7242,32,32768,True,False,2219,3.18,51.31,7.41,13.04,59.28
TrustLLMeu/baseline-7-8b_1t-tokens_llama (few-shot),7800,100,4096,True,False,6197,3.19,48.24,11.37,10.73,54.83
Twitter/twhin-bert-large,560,250,512,True,False,9707,3.19,77.35,6.55,18.25,28.37
BramVanroy/fietje-2b (few-shot),2780,51,2048,True,False,4804,3.2,33.92,13.39,6.75,58.57
Qwen/Qwen1.5-4B-Chat (few-shot),3950,152,32768,False,False,4347,3.2,42.52,14.68,4.07,55.18
Twitter/twhin-bert-base,278,250,512,True,False,11514,3.2,74.03,9.53,39.12,7.71
meta-llama/Llama-3.2-3B-Instruct (few-shot),3213,128,131200,False,False,10424,3.2,43.66,12.87,17.94,47.77
BramVanroy/fietje-2b (few-shot),2780,51,2048,True,False,4804,3.21,33.92,13.39,6.75,58.57
meta-llama/Llama-2-7b-chat-hf (few-shot),6738,32,4096,False,False,2643,3.21,50.23,10.07,14.73,53.42
mistralai/Mistral-7B-Instruct-v0.1 (few-shot),7242,32,32768,False,False,634,3.21,52.72,7.91,18.14,52.75
BramVanroy/fietje-2b-instruct (few-shot),2775,50,2048,False,False,4710,3.22,36.5,13.7,4.81,60.63
122 changes: 61 additions & 61 deletions dutch-nlu.md
@@ -3,7 +3,7 @@ layout: leaderboard
title: Dutch NLU 🇳🇱
---

<center>Last updated: 11/01/2025 11:03:45 CET</center>
<center>Last updated: 11/01/2025 17:22:53 CET</center>

<div class="blocked centered">
<input type="checkbox" id="merged-models-checkbox">
@@ -909,14 +909,14 @@ title: Dutch NLU 🇳🇱
<td class="commercially_licensed">False</td> <!-- Whether the model is commercially licensed -->
<td class="speed">4,136 ± 1,282 / 668 ± 326</td> <!-- Model inference speed -->
<td class="rank">2.70</td> <!-- ScandEval rank -->
<td class="nl ner">62.86 ± 2.07 / 33.76 ± 1.85</td> <!-- CoNLL-nl -->
<td class="nl sent">15.11 ± 1.83 / 40.18 ± 1.61</td> <!-- Dutch Social -->
<td class="nl la">39.11 ± 1.22 / 68.00 ± 1.24</td> <!-- ScaLA-nl -->
<td class="nl rc">36.48 ± 2.80 / 59.22 ± 2.19</td> <!-- SQuAD-nl -->
<td>14.0.4</td> <!-- CoNLL-nl version -->
<td>14.0.4</td> <!-- Dutch Social version -->
<td>14.0.4</td> <!-- ScaLA-nl version -->
<td>14.0.4</td> <!-- SQuAD-nl version -->
<td class="nl ner">63.01 ± 2.00 / 33.76 ± 1.86</td> <!-- CoNLL-nl -->
<td class="nl sent">15.13 ± 1.83 / 40.18 ± 1.61</td> <!-- Dutch Social -->
<td class="nl la">39.20 ± 1.25 / 68.05 ± 1.27</td> <!-- ScaLA-nl -->
<td class="nl rc">36.46 ± 2.79 / 59.21 ± 2.19</td> <!-- SQuAD-nl -->
<td>14.1.2</td> <!-- CoNLL-nl version -->
<td>14.1.2</td> <!-- Dutch Social version -->
<td>14.1.2</td> <!-- ScaLA-nl version -->
<td>14.1.2</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>mistralai/Mixtral-8x7B-Instruct-v0.1 (few-shot)</td> <!-- Model ID -->
@@ -1054,6 +1054,23 @@ title: Dutch NLU 🇳🇱
<td>14.1.2</td> <!-- ScaLA-nl version -->
<td>14.1.2</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>CohereForAI/aya-23-8B (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">8028</td> <!-- Number of trainable parameters -->
<td class="vocabulary_size">256</td> <!-- Size of the model's vocabulary -->
<td class="max_sequence_length">8192</td> <!-- Maximum sequence length of the model -->
<td class="commercially_licensed">False</td> <!-- Whether the model is commercially licensed -->
<td class="speed">2,707 ± 688 / 497 ± 166</td> <!-- Model inference speed -->
<td class="rank">2.81</td> <!-- ScandEval rank -->
<td class="nl ner">60.81 ± 1.94 / 46.59 ± 3.32</td> <!-- CoNLL-nl -->
<td class="nl sent">7.90 ± 1.63 / 24.82 ± 0.95</td> <!-- Dutch Social -->
<td class="nl la">31.12 ± 2.35 / 64.29 ± 1.88</td> <!-- ScaLA-nl -->
<td class="nl rc">63.00 ± 1.23 / 74.60 ± 0.67</td> <!-- SQuAD-nl -->
<td>13.0.0</td> <!-- CoNLL-nl version -->
<td>13.0.0</td> <!-- Dutch Social version -->
<td>13.0.0</td> <!-- ScaLA-nl version -->
<td>13.0.0</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>skole-gpt-mixtral (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">unknown</td> <!-- Number of trainable parameters -->
Expand All @@ -1071,23 +1088,6 @@ title: Dutch NLU 🇳🇱
<td>13.0.0</td> <!-- ScaLA-nl version -->
<td>13.0.0</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>CohereForAI/aya-23-8B (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">8028</td> <!-- Number of trainable parameters -->
<td class="vocabulary_size">256</td> <!-- Size of the model's vocabulary -->
<td class="max_sequence_length">8192</td> <!-- Maximum sequence length of the model -->
<td class="commercially_licensed">False</td> <!-- Whether the model is commercially licensed -->
<td class="speed">2,707 ± 688 / 497 ± 166</td> <!-- Model inference speed -->
<td class="rank">2.82</td> <!-- ScandEval rank -->
<td class="nl ner">60.81 ± 1.94 / 46.59 ± 3.32</td> <!-- CoNLL-nl -->
<td class="nl sent">7.90 ± 1.63 / 24.82 ± 0.95</td> <!-- Dutch Social -->
<td class="nl la">31.12 ± 2.35 / 64.29 ± 1.88</td> <!-- ScaLA-nl -->
<td class="nl rc">63.00 ± 1.23 / 74.60 ± 0.67</td> <!-- SQuAD-nl -->
<td>13.0.0</td> <!-- CoNLL-nl version -->
<td>13.0.0</td> <!-- Dutch Social version -->
<td>13.0.0</td> <!-- ScaLA-nl version -->
<td>13.0.0</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>ibm-granite/granite-3.0-8b-instruct (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">8171</td> <!-- Number of trainable parameters -->
@@ -1418,7 +1418,7 @@ title: Dutch NLU 🇳🇱
<td class="max_sequence_length">4096</td> <!-- Maximum sequence length of the model -->
<td class="commercially_licensed">True</td> <!-- Whether the model is commercially licensed -->
<td class="speed">5,617 ± 995 / 1,623 ± 540</td> <!-- Model inference speed -->
<td class="rank">3.00</td> <!-- ScandEval rank -->
<td class="rank">2.99</td> <!-- ScandEval rank -->
<td class="nl ner">60.72 ± 2.14 / 45.52 ± 2.46</td> <!-- CoNLL-nl -->
<td class="nl sent">12.38 ± 1.62 / 29.91 ± 1.91</td> <!-- Dutch Social -->
<td class="nl la">10.96 ± 1.47 / 47.97 ± 3.45</td> <!-- ScaLA-nl -->
@@ -1564,6 +1564,23 @@ title: Dutch NLU 🇳🇱
<td>12.10.5</td> <!-- ScaLA-nl version -->
<td>12.10.5</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>NorwAI/NorwAI-Mixtral-8x7B (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">46998</td> <!-- Number of trainable parameters -->
<td class="vocabulary_size">68</td> <!-- Size of the model's vocabulary -->
<td class="max_sequence_length">32768</td> <!-- Maximum sequence length of the model -->
<td class="commercially_licensed">True</td> <!-- Whether the model is commercially licensed -->
<td class="speed">2,368 ± 793 / 317 ± 108</td> <!-- Model inference speed -->
<td class="rank">3.07</td> <!-- ScandEval rank -->
<td class="nl ner">62.76 ± 3.54 / 40.29 ± 1.82</td> <!-- CoNLL-nl -->
<td class="nl sent">13.83 ± 1.32 / 37.70 ± 1.94</td> <!-- Dutch Social -->
<td class="nl la">24.44 ± 2.86 / 58.02 ± 2.28</td> <!-- ScaLA-nl -->
<td class="nl rc">26.17 ± 2.88 / 37.61 ± 2.20</td> <!-- SQuAD-nl -->
<td>14.0.4</td> <!-- CoNLL-nl version -->
<td>14.0.4</td> <!-- Dutch Social version -->
<td>14.0.4</td> <!-- ScaLA-nl version -->
<td>14.0.4</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>google/gemma-7b (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">8538</td> <!-- Number of trainable parameters -->
Expand All @@ -1581,23 +1598,6 @@ title: Dutch NLU 🇳🇱
<td>12.9.1</td> <!-- ScaLA-nl version -->
<td>12.9.1</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>NorwAI/NorwAI-Mixtral-8x7B (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">46998</td> <!-- Number of trainable parameters -->
<td class="vocabulary_size">68</td> <!-- Size of the model's vocabulary -->
<td class="max_sequence_length">32768</td> <!-- Maximum sequence length of the model -->
<td class="commercially_licensed">True</td> <!-- Whether the model is commercially licensed -->
<td class="speed">2,368 ± 793 / 317 ± 108</td> <!-- Model inference speed -->
<td class="rank">3.08</td> <!-- ScandEval rank -->
<td class="nl ner">62.76 ± 3.54 / 40.29 ± 1.82</td> <!-- CoNLL-nl -->
<td class="nl sent">13.83 ± 1.32 / 37.70 ± 1.94</td> <!-- Dutch Social -->
<td class="nl la">24.44 ± 2.86 / 58.02 ± 2.28</td> <!-- ScaLA-nl -->
<td class="nl rc">26.17 ± 2.88 / 37.61 ± 2.20</td> <!-- SQuAD-nl -->
<td>14.0.4</td> <!-- CoNLL-nl version -->
<td>14.0.4</td> <!-- Dutch Social version -->
<td>14.0.4</td> <!-- ScaLA-nl version -->
<td>14.0.4</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>Rijgersberg/GEITje-7B-chat-v2 (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">7242</td> <!-- Number of trainable parameters -->
@@ -1836,6 +1836,23 @@ title: Dutch NLU 🇳🇱
<td>0.0.0</td> <!-- ScaLA-nl version -->
<td>0.0.0</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>BramVanroy/fietje-2b (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">2780</td> <!-- Number of trainable parameters -->
<td class="vocabulary_size">51</td> <!-- Size of the model's vocabulary -->
<td class="max_sequence_length">2048</td> <!-- Maximum sequence length of the model -->
<td class="commercially_licensed">True</td> <!-- Whether the model is commercially licensed -->
<td class="speed">4,804 ± 1,045 / 1,220 ± 392</td> <!-- Model inference speed -->
<td class="rank">3.20</td> <!-- ScandEval rank -->
<td class="nl ner">33.92 ± 3.43 / 28.63 ± 2.42</td> <!-- CoNLL-nl -->
<td class="nl sent">13.39 ± 1.64 / 41.03 ± 1.87</td> <!-- Dutch Social -->
<td class="nl la">6.75 ± 2.55 / 41.28 ± 2.37</td> <!-- ScaLA-nl -->
<td class="nl rc">58.57 ± 1.03 / 69.39 ± 0.79</td> <!-- SQuAD-nl -->
<td>12.6.1</td> <!-- CoNLL-nl version -->
<td>12.6.1</td> <!-- Dutch Social version -->
<td>12.6.1</td> <!-- ScaLA-nl version -->
<td>12.6.1</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>Qwen/Qwen1.5-4B-Chat (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">3950</td> <!-- Number of trainable parameters -->
@@ -1887,23 +1904,6 @@ title: Dutch NLU 🇳🇱
<td>13.0.0</td> <!-- ScaLA-nl version -->
<td>13.0.0</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>BramVanroy/fietje-2b (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">2780</td> <!-- Number of trainable parameters -->
<td class="vocabulary_size">51</td> <!-- Size of the model's vocabulary -->
<td class="max_sequence_length">2048</td> <!-- Maximum sequence length of the model -->
<td class="commercially_licensed">True</td> <!-- Whether the model is commercially licensed -->
<td class="speed">4,804 ± 1,045 / 1,220 ± 392</td> <!-- Model inference speed -->
<td class="rank">3.21</td> <!-- ScandEval rank -->
<td class="nl ner">33.92 ± 3.43 / 28.63 ± 2.42</td> <!-- CoNLL-nl -->
<td class="nl sent">13.39 ± 1.64 / 41.03 ± 1.87</td> <!-- Dutch Social -->
<td class="nl la">6.75 ± 2.55 / 41.28 ± 2.37</td> <!-- ScaLA-nl -->
<td class="nl rc">58.57 ± 1.03 / 69.39 ± 0.79</td> <!-- SQuAD-nl -->
<td>12.6.1</td> <!-- CoNLL-nl version -->
<td>12.6.1</td> <!-- Dutch Social version -->
<td>12.6.1</td> <!-- ScaLA-nl version -->
<td>12.6.1</td> <!-- SQuAD-nl version -->
</tr>
<tr class="not-merged-model">
<td>meta-llama/Llama-2-7b-chat-hf (few-shot)</td> <!-- Model ID -->
<td class="num_model_parameters">6738</td> <!-- Number of trainable parameters -->