From c00e987b6a3bc1df2e0267d5d616052540e8c5d6 Mon Sep 17 00:00:00 2001 From: saattrupdan Date: Thu, 5 Dec 2024 15:47:34 +0100 Subject: [PATCH] feat: Update benchmark results --- scandeval_benchmark_results.jsonl | 54 +++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/scandeval_benchmark_results.jsonl b/scandeval_benchmark_results.jsonl index 5c6116b2..df8a3bb5 100644 --- a/scandeval_benchmark_results.jsonl +++ b/scandeval_benchmark_results.jsonl @@ -12016,6 +12016,60 @@ {"dataset": "suc3", "task": "named-entity-recognition", "dataset_languages": ["sv"], "model": "setu4993/LaBSE", "results": {"raw": {"test": [{"test_loss": 0.05363640561699867, "test_micro_f1": 0.6967545638945234, "test_micro_f1_no_misc": 0.7808219178082192, "test_runtime": 6.1658, "test_samples_per_second": 332.157, "test_steps_per_second": 10.38}, {"test_loss": 0.05508258193731308, "test_micro_f1": 0.7307898979043526, "test_micro_f1_no_misc": 0.7917189460476788, "test_runtime": 5.4622, "test_samples_per_second": 374.943, "test_steps_per_second": 11.717}, {"test_loss": 0.05452176183462143, "test_micro_f1": 0.7195301027900147, "test_micro_f1_no_misc": 0.7572192513368985, "test_runtime": 5.635, "test_samples_per_second": 363.444, "test_steps_per_second": 11.358}, {"test_loss": 0.052072539925575256, "test_micro_f1": 0.6940187839841819, "test_micro_f1_no_misc": 0.7383783783783784, "test_runtime": 5.8358, "test_samples_per_second": 350.935, "test_steps_per_second": 10.967}, {"test_loss": 0.056713271886110306, "test_micro_f1": 0.7537205952952473, "test_micro_f1_no_misc": 0.8060344827586208, "test_runtime": 5.888, "test_samples_per_second": 347.825, "test_steps_per_second": 10.87}, {"test_loss": 0.05259690433740616, "test_micro_f1": 0.7307146237576905, "test_micro_f1_no_misc": 0.7865290602933188, "test_runtime": 5.2057, "test_samples_per_second": 393.411, "test_steps_per_second": 12.294}, {"test_loss": 0.060233913362026215, "test_micro_f1": 0.6587419514611194, "test_micro_f1_no_misc": 0.7283105022831051, "test_runtime": 5.388, "test_samples_per_second": 380.102, "test_steps_per_second": 11.878}, {"test_loss": 0.04685961455106735, "test_micro_f1": 0.7400109469074986, "test_micro_f1_no_misc": 0.7893101305158484, "test_runtime": 5.9504, "test_samples_per_second": 344.178, "test_steps_per_second": 10.756}, {"test_loss": 0.05065387859940529, "test_micro_f1": 0.7451946771808773, "test_micro_f1_no_misc": 0.8066298342541437, "test_runtime": 5.8209, "test_samples_per_second": 351.838, "test_steps_per_second": 10.995}, {"test_loss": 0.050872787833213806, "test_micro_f1": 0.7383406971035836, "test_micro_f1_no_misc": 0.7931818181818181, "test_runtime": 5.9217, "test_samples_per_second": 345.845, "test_steps_per_second": 10.808}]}, "total": {"test_micro_f1": 72.0781684027909, "test_micro_f1_se": 1.8131533127377135, "test_micro_f1_no_misc": 77.7813432185803, "test_micro_f1_no_misc_se": 1.6915321575549511}}, "num_model_parameters": 470343177, "max_sequence_length": 512, "vocabulary_size": 501153, "generative": false, "few_shot": true, "validation_split": false} {"dataset": "swerec", "task": "sentiment-classification", "dataset_languages": ["sv"], "model": "setu4993/LaBSE", "results": {"raw": {"test": [{"test_loss": 0.39160189032554626, "test_mcc": 0.7567870831601035, "test_macro_f1": 0.7289763219565746, "test_runtime": 11.5486, "test_samples_per_second": 177.337, "test_steps_per_second": 22.167}, {"test_loss": 0.4390621781349182, "test_mcc": 0.723739133752698, "test_macro_f1": 0.6969478456198573, "test_runtime": 
10.7608, "test_samples_per_second": 190.32, "test_steps_per_second": 23.79}, {"test_loss": 0.4432726502418518, "test_mcc": 0.6964579219036385, "test_macro_f1": 0.6671826181759957, "test_runtime": 11.4002, "test_samples_per_second": 179.646, "test_steps_per_second": 22.456}, {"test_loss": 0.3569955825805664, "test_mcc": 0.7751174997239167, "test_macro_f1": 0.7789615238707869, "test_runtime": 10.6461, "test_samples_per_second": 192.371, "test_steps_per_second": 24.046}, {"test_loss": 0.4175899922847748, "test_mcc": 0.7212307127188395, "test_macro_f1": 0.695020012932253, "test_runtime": 10.6259, "test_samples_per_second": 192.736, "test_steps_per_second": 24.092}, {"test_loss": 0.42545294761657715, "test_mcc": 0.718500794543388, "test_macro_f1": 0.6465026491283783, "test_runtime": 11.4216, "test_samples_per_second": 179.309, "test_steps_per_second": 22.414}, {"test_loss": 0.3952180743217468, "test_mcc": 0.7470259055064282, "test_macro_f1": 0.7239874029964369, "test_runtime": 10.772, "test_samples_per_second": 190.123, "test_steps_per_second": 23.765}, {"test_loss": 0.40563833713531494, "test_mcc": 0.7431716847512082, "test_macro_f1": 0.7183025991854014, "test_runtime": 11.1195, "test_samples_per_second": 184.182, "test_steps_per_second": 23.023}, {"test_loss": 0.41585657000541687, "test_mcc": 0.736429291012297, "test_macro_f1": 0.6572166533245158, "test_runtime": 11.3178, "test_samples_per_second": 180.953, "test_steps_per_second": 22.619}, {"test_loss": 0.3966815769672394, "test_mcc": 0.739975281053016, "test_macro_f1": 0.7299728016968468, "test_runtime": 10.7781, "test_samples_per_second": 190.015, "test_steps_per_second": 23.752}]}, "total": {"test_mcc": 73.58435308125533, "test_mcc_se": 1.3687740934352473, "test_macro_f1": 70.43070428887049, "test_macro_f1_se": 2.487928208964444}}, "num_model_parameters": 470929155, "max_sequence_length": 512, "vocabulary_size": 501153, "generative": false, "few_shot": true, "validation_split": false} {"dataset": "wikiann-fo", "task": "named-entity-recognition", "dataset_languages": ["fo"], "model": "setu4993/LaBSE", "results": {"raw": {"test": [{"test_loss": 0.114360511302948, "test_micro_f1": 0.8536023054755043, "test_micro_f1_no_misc": 0.8536023054755043, "test_runtime": 6.2762, "test_samples_per_second": 326.311, "test_steps_per_second": 10.197}, {"test_loss": 0.11677064001560211, "test_micro_f1": 0.85417071748007, "test_micro_f1_no_misc": 0.85417071748007, "test_runtime": 6.3334, "test_samples_per_second": 323.364, "test_steps_per_second": 10.105}, {"test_loss": 0.12441583722829819, "test_micro_f1": 0.8701590088771828, "test_micro_f1_no_misc": 0.8701590088771828, "test_runtime": 6.2487, "test_samples_per_second": 327.75, "test_steps_per_second": 10.242}, {"test_loss": 0.12342134863138199, "test_micro_f1": 0.8523795677037899, "test_micro_f1_no_misc": 0.8523795677037899, "test_runtime": 6.1753, "test_samples_per_second": 331.643, "test_steps_per_second": 10.364}, {"test_loss": 0.11405598372220993, "test_micro_f1": 0.8630603366748629, "test_micro_f1_no_misc": 0.8630603366748629, "test_runtime": 6.1155, "test_samples_per_second": 334.889, "test_steps_per_second": 10.465}, {"test_loss": 0.11746443808078766, "test_micro_f1": 0.8527720939296503, "test_micro_f1_no_misc": 0.8527720939296503, "test_runtime": 6.2213, "test_samples_per_second": 329.193, "test_steps_per_second": 10.287}, {"test_loss": 0.11694245040416718, "test_micro_f1": 0.8637652811735943, "test_micro_f1_no_misc": 0.8637652811735943, "test_runtime": 6.2682, "test_samples_per_second": 326.728, 
"test_steps_per_second": 10.21}, {"test_loss": 0.10889463871717453, "test_micro_f1": 0.8681762660903881, "test_micro_f1_no_misc": 0.8681762660903881, "test_runtime": 6.279, "test_samples_per_second": 326.169, "test_steps_per_second": 10.193}, {"test_loss": 0.11825648695230484, "test_micro_f1": 0.8617418798505317, "test_micro_f1_no_misc": 0.8617418798505317, "test_runtime": 6.2197, "test_samples_per_second": 329.274, "test_steps_per_second": 10.29}, {"test_loss": 0.12485436350107193, "test_micro_f1": 0.8708378351831844, "test_micro_f1_no_misc": 0.8708378351831844, "test_runtime": 6.1851, "test_samples_per_second": 331.119, "test_steps_per_second": 10.347}]}, "total": {"test_micro_f1": 86.10665292438759, "test_micro_f1_se": 0.45637211288368984, "test_micro_f1_no_misc": 86.10665292438759, "test_micro_f1_no_misc_se": 0.45637211288368984}}, "num_model_parameters": 470343177, "max_sequence_length": 512, "vocabulary_size": 501153, "generative": false, "few_shot": true, "validation_split": false} +{"dataset": "angry-tweets", "task": "sentiment-classification", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.4848311582932243, "macro_f1": 0.6566593163040514}, {"mcc": 0.4710493564701632, "macro_f1": 0.6132470892695207}, {"mcc": 0.532960600282665, "macro_f1": 0.6912704374110179}, {"mcc": 0.4950512777185183, "macro_f1": 0.6489856313891399}, {"mcc": 0.5554322057292909, "macro_f1": 0.709002801597209}, {"mcc": 0.50897850677457, "macro_f1": 0.6543960657662273}, {"mcc": 0.5083409827574323, "macro_f1": 0.6669577383860424}, {"mcc": 0.5401651182991014, "macro_f1": 0.6759161828434398}, {"mcc": 0.532214370523262, "macro_f1": 0.6729219335771565}, {"mcc": 0.5223279828282013, "macro_f1": 0.6831793583901348}]}, "total": {"test_mcc": 51.51351559676429, "test_mcc_se": 1.634750622109416, "test_macro_f1": 66.72536554933939, "test_macro_f1_se": 1.6279237763749863}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "arc-is", "task": "knowledge", "dataset_languages": ["is"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.1962774017513744, "accuracy": 0.3994140625}, {"mcc": 0.22466889536328083, "accuracy": 0.4208984375}, {"mcc": 0.20758627988326703, "accuracy": 0.4033203125}, {"mcc": 0.18157323284009355, "accuracy": 0.38671875}, {"mcc": 0.21452357169098776, "accuracy": 0.4072265625}, {"mcc": 0.2099694217993396, "accuracy": 0.404296875}, {"mcc": 0.16534692945691656, "accuracy": 0.373046875}, {"mcc": 0.14983971768525764, "accuracy": 0.361328125}, {"mcc": 0.17611908013378738, "accuracy": 0.3837890625}, {"mcc": 0.2013679634256645, "accuracy": 0.40234375}]}, "total": {"test_mcc": 19.27272494029969, "test_mcc_se": 1.473981813038057, "test_accuracy": 39.423828125, "test_accuracy_se": 1.1036285125395775}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "cnn-dailymail", "task": "summarization", "dataset_languages": ["en"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"bertscore": 0.7062732756603509, "rouge_l": 0.2502204326450074}, {"bertscore": 0.7108152939326828, "rouge_l": 0.26626888332334875}, {"bertscore": 0.7132071297673974, "rouge_l": 0.26919351187868634}, {"bertscore": 0.704810529932729, "rouge_l": 0.23804942069986523}, {"bertscore": 0.7155684455210576, 
"rouge_l": 0.27392637002695397}, {"bertscore": 0.7092310815787641, "rouge_l": 0.24104671173510026}, {"bertscore": 0.7095974086550996, "rouge_l": 0.2660701758343997}, {"bertscore": 0.6961289528990164, "rouge_l": 0.25602328865955803}, {"bertscore": 0.7135379151150119, "rouge_l": 0.26556088674901934}, {"bertscore": 0.710403663906618, "rouge_l": 0.2617790456328498}]}, "total": {"test_bertscore": 70.89573696968728, "test_bertscore_se": 0.3443167803990276, "test_rouge_l": 25.88138727184789, "test_rouge_l_se": 0.7526499529153959}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "conll-en", "task": "named-entity-recognition", "dataset_languages": ["en"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.7005706276904594, "micro_f1": 0.6205482961824238}, {"micro_f1_no_misc": 0.657651522521646, "micro_f1": 0.6190643686246922}, {"micro_f1_no_misc": 0.6809752031673265, "micro_f1": 0.6155767686680346}, {"micro_f1_no_misc": 0.6597818743364541, "micro_f1": 0.5921096071310664}, {"micro_f1_no_misc": 0.6728502415458938, "micro_f1": 0.6182772931311934}, {"micro_f1_no_misc": 0.6645135244247072, "micro_f1": 0.5957521195699677}, {"micro_f1_no_misc": 0.6587269815852682, "micro_f1": 0.5736872763584921}, {"micro_f1_no_misc": 0.689166909126176, "micro_f1": 0.6106386526596631}, {"micro_f1_no_misc": 0.6739475774424146, "micro_f1": 0.6145608429052596}, {"micro_f1_no_misc": 0.6846169477748425, "micro_f1": 0.579434657927079}]}, "total": {"test_micro_f1_no_misc": 67.42801409615188, "test_micro_f1_no_misc_se": 0.8983351042716555, "test_micro_f1": 60.39649883157872, "test_micro_f1_se": 1.0786850011867384}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "conll-nl", "task": "named-entity-recognition", "dataset_languages": ["nl"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.6240074731433909, "micro_f1": 0.39880209633142}, {"micro_f1_no_misc": 0.6177105831533478, "micro_f1": 0.49001490312965723}, {"micro_f1_no_misc": 0.6554953178905865, "micro_f1": 0.5061278569062604}, {"micro_f1_no_misc": 0.6221628838451269, "micro_f1": 0.44438356164383563}, {"micro_f1_no_misc": 0.6383389666827619, "micro_f1": 0.4803352675693101}, {"micro_f1_no_misc": 0.6123032904148784, "micro_f1": 0.4757097791798107}, {"micro_f1_no_misc": 0.6279264214046824, "micro_f1": 0.42970958622462097}, {"micro_f1_no_misc": 0.6112895823772372, "micro_f1": 0.460156476383657}, {"micro_f1_no_misc": 0.6176897051715805, "micro_f1": 0.422275163787969}, {"micro_f1_no_misc": 0.5886828340466, "micro_f1": 0.4687399807630651}]}, "total": {"test_micro_f1_no_misc": 62.15607058130191, "test_micro_f1_no_misc_se": 1.0896961174131132, "test_micro_f1": 45.76254671919606, "test_micro_f1_se": 2.0700205068659603}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "danish-citizen-tests", "task": "knowledge", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.7487312981149461, "accuracy": 0.83203125}, {"mcc": 0.7918681579523887, "accuracy": 0.861328125}, {"mcc": 0.812043552111651, "accuracy": 0.873046875}, {"mcc": 0.7783980327416375, "accuracy": 
0.8515625}, {"mcc": 0.7280019680321452, "accuracy": 0.818359375}, {"mcc": 0.7683112832085706, "accuracy": 0.84765625}, {"mcc": 0.8363009856847436, "accuracy": 0.890625}, {"mcc": 0.7499177686287761, "accuracy": 0.833984375}, {"mcc": 0.763735741543181, "accuracy": 0.83984375}, {"mcc": 0.7783713138054275, "accuracy": 0.853515625}]}, "total": {"test_mcc": 77.5568010182347, "test_mcc_se": 1.9745217295831479, "test_accuracy": 85.01953125, "test_accuracy_se": 1.3088605138541547}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "dansk", "task": "named-entity-recognition", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.5448123620309051, "micro_f1": 0.3825873697118332}, {"micro_f1_no_misc": 0.4802578565672844, "micro_f1": 0.34192297035189806}, {"micro_f1_no_misc": 0.44192634560906513, "micro_f1": 0.3029177718832891}, {"micro_f1_no_misc": 0.4811783960720131, "micro_f1": 0.34061624649859945}, {"micro_f1_no_misc": 0.47838383838383836, "micro_f1": 0.3565345080763583}, {"micro_f1_no_misc": 0.48847926267281105, "micro_f1": 0.3220112329499866}, {"micro_f1_no_misc": 0.4185698240359417, "micro_f1": 0.2900226187484293}, {"micro_f1_no_misc": 0.5652528548123982, "micro_f1": 0.41249263406010606}, {"micro_f1_no_misc": 0.5192307692307692, "micro_f1": 0.3679012345679012}, {"micro_f1_no_misc": 0.49876135425268375, "micro_f1": 0.35710275126333524}]}, "total": {"test_micro_f1_no_misc": 49.1685286366771, "test_micro_f1_no_misc_se": 2.7169170219174363, "test_micro_f1": 34.741093381117366, "test_micro_f1_se": 2.2692894703936}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "danske-talemaader", "task": "knowledge", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.6334746970851035, "accuracy": 0.7236328125}, {"mcc": 0.6182536254251814, "accuracy": 0.7109375}, {"mcc": 0.6623348847203763, "accuracy": 0.7470703125}, {"mcc": 0.6539086123992836, "accuracy": 0.740234375}, {"mcc": 0.6867628770411217, "accuracy": 0.7646484375}, {"mcc": 0.6680662671286371, "accuracy": 0.7509765625}, {"mcc": 0.6545865442412065, "accuracy": 0.740234375}, {"mcc": 0.6010029372876066, "accuracy": 0.7001953125}, {"mcc": 0.7065239854623881, "accuracy": 0.779296875}, {"mcc": 0.6919315116852112, "accuracy": 0.7685546875}]}, "total": {"test_mcc": 65.76845942476115, "test_mcc_se": 2.056773334301985, "test_accuracy": 74.2578125, "test_accuracy_se": 1.5691162222278203}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "dutch-social", "task": "sentiment-classification", "dataset_languages": ["nl"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.055783330610642085, "macro_f1": 0.22582872840507626}, {"mcc": 0.08846209271104587, "macro_f1": 0.24204880593406872}, {"mcc": 0.07545361427773793, "macro_f1": 0.24549587770983047}, {"mcc": 0.08732153099252354, "macro_f1": 0.2452163824554677}, {"mcc": 0.09386319478228963, "macro_f1": 0.25167638583354995}, {"mcc": 0.07497890541247694, "macro_f1": 0.2253842169768432}, {"mcc": 0.0895838347181929, "macro_f1": 0.2407973553156728}, {"mcc": 0.10695809962098839, 
"macro_f1": 0.23313212469664715}, {"mcc": 0.11291663975635005, "macro_f1": 0.2620866634885002}, {"mcc": 0.10655402674683144, "macro_f1": 0.25652516360471805}]}, "total": {"test_mcc": 8.918752696290786, "test_mcc_se": 1.0760842497222487, "test_macro_f1": 24.281917044203745, "test_macro_f1_se": 0.7567464322402746}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "fone", "task": "named-entity-recognition", "dataset_languages": ["fo"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.6266583563351439, "micro_f1": 0.5563251333492556}, {"micro_f1_no_misc": 0.6921062476618032, "micro_f1": 0.6233202570737001}, {"micro_f1_no_misc": 0.6501938858328074, "micro_f1": 0.570137157107232}, {"micro_f1_no_misc": 0.6166254238841536, "micro_f1": 0.5698091189464032}, {"micro_f1_no_misc": 0.708410306271269, "micro_f1": 0.6248536544572671}, {"micro_f1_no_misc": 0.6755256211886775, "micro_f1": 0.6101364522417153}, {"micro_f1_no_misc": 0.7121886542286201, "micro_f1": 0.6321993937352645}, {"micro_f1_no_misc": 0.6965473262275668, "micro_f1": 0.5800367985280589}, {"micro_f1_no_misc": 0.7122371777953725, "micro_f1": 0.6386041780768328}, {"micro_f1_no_misc": 0.6986288592482044, "micro_f1": 0.5935880829015543}]}, "total": {"test_micro_f1_no_misc": 67.89121858673617, "test_micro_f1_no_misc_se": 2.206696483664736, "test_micro_f1": 59.990102264172826, "test_micro_f1_se": 1.8411585386849543}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "germanquad", "task": "reading-comprehension", "dataset_languages": ["de"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"em": 33.23005422153369, "f1": 60.44775445306856}, {"em": 35.58139534883721, "f1": 69.10724732020131}, {"em": 28.5935085007728, "f1": 55.20371730864918}, {"em": 34.50155763239876, "f1": 63.19218222176982}, {"em": 36.67953667953668, "f1": 68.02791858507516}, {"em": 37.085582112567465, "f1": 69.15209618437655}, {"em": 35.231586940015184, "f1": 63.02943749359389}, {"em": 35.2211016291699, "f1": 64.24434676681467}, {"em": 25.49019607843137, "f1": 59.92506194005498}, {"em": 33.850931677018636, "f1": 67.21384454480352}]}, "total": {"test_em": 33.546545082028175, "test_em_se": 2.288222359114571, "test_f1": 63.95436068184076, "test_f1_se": 2.8283568171903783}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "germeval", "task": "named-entity-recognition", "dataset_languages": ["de"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.5795996186844614, "micro_f1": 0.480512351326624}, {"micro_f1_no_misc": 0.5512763098969995, "micro_f1": 0.4072773699329716}, {"micro_f1_no_misc": 0.5896193771626298, "micro_f1": 0.4421087754421088}, {"micro_f1_no_misc": 0.5793366738248628, "micro_f1": 0.42014534392428593}, {"micro_f1_no_misc": 0.5531491162559123, "micro_f1": 0.358638325075915}, {"micro_f1_no_misc": 0.5829239766081872, "micro_f1": 0.4678635547576301}, {"micro_f1_no_misc": 0.6120756672713139, "micro_f1": 0.4270389919972757}, {"micro_f1_no_misc": 0.6151300236406618, "micro_f1": 0.4656862745098039}, {"micro_f1_no_misc": 0.6000931098696461, "micro_f1": 0.4572812175717744}, 
{"micro_f1_no_misc": 0.5187586845761928, "micro_f1": 0.39555935098206657}]}, "total": {"test_micro_f1_no_misc": 57.819625577908674, "test_micro_f1_no_misc_se": 1.8516400919914577, "test_micro_f1": 43.22111555520456, "test_micro_f1_se": 2.3557625927402546}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "hellaswag-da", "task": "common-sense-reasoning", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.2763767926206515, "accuracy": 0.447265625}, {"mcc": 0.31778905330127777, "accuracy": 0.4833984375}, {"mcc": 0.2751161455761231, "accuracy": 0.45166015625}, {"mcc": 0.336243577876775, "accuracy": 0.49072265625}, {"mcc": 0.2953824596569701, "accuracy": 0.45703125}, {"mcc": 0.3021933713300136, "accuracy": 0.47216796875}, {"mcc": 0.275485122519006, "accuracy": 0.443359375}, {"mcc": 0.3391611538931869, "accuracy": 0.49462890625}, {"mcc": 0.300999188936978, "accuracy": 0.46044921875}, {"mcc": 0.31503013270137875, "accuracy": 0.48388671875}]}, "total": {"test_mcc": 30.337769984123607, "test_mcc_se": 1.4726591219960752, "test_accuracy": 46.845703125, "test_accuracy_se": 1.1708334202161148}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "hellaswag-de", "task": "common-sense-reasoning", "dataset_languages": ["de"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.3948764275290547, "accuracy": 0.51806640625}, {"mcc": 0.4334805532532269, "accuracy": 0.5673828125}, {"mcc": 0.3852734133475938, "accuracy": 0.49951171875}, {"mcc": 0.409287806432935, "accuracy": 0.53271484375}, {"mcc": 0.3902491596663706, "accuracy": 0.51611328125}, {"mcc": 0.40114955643717043, "accuracy": 0.5322265625}, {"mcc": 0.33117718062534474, "accuracy": 0.4697265625}, {"mcc": 0.38722784109820896, "accuracy": 0.51904296875}, {"mcc": 0.33177617007611915, "accuracy": 0.48095703125}, {"mcc": 0.4634328350988746, "accuracy": 0.58740234375}]}, "total": {"test_mcc": 39.27930943564898, "test_mcc_se": 2.4968331255583047, "test_accuracy": 52.2314453125, "test_accuracy_se": 2.219131939270454}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "hellaswag-nl", "task": "common-sense-reasoning", "dataset_languages": ["nl"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.3640877942570892, "accuracy": 0.51904296875}, {"mcc": 0.3566510856380519, "accuracy": 0.51611328125}, {"mcc": 0.2835703913658462, "accuracy": 0.44677734375}, {"mcc": 0.3237940168557, "accuracy": 0.4814453125}, {"mcc": 0.32867628589473086, "accuracy": 0.48876953125}, {"mcc": 0.2950440068359512, "accuracy": 0.46337890625}, {"mcc": 0.33806534264176163, "accuracy": 0.48095703125}, {"mcc": 0.314173454981738, "accuracy": 0.47509765625}, {"mcc": 0.36948182340202795, "accuracy": 0.52392578125}, {"mcc": 0.35791300480345356, "accuracy": 0.5166015625}]}, "total": {"test_mcc": 33.3145720667635, "test_mcc_se": 1.8298790716896163, "test_accuracy": 49.12109375, "test_accuracy_se": 1.6430739823711953}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} 
+{"dataset": "hellaswag-no", "task": "common-sense-reasoning", "dataset_languages": ["nb", "nn", "no"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.29152405145932375, "accuracy": 0.44970703125}, {"mcc": 0.3032268957129148, "accuracy": 0.46923828125}, {"mcc": 0.33428676125944434, "accuracy": 0.48583984375}, {"mcc": 0.36256865160259777, "accuracy": 0.51904296875}, {"mcc": 0.30398906495008915, "accuracy": 0.47607421875}, {"mcc": 0.3051222146204048, "accuracy": 0.46630859375}, {"mcc": 0.27545316666663416, "accuracy": 0.43505859375}, {"mcc": 0.3645407780611671, "accuracy": 0.5205078125}, {"mcc": 0.35172711322023187, "accuracy": 0.5048828125}, {"mcc": 0.32367029412235976, "accuracy": 0.48486328125}]}, "total": {"test_mcc": 32.16108991675167, "test_mcc_se": 1.9102095756473112, "test_accuracy": 48.115234375, "test_accuracy_se": 1.7390135346989404}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "hellaswag-sv", "task": "common-sense-reasoning", "dataset_languages": ["sv"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.3255767986191176, "accuracy": 0.48681640625}, {"mcc": 0.2573583456387363, "accuracy": 0.4365234375}, {"mcc": 0.33296084464025144, "accuracy": 0.49658203125}, {"mcc": 0.339418209794295, "accuracy": 0.49853515625}, {"mcc": 0.25992805051321216, "accuracy": 0.43115234375}, {"mcc": 0.274702003024931, "accuracy": 0.4384765625}, {"mcc": 0.2335981067291543, "accuracy": 0.4130859375}, {"mcc": 0.24355164719260358, "accuracy": 0.427734375}, {"mcc": 0.3454818019745888, "accuracy": 0.50244140625}, {"mcc": 0.271228302491543, "accuracy": 0.443359375}]}, "total": {"test_mcc": 28.83804110618433, "test_mcc_se": 2.653879085978296, "test_accuracy": 45.7470703125, "test_accuracy_se": 2.131622257083236}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "hellaswag", "task": "common-sense-reasoning", "dataset_languages": ["en"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.50045898177155, "accuracy": 0.6083984375}, {"mcc": 0.6372602090881178, "accuracy": 0.7236328125}, {"mcc": 0.5304392884629627, "accuracy": 0.640625}, {"mcc": 0.5968766168568715, "accuracy": 0.69189453125}, {"mcc": 0.5345585431554953, "accuracy": 0.64013671875}, {"mcc": 0.5966659966315644, "accuracy": 0.6943359375}, {"mcc": 0.635372825854554, "accuracy": 0.7236328125}, {"mcc": 0.6016132981354979, "accuracy": 0.69580078125}, {"mcc": 0.5678628735380534, "accuracy": 0.6689453125}, {"mcc": 0.5866195737883739, "accuracy": 0.6875}]}, "total": {"test_mcc": 57.877282072830404, "test_mcc_se": 2.8023830158940317, "test_accuracy": 67.7490234375, "test_accuracy_se": 2.3361605681955986}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "mim-gold-ner", "task": "named-entity-recognition", "dataset_languages": ["is"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.3523443504996157, "micro_f1": 0.26019376131161503}, {"micro_f1_no_misc": 0.46153846153846156, "micro_f1": 0.3810182623132264}, {"micro_f1_no_misc": 0.47597078046905034, "micro_f1": 0.380409071935439}, {"micro_f1_no_misc": 0.39227746454809503, "micro_f1": 
0.28518731988472623}, {"micro_f1_no_misc": 0.4612935323383085, "micro_f1": 0.4068713217750914}, {"micro_f1_no_misc": 0.3905835543766578, "micro_f1": 0.29677571193221935}, {"micro_f1_no_misc": 0.46292811441475346, "micro_f1": 0.39773235864538264}, {"micro_f1_no_misc": 0.4445646295294754, "micro_f1": 0.343063872255489}, {"micro_f1_no_misc": 0.37566428440535093, "micro_f1": 0.28346055979643764}, {"micro_f1_no_misc": 0.39826551034022684, "micro_f1": 0.3170197224251278}]}, "total": {"test_micro_f1_no_misc": 42.154306824599956, "test_micro_f1_no_misc_se": 2.7436078299264595, "test_micro_f1": 33.51731962274754, "test_micro_f1_se": 3.3225840109377494}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "mlsum", "task": "summarization", "dataset_languages": ["de"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"bertscore": 0.7106385409715585, "rouge_l": 0.3200267358761247}, {"bertscore": 0.6810438845714089, "rouge_l": 0.2535924083297121}, {"bertscore": 0.6873981252720114, "rouge_l": 0.2676453404686747}, {"bertscore": 0.6841653371520806, "rouge_l": 0.25219130552611413}, {"bertscore": 0.7021989381173626, "rouge_l": 0.30050537925593745}, {"bertscore": 0.6690387488488341, "rouge_l": 0.21613306449047257}, {"bertscore": 0.6890311362221837, "rouge_l": 0.2694646314305539}, {"bertscore": 0.6997596587461885, "rouge_l": 0.29949237623547525}, {"bertscore": 0.7077829950430896, "rouge_l": 0.31936543444285603}, {"bertscore": 0.7059375142125646, "rouge_l": 0.31420973870195623}]}, "total": {"test_bertscore": 69.36994879157282, "test_bertscore_se": 0.8429593577815244, "test_rouge_l": 28.126264147578766, "test_rouge_l_se": 2.15895434394283}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "mmlu-de", "task": "knowledge", "dataset_languages": ["de"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.514402461637217, "accuracy": 0.63525390625}, {"mcc": 0.5054427199740312, "accuracy": 0.6279296875}, {"mcc": 0.5013168733727311, "accuracy": 0.62548828125}, {"mcc": 0.5229057402271983, "accuracy": 0.6416015625}, {"mcc": 0.5257942418532754, "accuracy": 0.6435546875}, {"mcc": 0.4974804030720876, "accuracy": 0.62255859375}, {"mcc": 0.5117092770809332, "accuracy": 0.6318359375}, {"mcc": 0.5133115372692787, "accuracy": 0.6357421875}, {"mcc": 0.5011059307740975, "accuracy": 0.62548828125}, {"mcc": 0.5029443859215884, "accuracy": 0.62744140625}]}, "total": {"test_mcc": 50.96413571182439, "test_mcc_se": 0.5951402247503085, "test_accuracy": 63.1689453125, "test_accuracy_se": 0.44342447916666655}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "mmlu-nl", "task": "knowledge", "dataset_languages": ["nl"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.44720234619626803, "accuracy": 0.5849609375}, {"mcc": 0.4637151596537735, "accuracy": 0.59716796875}, {"mcc": 0.46255162078138373, "accuracy": 0.59619140625}, {"mcc": 0.46516748571450794, "accuracy": 0.59814453125}, {"mcc": 0.4488138305209938, "accuracy": 0.5859375}, {"mcc": 0.4752674546930024, "accuracy": 0.60595703125}, {"mcc": 0.471646821371307, "accuracy": 0.603515625}, {"mcc": 0.4734708854315509, 
"accuracy": 0.6044921875}, {"mcc": 0.44999582699913093, "accuracy": 0.5869140625}, {"mcc": 0.45888868463751425, "accuracy": 0.59326171875}]}, "total": {"test_mcc": 46.167201159994335, "test_mcc_se": 0.639965617723984, "test_accuracy": 59.56542968750001, "test_accuracy_se": 0.4826344007592892}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "mmlu-no", "task": "knowledge", "dataset_languages": ["nb", "nn", "no"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.40105639263858195, "accuracy": 0.5498046875}, {"mcc": 0.4177713426642163, "accuracy": 0.56396484375}, {"mcc": 0.39914982247970915, "accuracy": 0.54931640625}, {"mcc": 0.4123322016368145, "accuracy": 0.556640625}, {"mcc": 0.4028323563045292, "accuracy": 0.5517578125}, {"mcc": 0.42344341547836284, "accuracy": 0.5673828125}, {"mcc": 0.39599258021769757, "accuracy": 0.54541015625}, {"mcc": 0.3973343334875812, "accuracy": 0.54541015625}, {"mcc": 0.3779798986509559, "accuracy": 0.53125}, {"mcc": 0.3710341427584269, "accuracy": 0.529296875}]}, "total": {"test_mcc": 39.98926486316875, "test_mcc_se": 1.0067089193967442, "test_accuracy": 54.90234375000001, "test_accuracy_se": 0.7612527064327688}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "mmlu-sv", "task": "knowledge", "dataset_languages": ["sv"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.4063424226237854, "accuracy": 0.55224609375}, {"mcc": 0.42681415992997285, "accuracy": 0.56884765625}, {"mcc": 0.4415829919166096, "accuracy": 0.578125}, {"mcc": 0.4375905550652955, "accuracy": 0.5771484375}, {"mcc": 0.4289987271501374, "accuracy": 0.5703125}, {"mcc": 0.45440377512289853, "accuracy": 0.5908203125}, {"mcc": 0.4453528378649612, "accuracy": 0.58349609375}, {"mcc": 0.4452109070596511, "accuracy": 0.583984375}, {"mcc": 0.43570576396083877, "accuracy": 0.57666015625}, {"mcc": 0.4399179729888803, "accuracy": 0.5791015625}]}, "total": {"test_mcc": 43.619201136830306, "test_mcc_se": 0.8193416753122832, "test_accuracy": 57.607421875, "test_accuracy_se": 0.6543680516841321}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "mmlu", "task": "knowledge", "dataset_languages": ["en"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.6099623712237747, "accuracy": 0.70654296875}, {"mcc": 0.6086397615375846, "accuracy": 0.705078125}, {"mcc": 0.5773516719611181, "accuracy": 0.681640625}, {"mcc": 0.6021466024370159, "accuracy": 0.70166015625}, {"mcc": 0.5939579306716496, "accuracy": 0.69580078125}, {"mcc": 0.5827093641762607, "accuracy": 0.68798828125}, {"mcc": 0.5996239341580418, "accuracy": 0.69873046875}, {"mcc": 0.5573914413395569, "accuracy": 0.66748046875}, {"mcc": 0.6000993317954143, "accuracy": 0.69921875}, {"mcc": 0.5817972698107822, "accuracy": 0.68408203125}]}, "total": {"test_mcc": 59.13679679111199, "test_mcc_se": 1.01819056996072, "test_accuracy": 69.2822265625, "test_accuracy_se": 0.7642213967795112}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": 
"no-sammendrag", "task": "summarization", "dataset_languages": ["nb", "nn", "no"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"bertscore": 0.6390248541720212, "rouge_l": 0.15051950159612124}, {"bertscore": 0.652024975119275, "rouge_l": 0.1822603611098965}, {"bertscore": 0.6533482232771348, "rouge_l": 0.18097419591397884}, {"bertscore": 0.6318108073901385, "rouge_l": 0.13599799347573888}, {"bertscore": 0.6491617570572998, "rouge_l": 0.17159093835694836}, {"bertscore": 0.6494091944186948, "rouge_l": 0.17122748977409527}, {"bertscore": 0.6565147308865562, "rouge_l": 0.18877313064391063}, {"bertscore": 0.6607806792744668, "rouge_l": 0.1984844930688449}, {"bertscore": 0.657618027747958, "rouge_l": 0.19056571043341813}, {"bertscore": 0.6203150792862289, "rouge_l": 0.13982639300151858}]}, "total": {"test_bertscore": 64.70008328629774, "test_bertscore_se": 0.7936338189924962, "test_rouge_l": 17.102202073744717, "test_rouge_l_se": 1.3543333736249867}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "nordjylland-news", "task": "summarization", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"bertscore": 0.6723040144715924, "rouge_l": 0.2132586562910368}, {"bertscore": 0.6884205529931933, "rouge_l": 0.2566154763051667}, {"bertscore": 0.654656048733159, "rouge_l": 0.21936538515533544}, {"bertscore": 0.6735598470404511, "rouge_l": 0.21479054642140394}, {"bertscore": 0.6554798600118374, "rouge_l": 0.21995870866450523}, {"bertscore": 0.672920772980433, "rouge_l": 0.21962325136587368}, {"bertscore": 0.6643207747547422, "rouge_l": 0.1981183746111051}, {"bertscore": 0.669471997738583, "rouge_l": 0.20747875338132274}, {"bertscore": 0.6592364808020648, "rouge_l": 0.1830793306083483}, {"bertscore": 0.6729089717555325, "rouge_l": 0.22026584419489684}]}, "total": {"test_bertscore": 66.83279321281589, "test_bertscore_se": 0.6337635493008186, "test_rouge_l": 21.52554326998995, "test_rouge_l_se": 1.1664347859232422}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "norec", "task": "sentiment-classification", "dataset_languages": ["nb", "nn", "no"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.5914447709557425, "macro_f1": 0.7296629983283278}, {"mcc": 0.5487751862762847, "macro_f1": 0.685864955298046}, {"mcc": 0.5752170409924663, "macro_f1": 0.6906832937458004}, {"mcc": 0.46269480048231504, "macro_f1": 0.5942959032931256}, {"mcc": 0.5057815582064938, "macro_f1": 0.6494521302003479}, {"mcc": 0.5078372970304633, "macro_f1": 0.6618528328031262}, {"mcc": 0.39075313300706027, "macro_f1": 0.51407186639842}, {"mcc": 0.553858050862248, "macro_f1": 0.6973566322080469}, {"mcc": 0.4884277867437218, "macro_f1": 0.6180247929689242}, {"mcc": 0.5844419494630197, "macro_f1": 0.7303769144675162}]}, "total": {"test_mcc": 52.09231574019816, "test_mcc_se": 3.8985685470294076, "test_macro_f1": 65.71642319711681, "test_macro_f1_se": 4.143393594377484}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "norne-nb", "task": "named-entity-recognition", "dataset_languages": ["nb", "no"], "model": "skole-gpt-mixtral", "results": 
{"raw": {"test": [{"micro_f1_no_misc": 0.5522137119372316, "micro_f1": 0.4169406813100092}, {"micro_f1_no_misc": 0.6776232616940581, "micro_f1": 0.5952093744614855}, {"micro_f1_no_misc": 0.6286052943500593, "micro_f1": 0.4640044994375703}, {"micro_f1_no_misc": 0.6341370799316498, "micro_f1": 0.5478004041660189}, {"micro_f1_no_misc": 0.6166434540389972, "micro_f1": 0.5075157146761411}, {"micro_f1_no_misc": 0.5932294465341215, "micro_f1": 0.47087776866283837}, {"micro_f1_no_misc": 0.6503536608679029, "micro_f1": 0.5768616171300736}, {"micro_f1_no_misc": 0.6216977809087708, "micro_f1": 0.4267701260911736}, {"micro_f1_no_misc": 0.6228995701445879, "micro_f1": 0.4466177669111655}, {"micro_f1_no_misc": 0.6543593492962896, "micro_f1": 0.4897959183673469}]}, "total": {"test_micro_f1_no_misc": 62.51762609703668, "test_micro_f1_no_misc_se": 2.143343718231484, "test_micro_f1": 49.423938712138224, "test_micro_f1_se": 3.828615976633605}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "norne-nn", "task": "named-entity-recognition", "dataset_languages": ["nn"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.6440113276694985, "micro_f1": 0.5292493755751282}, {"micro_f1_no_misc": 0.638187478381183, "micro_f1": 0.5027322404371585}, {"micro_f1_no_misc": 0.6203609124957441, "micro_f1": 0.5317059483726151}, {"micro_f1_no_misc": 0.598369011213048, "micro_f1": 0.39468280557656976}, {"micro_f1_no_misc": 0.5786143112120721, "micro_f1": 0.3938320483434049}, {"micro_f1_no_misc": 0.6435866366626277, "micro_f1": 0.5478906871849296}, {"micro_f1_no_misc": 0.5961474593158419, "micro_f1": 0.49256954147084975}, {"micro_f1_no_misc": 0.5733228097868981, "micro_f1": 0.4196714080935054}, {"micro_f1_no_misc": 0.6206774250753679, "micro_f1": 0.5057878251395888}, {"micro_f1_no_misc": 0.6412949888066127, "micro_f1": 0.43537414965986393}]}, "total": {"test_micro_f1_no_misc": 61.54572360618894, "test_micro_f1_no_misc_se": 1.6834886414269081, "test_micro_f1": 47.534960298536134, "test_micro_f1_se": 3.6473672425366686}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "norquad", "task": "reading-comprehension", "dataset_languages": ["nb", "nn", "no"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"em": 51.282051282051285, "f1": 74.2550107828616}, {"em": 36.090225563909776, "f1": 61.66679522873849}, {"em": 54.30242272347535, "f1": 77.07168415402032}, {"em": 44.83043837882548, "f1": 73.63881589823525}, {"em": 50.20542317173377, "f1": 74.44347915794044}, {"em": 55.61056105610561, "f1": 77.40539427811072}, {"em": 51.11294311624073, "f1": 77.09582505074827}, {"em": 49.958368026644465, "f1": 75.11248730827052}, {"em": 40.18379281537176, "f1": 67.40949892153573}, {"em": 39.447236180904525, "f1": 67.87853540957244}]}, "total": {"test_em": 47.30234623152628, "test_em_se": 4.17350155528098, "test_f1": 72.59775261900339, "test_f1_se": 3.2377783402178264}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "nqii", "task": "reading-comprehension", "dataset_languages": ["is"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"em": 27.811550151975684, "f1": 
56.45740672245821}, {"em": 16.901408450704224, "f1": 51.0314856348443}, {"em": 18.856259659969087, "f1": 53.47554410859639}, {"em": 23.802163833075735, "f1": 54.655167771670214}, {"em": 31.57051282051282, "f1": 57.13163422503625}, {"em": 27.56508422664625, "f1": 56.416204580581294}, {"em": 30.568356374807987, "f1": 57.55089380877086}, {"em": 32.6219512195122, "f1": 58.01444924500172}, {"em": 31.89522342064715, "f1": 55.550109801160225}, {"em": 33.53566009104704, "f1": 60.79647022429977}]}, "total": {"test_em": 27.512817024889813, "test_em_se": 3.6304616425154252, "test_f1": 56.107936612241915, "test_f1_se": 1.6539263534480264}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "rrn", "task": "summarization", "dataset_languages": ["is"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"bertscore": 0.6716969274566509, "rouge_l": 0.2158190451257037}, {"bertscore": 0.6688159213808831, "rouge_l": 0.1810354572944225}, {"bertscore": 0.6769947757129557, "rouge_l": 0.21577831819840737}, {"bertscore": 0.6699207223136909, "rouge_l": 0.19445588699516658}, {"bertscore": 0.6697677207703236, "rouge_l": 0.19539616698307505}, {"bertscore": 0.6715953071252443, "rouge_l": 0.18822421113840593}, {"bertscore": 0.6550151575065684, "rouge_l": 0.17173888053038933}, {"bertscore": 0.6677836592425592, "rouge_l": 0.19234386286879682}, {"bertscore": 0.6685689899022691, "rouge_l": 0.195909957722126}, {"bertscore": 0.6692902076465543, "rouge_l": 0.20955583694569602}]}, "total": {"test_bertscore": 66.894493890577, "test_bertscore_se": 0.3436629360348251, "test_rouge_l": 19.60257623802189, "test_rouge_l_se": 0.890274776121395}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "sb10k", "task": "sentiment-classification", "dataset_languages": ["de"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.6004141795942156, "macro_f1": 0.7361369261344667}, {"mcc": 0.5621320255878345, "macro_f1": 0.7088923692858069}, {"mcc": 0.6185721976889894, "macro_f1": 0.7496985424030376}, {"mcc": 0.5755914943086926, "macro_f1": 0.7179205039323898}, {"mcc": 0.6251165358664775, "macro_f1": 0.7479009238143203}, {"mcc": 0.626535373095385, "macro_f1": 0.7490062286693256}, {"mcc": 0.5514791335157965, "macro_f1": 0.6989429970351088}, {"mcc": 0.6299141938827565, "macro_f1": 0.753754554951802}, {"mcc": 0.603056203373947, "macro_f1": 0.7327891320693981}, {"mcc": 0.5522660619134898, "macro_f1": 0.7003014991355881}]}, "total": {"test_mcc": 59.45077398827585, "test_mcc_se": 1.954021222014685, "test_macro_f1": 72.95343677431244, "test_macro_f1_se": 1.325356986763912}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-da", "task": "linguistic-acceptability", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.34920271781604434, "macro_f1": 0.6418902330875964}, {"mcc": 0.27703782498633156, "macro_f1": 0.6208736689851952}, {"mcc": 0.3352585985327314, "macro_f1": 0.6676235600059953}, {"mcc": 0.268958371788544, "macro_f1": 0.6309050056406418}, {"mcc": 0.31050190060831556, "macro_f1": 0.6551283947201458}, {"mcc": 0.36061203384671386, "macro_f1": 
0.6801696047158199}, {"mcc": 0.36630702172182567, "macro_f1": 0.6484389187314599}, {"mcc": 0.2837033489903027, "macro_f1": 0.6134972617990231}, {"mcc": 0.2981451498120715, "macro_f1": 0.6299460911591621}, {"mcc": 0.35390015737914715, "macro_f1": 0.6740788177316324}]}, "total": {"test_mcc": 32.03627125482028, "test_mcc_se": 2.2972353629332405, "test_macro_f1": 64.62551556576672, "test_macro_f1_se": 1.4167579110736181}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-de", "task": "linguistic-acceptability", "dataset_languages": ["de"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.3897387146829424, "macro_f1": 0.6927745778324788}, {"mcc": 0.30708062591594937, "macro_f1": 0.6373900862223735}, {"mcc": 0.3765585439029678, "macro_f1": 0.6844299987961959}, {"mcc": 0.4184561079913006, "macro_f1": 0.7054728435641926}, {"mcc": 0.36778472729184675, "macro_f1": 0.6682785941494448}, {"mcc": 0.3985483240908738, "macro_f1": 0.6992365577352765}, {"mcc": 0.2902393358547057, "macro_f1": 0.6408515286981271}, {"mcc": 0.3503256938479051, "macro_f1": 0.6737939419913577}, {"mcc": 0.3703781257944903, "macro_f1": 0.6825342519086073}, {"mcc": 0.4060845071107298, "macro_f1": 0.7029879056418711}]}, "total": {"test_mcc": 36.75194706483712, "test_mcc_se": 2.5762329168104134, "test_macro_f1": 67.87750286539925, "test_macro_f1_se": 1.4963616116913832}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-en", "task": "linguistic-acceptability", "dataset_languages": ["en"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.35197291174027295, "macro_f1": 0.6744287602229415}, {"mcc": 0.36056100977686273, "macro_f1": 0.6796569497060496}, {"mcc": 0.38484426768455376, "macro_f1": 0.6880150471059072}, {"mcc": 0.43174605417766043, "macro_f1": 0.6816979488397877}, {"mcc": 0.4605210873894839, "macro_f1": 0.7289687745747699}, {"mcc": 0.3640743534069476, "macro_f1": 0.6810402993314114}, {"mcc": 0.36956686514434395, "macro_f1": 0.6698721216694001}, {"mcc": 0.40325790407142825, "macro_f1": 0.7016287847266351}, {"mcc": 0.4187910695263388, "macro_f1": 0.6994047619047619}, {"mcc": 0.42975511987590087, "macro_f1": 0.7144505181772689}]}, "total": {"test_mcc": 39.750906427937934, "test_mcc_se": 2.2788329337826467, "test_macro_f1": 69.19163966258934, "test_macro_f1_se": 1.1707360837393903}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-fo", "task": "linguistic-acceptability", "dataset_languages": ["fo"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.0011009549921192898, "macro_f1": 0.34073906259799536}, {"mcc": 0.013404496178620483, "macro_f1": 0.38182843183312976}, {"mcc": 0.03017131690998504, "macro_f1": 0.37681159420289856}, {"mcc": 0.012647963396375698, "macro_f1": 0.431847201588899}, {"mcc": -0.030129739907658926, "macro_f1": 0.4191692922632421}, {"mcc": 0.0028080619506621047, "macro_f1": 0.49556650246305417}, {"mcc": -0.06490486869266075, "macro_f1": 0.35862030625544344}, {"mcc": 0.0, "macro_f1": 0.3359273670557717}, {"mcc": 0.002684244135045226, "macro_f1": 0.36721670551813174}, {"mcc": -0.00904894661130757, "macro_f1": 
0.38734684080810683}]}, "total": {"test_mcc": -0.412665176488194, "test_mcc_se": 1.6396319228360534, "test_macro_f1": 38.95073304586673, "test_macro_f1_se": 2.982944012846368}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-is", "task": "linguistic-acceptability", "dataset_languages": ["is"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.06209936054661416, "macro_f1": 0.4605182908861754}, {"mcc": -0.025815002001404765, "macro_f1": 0.38997815290647475}, {"mcc": 0.0352125389618339, "macro_f1": 0.46664784327817865}, {"mcc": 0.04104486514439594, "macro_f1": 0.4977358794617678}, {"mcc": 0.038520378778925786, "macro_f1": 0.40700483091787437}, {"mcc": 0.07165460018374402, "macro_f1": 0.5356125335746296}, {"mcc": 0.034169419697992164, "macro_f1": 0.5140807618458377}, {"mcc": 0.06773110116423657, "macro_f1": 0.4986817345008185}, {"mcc": 0.05036414012238881, "macro_f1": 0.4248314449744337}, {"mcc": 0.07366709837251391, "macro_f1": 0.5357909587192542}]}, "total": {"test_mcc": 4.486485009712405, "test_mcc_se": 1.807673291725086, "test_macro_f1": 47.308824310654444, "test_macro_f1_se": 3.2395283014377503}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-nb", "task": "linguistic-acceptability", "dataset_languages": ["nb", "no"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.214253415140715, "macro_f1": 0.6012429544956515}, {"mcc": 0.20337888517564065, "macro_f1": 0.57765961983042}, {"mcc": 0.1874599702318662, "macro_f1": 0.5146384479717813}, {"mcc": 0.19217521812876043, "macro_f1": 0.5451920941594492}, {"mcc": 0.2512585932390895, "macro_f1": 0.5881007972320683}, {"mcc": 0.26963345454006005, "macro_f1": 0.6324348312202468}, {"mcc": 0.17384814519289427, "macro_f1": 0.5441173245268511}, {"mcc": 0.24524827137506383, "macro_f1": 0.6148684294483993}, {"mcc": 0.2656487524697225, "macro_f1": 0.6250884339995502}, {"mcc": 0.1957211512456187, "macro_f1": 0.5781477852659458}]}, "total": {"test_mcc": 21.986258567394312, "test_mcc_se": 2.169173653401366, "test_macro_f1": 58.21490718150364, "test_macro_f1_se": 2.3780870236712945}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-nl", "task": "linguistic-acceptability", "dataset_languages": ["nl"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.2667270934107124, "macro_f1": 0.5373595706040971}, {"mcc": 0.273779530484507, "macro_f1": 0.6359336462124521}, {"mcc": 0.4131383128328353, "macro_f1": 0.7036264381814795}, {"mcc": 0.37108085363748894, "macro_f1": 0.6740169092223147}, {"mcc": 0.3393189937420753, "macro_f1": 0.6691400695320913}, {"mcc": 0.28942891171036716, "macro_f1": 0.642271083808353}, {"mcc": 0.3133745604916274, "macro_f1": 0.6566871236448951}, {"mcc": 0.3130699670815257, "macro_f1": 0.6546357868990293}, {"mcc": 0.3757417175998072, "macro_f1": 0.6873923765639682}, {"mcc": 0.32019762294640547, "macro_f1": 0.6563935210947364}]}, "total": {"test_mcc": 32.75857563937352, "test_mcc_se": 2.935627487665181, "test_macro_f1": 65.17456525763417, "test_macro_f1_se": 2.7902164752500234}}, "num_model_parameters": -1, "max_sequence_length": 32768, 
"vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-nn", "task": "linguistic-acceptability", "dataset_languages": ["nn"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.1621543203348454, "macro_f1": 0.5801828097396113}, {"mcc": 0.1983167407703628, "macro_f1": 0.5428024673231993}, {"mcc": 0.1726621545067224, "macro_f1": 0.5534284102812022}, {"mcc": 0.236665627305556, "macro_f1": 0.6090099447938908}, {"mcc": 0.1710701637167153, "macro_f1": 0.5518207282913166}, {"mcc": 0.13076213100127754, "macro_f1": 0.4342152684580229}, {"mcc": 0.18680698712013433, "macro_f1": 0.5680729675970168}, {"mcc": 0.13919226358149264, "macro_f1": 0.47763999638600363}, {"mcc": 0.1263779339663037, "macro_f1": 0.40362748904542683}, {"mcc": 0.16012770725615563, "macro_f1": 0.4554111506737062}]}, "total": {"test_mcc": 16.841360295595656, "test_mcc_se": 2.0724483802848965, "test_macro_f1": 51.76211232589396, "test_macro_f1_se": 4.299946041691093}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scala-sv", "task": "linguistic-acceptability", "dataset_languages": ["sv"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.3195208236733346, "macro_f1": 0.6087917489603055}, {"mcc": 0.3268183574452309, "macro_f1": 0.6507066821069156}, {"mcc": 0.3268853551254594, "macro_f1": 0.6595835737458111}, {"mcc": 0.38034754975169854, "macro_f1": 0.690107848593825}, {"mcc": 0.31425193288315884, "macro_f1": 0.6430243725280562}, {"mcc": 0.3514054961333375, "macro_f1": 0.6538364779874214}, {"mcc": 0.2676675828814306, "macro_f1": 0.562863647862383}, {"mcc": 0.30916706653237963, "macro_f1": 0.6397505869913389}, {"mcc": 0.35290042610573213, "macro_f1": 0.6727044448867725}, {"mcc": 0.3000274604192923, "macro_f1": 0.6464581964102509}]}, "total": {"test_mcc": 32.48992050951055, "test_mcc_se": 1.9438892864185013, "test_macro_f1": 64.2782758007308, "test_macro_f1_se": 2.183289700912671}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scandiqa-da", "task": "reading-comprehension", "dataset_languages": ["da"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"em": 58.55925639039504, "f1": 66.18036236936307}, {"em": 59.457364341085274, "f1": 66.8774231797487}, {"em": 59.19629057187017, "f1": 65.75105214208766}, {"em": 58.56697819314642, "f1": 67.02770533378003}, {"em": 57.99227799227799, "f1": 65.91331577045855}, {"em": 55.43562066306862, "f1": 64.41652175460958}, {"em": 58.76993166287016, "f1": 66.07557295484398}, {"em": 60.35686578743212, "f1": 67.08323398005315}, {"em": 60.3921568627451, "f1": 66.65378151260501}, {"em": 56.52173913043478, "f1": 65.55412711856805}]}, "total": {"test_em": 58.52484815953257, "test_em_se": 0.9705019778749818, "test_f1": 66.15330961161179, "test_f1_se": 0.5063132890281222}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"} +{"dataset": "scandiqa-sv", "task": "reading-comprehension", "dataset_languages": ["sv"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"em": 60.573199070487995, "f1": 66.32117903612865}, {"em": 56.04651162790697, "f1": 
64.66611854983944}, {"em": 60.123647604327665, "f1": 65.93858099654078}, {"em": 60.66978193146417, "f1": 67.61780489645707}, {"em": 58.68725868725869, "f1": 67.61473875759582}, {"em": 57.51734772552043, "f1": 65.88262290266907}, {"em": 60.288534548215644, "f1": 66.4384050944415}, {"em": 58.029480217222655, "f1": 67.50586580360776}, {"em": 59.6078431372549, "f1": 67.53960113960103}, {"em": 57.91925465838509, "f1": 66.03551038333643}]}, "total": {"test_em": 58.946285920804414, "test_em_se": 0.9614379079010096, "test_f1": 66.55604275602175, "test_f1_se": 0.6151141954699942}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"test_speed": 1027.64, "test_speed_short": 140.47}, {"test_speed": 1739.92, "test_speed_short": 256.2}, {"test_speed": 2733.6000000000004, "test_speed_short": 470.05999999999995}, {"test_speed": 3315.92, "test_speed_short": 581.86}, {"test_speed": 2983.2, "test_speed_short": 529.76}, {"test_speed": 3571.7799999999997, "test_speed_short": 670.44}, {"test_speed": 4000.56, "test_speed_short": 755.3}, {"test_speed": 5111.76, "test_speed_short": 1035.92}, {"test_speed": 5497.24, "test_speed_short": 1148.37}, {"test_speed": 5844.96, "test_speed_short": 1272.7}]}, "total": {"test_speed": 3582.6580000000004, "test_speed_se": 977.2029380818664, "test_speed_short": 686.108, "test_speed_short_se": 230.9612879649251}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "squad-nl", "task": "reading-comprehension", "dataset_languages": ["nl"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"em": 54.066615027110764, "f1": 71.06795082167552}, {"em": 57.054263565891475, "f1": 72.62205131152902}, {"em": 57.49613601236476, "f1": 74.50845763095366}, {"em": 56.85358255451713, "f1": 71.22844031052752}, {"em": 58.3011583011583, "f1": 73.36764251470109}, {"em": 58.905165767154976, "f1": 73.16469528245439}, {"em": 58.086560364464695, "f1": 74.34364011443898}, {"em": 55.46935608999224, "f1": 70.74803823192765}, {"em": 55.529411764705884, "f1": 71.43041848847695}, {"em": 56.98757763975155, "f1": 73.23301575724435}]}, "total": {"test_em": 56.87498270871117, "test_em_se": 0.9176467406942986, "test_f1": 72.5714350463929, "test_f1_se": 0.852085858955841}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "squad", "task": "reading-comprehension", "dataset_languages": ["en"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"em": 70.41053446940356, "f1": 85.81355924435837}, {"em": 70.69767441860465, "f1": 84.36529968787616}, {"em": 66.46058732612056, "f1": 83.41672590887084}, {"em": 56.77570093457944, "f1": 78.26502187116355}, {"em": 69.65250965250965, "f1": 85.82340946195579}, {"em": 67.07787201233616, "f1": 82.91586372911807}, {"em": 61.35155656795748, "f1": 80.11322026547639}, {"em": 66.64080682699768, "f1": 82.01400242704706}, {"em": 67.84313725490196, "f1": 84.57403547921079}, {"em": 62.34472049689441, "f1": 81.33033398173112}]}, "total": {"test_em": 65.92550999603057, "test_em_se": 2.7722036827917376, "test_f1": 82.8631472056808, "test_f1_se": 1.5285253939064662}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "sst5", "task": "sentiment-classification", "dataset_languages": ["en"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.7021189465677934, "macro_f1": 0.7079847853078064}, {"mcc": 0.6594299374700751, "macro_f1": 0.7082528407946977}, {"mcc": 0.6994166919403821, "macro_f1": 0.7161217133276804}, {"mcc": 0.6380952250464198, "macro_f1": 0.693862514113174}, {"mcc": 0.6899304908515126, "macro_f1": 0.672594098063552}, {"mcc": 0.6859096253365898, "macro_f1": 0.676713604685351}, {"mcc": 0.6848161264561329, "macro_f1": 0.7133184834217374}, {"mcc": 0.6950049896081338, "macro_f1": 0.6809438173004249}, {"mcc": 0.7135904910037421, "macro_f1": 0.6894871060673773}, {"mcc": 0.6863299955226437, "macro_f1": 0.704044785753711}]}, "total": {"test_mcc": 68.54642519803427, "test_mcc_se": 1.354321691223024, "test_macro_f1": 69.63323748835512, "test_macro_f1_se": 0.9812341990901465}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "suc3", "task": "named-entity-recognition", "dataset_languages": ["sv"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.4842000497636228, "micro_f1": 0.31900698215671064}, {"micro_f1_no_misc": 0.5299514301133297, "micro_f1": 0.3637011945088251}, {"micro_f1_no_misc": 0.5832264683252115, "micro_f1": 0.4636636636636637}, {"micro_f1_no_misc": 0.5692668927732847, "micro_f1": 0.379905808477237}, {"micro_f1_no_misc": 0.5653846153846155, "micro_f1": 0.3366993652625505}, {"micro_f1_no_misc": 0.4981768459434823, "micro_f1": 0.38622060284862536}, {"micro_f1_no_misc": 0.5501840490797546, "micro_f1": 0.3831050981686191}, {"micro_f1_no_misc": 0.4813214739517153, "micro_f1": 0.29701446598953524}, {"micro_f1_no_misc": 0.5720450731239511, "micro_f1": 0.42854619325207555}, {"micro_f1_no_misc": 0.5804480651731162, "micro_f1": 0.3848357203032856}]}, "total": {"test_micro_f1_no_misc": 54.14204963632084, "test_micro_f1_no_misc_se": 2.489781292179128, "test_micro_f1": 37.42699094631128, "test_micro_f1_se": 3.0537358996301927}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "swedn", "task": "summarization", "dataset_languages": ["sv"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"bertscore": 0.6627534071740229, "rouge_l": 0.20317208227398864}, {"bertscore": 0.658439153645304, "rouge_l": 0.19790560248913974}, {"bertscore": 0.6590590397827327, "rouge_l": 0.19347183441259624}, {"bertscore": 0.6524240077851573, "rouge_l": 0.1782341450978794}, {"bertscore": 0.6585789689124795, "rouge_l": 0.1996361457987636}, {"bertscore": 0.6554302386648487, "rouge_l": 0.19676474373416797}, {"bertscore": 0.665099200385157, "rouge_l": 0.20810799090442883}, {"bertscore": 0.6622595301887486, "rouge_l": 0.19961580878572016}, {"bertscore": 0.6609339708520565, "rouge_l": 0.2057033708475867}, {"bertscore": 0.6505525434622541, "rouge_l": 0.18395724358492804}]}, "total": {"test_bertscore": 65.85530060852761, "test_bertscore_se": 0.2859008975801646, "test_rouge_l": 19.665689679291994, "test_rouge_l_se": 0.5790168156563257}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "swerec", "task": "sentiment-classification", "dataset_languages": ["sv"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.7874799971676427, "macro_f1": 0.7820426032992566}, {"mcc": 0.7779920775651227, "macro_f1": 0.7807358069036794}, {"mcc": 0.8019993448142841, "macro_f1": 0.7935101982755698}, {"mcc": 0.7970700266763614, "macro_f1": 0.798701063708827}, {"mcc": 0.7284064801395786, "macro_f1": 0.7556312301162754}, {"mcc": 0.7954188192069412, "macro_f1": 0.7866951582574608}, {"mcc": 0.7998901646878526, "macro_f1": 0.7976792939027089}, {"mcc": 0.7787595856752327, "macro_f1": 0.7542422243717967}, {"mcc": 0.7821533802990992, "macro_f1": 0.7375495788478014}, {"mcc": 0.7779216729321043, "macro_f1": 0.778510712171696}]}, "total": {"test_mcc": 78.27091549164219, "test_mcc_se": 1.3176761453560841, "test_macro_f1": 77.65297869855071, "test_macro_f1_se": 1.2795135705934908}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "wiki-lingua-nl", "task": "summarization", "dataset_languages": ["nl"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"bertscore": 0.688731212168932, "rouge_l": 0.21437449286392402}, {"bertscore": 0.6715059995185584, "rouge_l": 0.1939344743949877}, {"bertscore": 0.6846702473412734, "rouge_l": 0.21024335581050702}, {"bertscore": 0.6925134391785832, "rouge_l": 0.22118068000316252}, {"bertscore": 0.6679572111897869, "rouge_l": 0.17592565325921714}, {"bertscore": 0.6942149706883356, "rouge_l": 0.22480584061938197}, {"bertscore": 0.6648867204494309, "rouge_l": 0.17831877824975012}, {"bertscore": 0.6877988064225065, "rouge_l": 0.2156633671581658}, {"bertscore": 0.6991395016229944, "rouge_l": 0.23455335083598816}, {"bertscore": 0.687776954495348, "rouge_l": 0.21420839751697002}]}, "total": {"test_bertscore": 68.39195063075749, "test_bertscore_se": 0.7264298376127906, "test_rouge_l": 20.83208390712055, "test_rouge_l_se": 1.2065871934760715}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
+{"dataset": "winogrande-is", "task": "common-sense-reasoning", "dataset_languages": ["is"], "model": "skole-gpt-mixtral", "results": {"raw": {"test": [{"mcc": 0.11548459676563645, "accuracy": 0.5658482142857143}, {"mcc": 0.027448538682394684, "accuracy": 0.515625}, {"mcc": 0.030835101403506253, "accuracy": 0.5200892857142857}, {"mcc": 0.10129032258064516, "accuracy": 0.5558035714285714}, {"mcc": 0.1688375682878172, "accuracy": 0.5691964285714286}, {"mcc": 0.1005926943227118, "accuracy": 0.5345982142857143}, {"mcc": 0.05732468878294589, "accuracy": 0.5234375}, {"mcc": 0.13548411225626797, "accuracy": 0.5647321428571429}, {"mcc": 0.10658265793691721, "accuracy": 0.5502232142857143}, {"mcc": 0.10139003293314827, "accuracy": 0.5446428571428571}]}, "total": {"test_mcc": 9.45270313951991, "test_mcc_se": 2.7605889823906904, "test_accuracy": 54.441964285714285, "test_accuracy_se": 1.2407837324986717}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
{"dataset": "angry-tweets", "task": "sentiment-classification", "dataset_languages": ["da"], "model": "skole-gpt", "results": {"raw": {"test": [{"mcc": 0.4848311582932243, "macro_f1": 0.6566593163040514}, {"mcc": 0.4710493564701632, "macro_f1": 0.6132470892695207}, {"mcc": 0.532960600282665, "macro_f1": 0.6912704374110179}, {"mcc": 0.4950512777185183, "macro_f1": 0.6489856313891399}, {"mcc": 0.5554322057292909, "macro_f1": 0.709002801597209}, {"mcc": 0.50897850677457, "macro_f1": 0.6543960657662273}, {"mcc": 0.5083409827574323, "macro_f1": 0.6669577383860424}, {"mcc": 0.5401651182991014, "macro_f1": 0.6759161828434398}, {"mcc": 0.532214370523262, "macro_f1": 0.6729219335771565}, {"mcc": 0.5223279828282013, "macro_f1": 0.6831793583901348}]}, "total": {"test_mcc": 51.51351559676429, "test_mcc_se": 1.634750622109416, "test_macro_f1": 66.72536554933939, "test_macro_f1_se": 1.6279237763749863}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
{"dataset": "arc-is", "task": "knowledge", "dataset_languages": ["is"], "model": "skole-gpt", "results": {"raw": {"test": [{"mcc": 0.1962774017513744, "accuracy": 0.3994140625}, {"mcc": 0.22466889536328083, "accuracy": 0.4208984375}, {"mcc": 0.20758627988326703, "accuracy": 0.4033203125}, {"mcc": 0.18157323284009355, "accuracy": 0.38671875}, {"mcc": 0.21452357169098776, "accuracy": 0.4072265625}, {"mcc": 0.2099694217993396, "accuracy": 0.404296875}, {"mcc": 0.16534692945691656, "accuracy": 0.373046875}, {"mcc": 0.14983971768525764, "accuracy": 0.361328125}, {"mcc": 0.17611908013378738, "accuracy": 0.3837890625}, {"mcc": 0.2013679634256645, "accuracy": 0.40234375}]}, "total": {"test_mcc": 19.27272494029969, "test_mcc_se": 1.473981813038057, "test_accuracy": 39.423828125, "test_accuracy_se": 1.1036285125395775}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}
{"dataset": "cnn-dailymail", "task": "summarization", "dataset_languages": ["en"], "model": "skole-gpt", "results": {"raw": {"test": [{"bertscore": 0.7062732756603509, "rouge_l": 0.2502204326450074}, {"bertscore": 0.7108152939326828, "rouge_l": 0.26626888332334875}, {"bertscore": 0.7132071297673974, "rouge_l": 0.26919351187868634}, {"bertscore": 0.704810529932729, "rouge_l": 0.23804942069986523}, {"bertscore": 0.7155684455210576, "rouge_l": 0.27392637002695397}, {"bertscore": 0.7092310815787641, "rouge_l": 0.24104671173510026}, {"bertscore": 0.7095974086550996, "rouge_l": 0.2660701758343997}, {"bertscore": 0.6961289528990164, "rouge_l": 0.25602328865955803}, {"bertscore": 0.7135379151150119, "rouge_l": 0.26556088674901934}, {"bertscore": 0.710403663906618, "rouge_l": 0.2617790456328498}]}, "total": {"test_bertscore": 70.89573696968728, "test_bertscore_se": 0.3443167803990276, "test_rouge_l": 25.88138727184789, "test_rouge_l_se": 0.7526499529153959}}, "num_model_parameters": -1, "max_sequence_length": 32768, "vocabulary_size": 32000, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "13.0.0"}