evaluations: arc_challenge_poly_pt_acc: 0.30085470085470084 arc_challenge_poly_pt_acc_norm: 0.3435897435897436 arc_challenge_poly_pt_acc_norm_stderr: 0.013889944781406437 arc_challenge_poly_pt_acc_stderr: 0.01341389388061822 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.5024509803921569 assin2_rte_acc_stderr,all: 0.007142229345039623 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.34088811077510744 assin2_rte_f1_macro_stderr,all: 0.0036977314980645975 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 2.584869281045752 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.020246144461495013 assin2_sts_pearson_stderr,all: 0.012848574237206775 assin_entailment_acc: 0.62075 assin_entailment_acc_stderr: 0.007672651221656846 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.59675 assin_paraphrase_acc_stderr: 0.007757248423299025 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.22666666666666666 belebele_por_Latn_acc_norm: 0.22666666666666666 belebele_por_Latn_acc_norm_stderr: 0.013963598349030474 belebele_por_Latn_acc_stderr: 0.013963598349030474 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.2698191933240612 bluex_acc,exam_id__UNICAMP_2018: 0.3333333333333333 bluex_acc,exam_id__UNICAMP_2019: 0.3 bluex_acc,exam_id__UNICAMP_2020: 0.3090909090909091 bluex_acc,exam_id__UNICAMP_2021_1: 0.2391304347826087 bluex_acc,exam_id__UNICAMP_2021_2: 0.3333333333333333 bluex_acc,exam_id__UNICAMP_2022: 0.3076923076923077 bluex_acc,exam_id__UNICAMP_2023: 0.3023255813953488 bluex_acc,exam_id__UNICAMP_2024: 0.3333333333333333 bluex_acc,exam_id__USP_2018: 0.25925925925925924 bluex_acc,exam_id__USP_2019: 0.225 bluex_acc,exam_id__USP_2020: 0.17857142857142858 bluex_acc,exam_id__USP_2021: 0.25 bluex_acc,exam_id__USP_2022: 0.2653061224489796 bluex_acc,exam_id__USP_2023: 0.20454545454545456 bluex_acc,exam_id__USP_2024: 0.1951219512195122 bluex_acc_stderr,all: 0.009522448695577326 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03694964333964267 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.03747275630361617 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.035904653645853185 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.036254833462246665 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.038008168468976235 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.042742868943786344 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.040341644184605924 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04053327096189869 bluex_acc_stderr,exam_id__USP_2018: 0.0344262177569184 bluex_acc_stderr,exam_id__USP_2019: 0.038146963606575296 bluex_acc_stderr,exam_id__USP_2020: 0.029536187641256872 bluex_acc_stderr,exam_id__USP_2021: 0.03461792391082892 bluex_acc_stderr,exam_id__USP_2022: 0.036410693630710485 bluex_acc_stderr,exam_id__USP_2023: 0.03508480264125641 bluex_acc_stderr,exam_id__USP_2024: 0.03582337589286989 bluex_alias: bluex calame_pt_acc: 0.5789980732177264 calame_pt_acc_stderr: 0.010838559109941895 calame_pt_alias: calame_pt calame_pt_perplexity: 7.1322676051739915 calame_pt_perplexity_stderr: 0.41447318920967197 enem_challenge_acc,all: 0.198740377886634 enem_challenge_acc,exam_id__2009: 0.2 enem_challenge_acc,exam_id__2010: 0.21367521367521367 enem_challenge_acc,exam_id__2011: 0.2222222222222222 enem_challenge_acc,exam_id__2012: 0.2672413793103448 enem_challenge_acc,exam_id__2013: 0.21296296296296297 enem_challenge_acc,exam_id__2014: 0.1743119266055046 enem_challenge_acc,exam_id__2015: 0.16806722689075632 enem_challenge_acc,exam_id__2016: 0.18181818181818182 enem_challenge_acc,exam_id__2016_2: 0.2032520325203252 enem_challenge_acc,exam_id__2017: 0.21551724137931033 enem_challenge_acc,exam_id__2022: 0.19548872180451127 enem_challenge_acc,exam_id__2023: 0.14074074074074075 enem_challenge_acc_stderr,all: 0.00610558911077106 enem_challenge_acc_stderr,exam_id__2009: 0.02154694204401486 enem_challenge_acc_stderr,exam_id__2010: 0.021865515933679015 enem_challenge_acc_stderr,exam_id__2011: 0.022195336485659214 enem_challenge_acc_stderr,exam_id__2012: 0.02374992962172603 enem_challenge_acc_stderr,exam_id__2013: 0.02269408166705055 enem_challenge_acc_stderr,exam_id__2014: 0.020897673936382765 enem_challenge_acc_stderr,exam_id__2015: 0.019802918239207434 enem_challenge_acc_stderr,exam_id__2016: 0.020291042922884254 enem_challenge_acc_stderr,exam_id__2016_2: 0.020969856982989393 enem_challenge_acc_stderr,exam_id__2017: 0.02203067864108359 enem_challenge_acc_stderr,exam_id__2022: 0.019840331274268017 enem_challenge_acc_stderr,exam_id__2023: 0.017323886319225976 enem_challenge_alias: enem faquad_nli_acc,all: 0.7753846153846153 faquad_nli_acc_stderr,all: 0.011564640936900579 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.43674176776429807 faquad_nli_f1_macro_stderr,all: 0.0036705611888408394 global_piqa_completions_por_latn_braz_acc: 0.78 global_piqa_completions_por_latn_braz_acc_bytes: 0.76 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.04292346959909278 global_piqa_completions_por_latn_braz_acc_norm: 0.75 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.04351941398892446 global_piqa_completions_por_latn_braz_acc_stderr: 0.041633319989322654 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz hatebr_offensive_acc,all: 0.5457142857142857 hatebr_offensive_acc_stderr,all: 0.009408906567694409 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.4508029478016081 hatebr_offensive_f1_macro_stderr,all: 0.008913031329806937 hellaswag_poly_pt_acc: 0.361469281612309 hellaswag_poly_pt_acc_norm: 0.4601798678079965 hellaswag_poly_pt_acc_norm_stderr: 0.0051884131715642335 hellaswag_poly_pt_acc_stderr: 0.005001183649049966 hellaswag_poly_pt_alias: hellaswag_poly_pt lambada_poly_pt_acc: 0.3904521637880846 lambada_poly_pt_acc_stderr: 0.0067967279472032245 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 20.774403141688516 lambada_poly_pt_perplexity_stderr: 0.7231795341389268 mmlu_poly_pt_acc: 0.26208345842089464 mmlu_poly_pt_acc_stderr: 0.0038099775311724042 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.2610478359908884 oab_exams_acc,exam_id__2010-01: 0.23529411764705882 oab_exams_acc,exam_id__2010-02: 0.27 oab_exams_acc,exam_id__2011-03: 0.2727272727272727 oab_exams_acc,exam_id__2011-04: 0.275 oab_exams_acc,exam_id__2011-05: 0.2625 oab_exams_acc,exam_id__2012-06: 0.2625 oab_exams_acc,exam_id__2012-06a: 0.225 oab_exams_acc,exam_id__2012-07: 0.275 oab_exams_acc,exam_id__2012-08: 0.25 oab_exams_acc,exam_id__2012-09: 0.35064935064935066 oab_exams_acc,exam_id__2013-10: 0.275 oab_exams_acc,exam_id__2013-11: 0.2875 oab_exams_acc,exam_id__2013-12: 0.25 oab_exams_acc,exam_id__2014-13: 0.25 oab_exams_acc,exam_id__2014-14: 0.275 oab_exams_acc,exam_id__2014-15: 0.32051282051282054 oab_exams_acc,exam_id__2015-16: 0.1875 oab_exams_acc,exam_id__2015-17: 0.2948717948717949 oab_exams_acc,exam_id__2015-18: 0.175 oab_exams_acc,exam_id__2016-19: 0.24358974358974358 oab_exams_acc,exam_id__2016-20: 0.25 oab_exams_acc,exam_id__2016-20a: 0.2875 oab_exams_acc,exam_id__2016-21: 0.2625 oab_exams_acc,exam_id__2017-22: 0.25 oab_exams_acc,exam_id__2017-23: 0.2125 oab_exams_acc,exam_id__2017-24: 0.275 oab_exams_acc,exam_id__2018-25: 0.275 oab_exams_acc_stderr,all: 0.005404522993204322 oab_exams_acc_stderr,exam_id__2010-01: 0.026605455313616685 oab_exams_acc_stderr,exam_id__2010-02: 0.025613225623037045 oab_exams_acc_stderr,exam_id__2011-03: 0.025797748277668113 oab_exams_acc_stderr,exam_id__2011-04: 0.028905026152293026 oab_exams_acc_stderr,exam_id__2011-05: 0.0283315547675685 oab_exams_acc_stderr,exam_id__2012-06: 0.028401435522187126 oab_exams_acc_stderr,exam_id__2012-06a: 0.02692493002179001 oab_exams_acc_stderr,exam_id__2012-07: 0.028685143633187495 oab_exams_acc_stderr,exam_id__2012-08: 0.02792646945667252 oab_exams_acc_stderr,exam_id__2012-09: 0.03143226713819032 oab_exams_acc_stderr,exam_id__2013-10: 0.028749520581305216 oab_exams_acc_stderr,exam_id__2013-11: 0.029216602612826752 oab_exams_acc_stderr,exam_id__2013-12: 0.02799652135475067 oab_exams_acc_stderr,exam_id__2014-13: 0.027941364058848093 oab_exams_acc_stderr,exam_id__2014-14: 0.028700665742904995 oab_exams_acc_stderr,exam_id__2014-15: 0.0304707537714559 oab_exams_acc_stderr,exam_id__2015-16: 0.025187032117302378 oab_exams_acc_stderr,exam_id__2015-17: 0.0297749047613837 oab_exams_acc_stderr,exam_id__2015-18: 0.02449033420128714 oab_exams_acc_stderr,exam_id__2016-19: 0.028105853302671797 oab_exams_acc_stderr,exam_id__2016-20: 0.02785679771062364 oab_exams_acc_stderr,exam_id__2016-20a: 0.029237922790613942 oab_exams_acc_stderr,exam_id__2016-21: 0.028334176853691478 oab_exams_acc_stderr,exam_id__2017-22: 0.027945788844504417 oab_exams_acc_stderr,exam_id__2017-23: 0.026388686173944978 oab_exams_acc_stderr,exam_id__2017-24: 0.02882934165027393 oab_exams_acc_stderr,exam_id__2018-25: 0.02893736358130262 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.700352526439483 portuguese_hate_speech_acc_stderr,all: 0.011075616750882897 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.4118866620594333 portuguese_hate_speech_f1_macro_stderr,all: 0.0038311957825553846 tweetsentbr_acc,all: 0.3373134328358209 tweetsentbr_acc_stderr,all: 0.0074549865355581415 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.21837645526737784 tweetsentbr_f1_macro_stderr,all: 0.005256512662126685 step: 22000