Skip to content
180 changes: 180 additions & 0 deletions examples/tasks/instruct_multilingual.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
extended|belebele_native_instruct_deu_Latn|0|0
extended|belebele_native_instruct_fra_Latn|0|0
extended|belebele_native_instruct_ita_Latn|0|0
extended|belebele_native_instruct_por_Latn|0|0
extended|belebele_native_instruct_spa_Latn|0|0
extended|belebele_en_instruct_acm_Arab|0|0
extended|belebele_en_instruct_arz_Arab|0|0
extended|belebele_en_instruct_ceb_Latn|0|0
extended|belebele_en_instruct_fin_Latn|0|0
extended|belebele_en_instruct_hin_Deva|0|0
extended|belebele_en_instruct_ita_Latn|0|0
extended|belebele_en_instruct_khm_Khmr|0|0
extended|belebele_en_instruct_lvs_Latn|0|0
extended|belebele_en_instruct_npi_Deva|0|0
extended|belebele_en_instruct_pol_Latn|0|0
extended|belebele_en_instruct_slv_Latn|0|0
extended|belebele_en_instruct_swe_Latn|0|0
extended|belebele_en_instruct_afr_Latn|0|0
extended|belebele_en_instruct_asm_Beng|0|0
extended|belebele_en_instruct_ces_Latn|0|0
extended|belebele_en_instruct_fra_Latn|0|0
extended|belebele_en_instruct_hin_Latn|0|0
extended|belebele_en_instruct_jav_Latn|0|0
extended|belebele_en_instruct_mal_Mlym|0|0
extended|belebele_en_instruct_npi_Latn|0|0
extended|belebele_en_instruct_por_Latn|0|0
extended|belebele_en_instruct_swh_Latn|0|0
extended|belebele_en_instruct_tur_Latn|0|0
extended|belebele_en_instruct_yor_Latn|0|0
extended|belebele_en_instruct_als_Latn|0|0
extended|belebele_en_instruct_azj_Latn|0|0
extended|belebele_en_instruct_ckb_Arab|0|0
extended|belebele_en_instruct_hrv_Latn|0|0
extended|belebele_en_instruct_jpn_Jpan|0|0
extended|belebele_en_instruct_kir_Cyrl|0|0
extended|belebele_en_instruct_mar_Deva|0|0
extended|belebele_en_instruct_snd_Arab|0|0
extended|belebele_en_instruct_tam_Taml|0|0
extended|belebele_en_instruct_ukr_Cyrl|0|0
extended|belebele_en_instruct_zho_Hans|0|0
extended|belebele_en_instruct_amh_Ethi|0|0
extended|belebele_en_instruct_dan_Latn|0|0
extended|belebele_en_instruct_hun_Latn|0|0
extended|belebele_en_instruct_kor_Hang|0|0
extended|belebele_en_instruct_mkd_Cyrl|0|0
extended|belebele_en_instruct_ron_Latn|0|0
extended|belebele_en_instruct_som_Latn|0|0
extended|belebele_en_instruct_tel_Telu|0|0
extended|belebele_en_instruct_urd_Arab|0|0
extended|belebele_en_instruct_zho_Hant|0|0
extended|belebele_en_instruct_apc_Arab|0|0
extended|belebele_en_instruct_ben_Beng|0|0
extended|belebele_en_instruct_deu_Latn|0|0
extended|belebele_en_instruct_hye_Armn|0|0
extended|belebele_en_instruct_kan_Knda|0|0
extended|belebele_en_instruct_lao_Laoo|0|0
extended|belebele_en_instruct_mlt_Latn|0|0
extended|belebele_en_instruct_ory_Orya|0|0
extended|belebele_en_instruct_rus_Cyrl|0|0
extended|belebele_en_instruct_tgk_Cyrl|0|0
extended|belebele_en_instruct_urd_Latn|0|0
extended|belebele_en_instruct_zsm_Latn|0|0
extended|belebele_en_instruct_arb_Arab|0|0
extended|belebele_en_instruct_ben_Latn|0|0
extended|belebele_en_instruct_ell_Grek|0|0
extended|belebele_en_instruct_guj_Gujr|0|0
extended|belebele_en_instruct_kat_Geor|0|0
extended|belebele_en_instruct_pan_Guru|0|0
extended|belebele_en_instruct_spa_Latn|0|0
extended|belebele_en_instruct_tgl_Latn|0|0
extended|belebele_en_instruct_uzn_Latn|0|0
extended|belebele_en_instruct_arb_Latn|0|0
extended|belebele_en_instruct_eng_Latn|0|0
extended|belebele_en_instruct_kaz_Cyrl|0|0
extended|belebele_en_instruct_lit_Latn|0|0
extended|belebele_en_instruct_mya_Mymr|0|0
extended|belebele_en_instruct_pbt_Arab|0|0
extended|belebele_en_instruct_sin_Latn|0|0
extended|belebele_en_instruct_srp_Cyrl|0|0
extended|belebele_en_instruct_tha_Thai|0|0
extended|belebele_en_instruct_vie_Latn|0|0
extended|belebele_en_instruct_ars_Arab|0|0
extended|belebele_en_instruct_bul_Cyrl|0|0
extended|belebele_en_instruct_est_Latn|0|0
extended|belebele_en_instruct_ind_Latn|0|0
extended|belebele_en_instruct_nld_Latn|0|0
extended|belebele_en_instruct_pes_Arab|0|0
extended|belebele_en_instruct_sin_Sinh|0|0
extended|belebele_en_instruct_war_Latn|0|0
extended|belebele_en_instruct_ary_Arab|0|0
extended|belebele_en_instruct_cat_Latn|0|0
extended|belebele_en_instruct_eus_Latn|0|0
extended|belebele_en_instruct_heb_Hebr|0|0
extended|belebele_en_instruct_isl_Latn|0|0
extended|belebele_en_instruct_nob_Latn|0|0
extended|belebele_en_instruct_plt_Latn|0|0
extended|belebele_en_instruct_slk_Latn|0|0
extended|global_mmlu_instruct_amh|0|0
extended|global_mmlu_instruct_ara|0|0
extended|global_mmlu_instruct_ben|0|0
extended|global_mmlu_instruct_ces|0|0
extended|global_mmlu_instruct_deu|0|0
extended|global_mmlu_instruct_ell|0|0
extended|global_mmlu_instruct_eng|0|0
extended|global_mmlu_instruct_spa|0|0
extended|global_mmlu_instruct_fas|0|0
extended|global_mmlu_instruct_fra|0|0
extended|global_mmlu_instruct_hau|0|0
extended|global_mmlu_instruct_heb|0|0
extended|global_mmlu_instruct_hin|0|0
extended|global_mmlu_instruct_ind|0|0
extended|global_mmlu_instruct_ibo|0|0
extended|global_mmlu_instruct_ita|0|0
extended|global_mmlu_instruct_jpn|0|0
extended|global_mmlu_instruct_kor|0|0
extended|global_mmlu_instruct_kir|0|0
extended|global_mmlu_instruct_lit|0|0
extended|global_mmlu_instruct_mlg|0|0
extended|global_mmlu_instruct_msa|0|0
extended|global_mmlu_instruct_nep|0|0
extended|global_mmlu_instruct_nld|0|0
extended|global_mmlu_instruct_nor|0|0
extended|global_mmlu_instruct_pol|0|0
extended|global_mmlu_instruct_por|0|0
extended|global_mmlu_instruct_ron|0|0
extended|global_mmlu_instruct_rus|0|0
extended|global_mmlu_instruct_sin|0|0
extended|global_mmlu_instruct_sna|0|0
extended|global_mmlu_instruct_som|0|0
extended|global_mmlu_instruct_srp|0|0
extended|global_mmlu_instruct_swe|0|0
extended|global_mmlu_instruct_swa|0|0
extended|global_mmlu_instruct_tel|0|0
extended|global_mmlu_instruct_tur|0|0
extended|global_mmlu_instruct_ukr|0|0
extended|global_mmlu_instruct_vie|0|0
extended|global_mmlu_instruct_yor|0|0
extended|global_mmlu_instruct_zho|0|0
extended|global_mmlu_lite_instruct_amh|0|0
extended|global_mmlu_lite_instruct_ara|0|0
extended|global_mmlu_lite_instruct_ben|0|0
extended|global_mmlu_lite_instruct_ces|0|0
extended|global_mmlu_lite_instruct_deu|0|0
extended|global_mmlu_lite_instruct_ell|0|0
extended|global_mmlu_lite_instruct_eng|0|0
extended|global_mmlu_lite_instruct_spa|0|0
extended|global_mmlu_lite_instruct_fas|0|0
extended|global_mmlu_lite_instruct_fra|0|0
extended|global_mmlu_lite_instruct_hau|0|0
extended|global_mmlu_lite_instruct_heb|0|0
extended|global_mmlu_lite_instruct_hin|0|0
extended|global_mmlu_lite_instruct_ind|0|0
extended|global_mmlu_lite_instruct_ibo|0|0
extended|global_mmlu_lite_instruct_ita|0|0
extended|global_mmlu_lite_instruct_jpn|0|0
extended|global_mmlu_lite_instruct_kor|0|0
extended|global_mmlu_lite_instruct_kir|0|0
extended|global_mmlu_lite_instruct_lit|0|0
extended|global_mmlu_lite_instruct_mlg|0|0
extended|global_mmlu_lite_instruct_msa|0|0
extended|global_mmlu_lite_instruct_nep|0|0
extended|global_mmlu_lite_instruct_nld|0|0
extended|global_mmlu_lite_instruct_nor|0|0
extended|global_mmlu_lite_instruct_pol|0|0
extended|global_mmlu_lite_instruct_por|0|0
extended|global_mmlu_lite_instruct_ron|0|0
extended|global_mmlu_lite_instruct_rus|0|0
extended|global_mmlu_lite_instruct_sin|0|0
extended|global_mmlu_lite_instruct_sna|0|0
extended|global_mmlu_lite_instruct_som|0|0
extended|global_mmlu_lite_instruct_srp|0|0
extended|global_mmlu_lite_instruct_swe|0|0
extended|global_mmlu_lite_instruct_swa|0|0
extended|global_mmlu_lite_instruct_tel|0|0
extended|global_mmlu_lite_instruct_tur|0|0
extended|global_mmlu_lite_instruct_ukr|0|0
extended|global_mmlu_lite_instruct_vie|0|0
extended|global_mmlu_lite_instruct_yor|0|0
extended|global_mmlu_lite_instruct_zho|0|0
extended|mmlu_pro|0|0
6 changes: 6 additions & 0 deletions examples/tasks/instruct_multilingual_test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
extended|global_mmlu_lite_instruct_deu|0|0
extended|global_mmlu_lite_instruct_eng|0|0
extended|global_mmlu_lite_instruct_spa|0|0
extended|global_mmlu_lite_instruct_fra|0|0
extended|global_mmlu_lite_instruct_ita|0|0
extended|global_mmlu_lite_instruct_por|0|0
5 changes: 3 additions & 2 deletions src/lighteval/metrics/utils/extractive_match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,12 @@ class IndicesExtractionConfig:

Attributes:
prefix_for_extraction (ChoicePrefix): The style to use for extracting choice indices (e.g. A,B,C or 1,2,3)
try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is"
try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is".
Recommended False for indices extraction, as some indices (for example `A` which is also a word) can lead to a lot of false positives.
"""

prefix_for_extraction: ChoicePrefix
try_extract_without_anchor: bool = True
try_extract_without_anchor: bool = False


ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | IndicesExtractionConfig
Expand Down
2 changes: 2 additions & 0 deletions src/lighteval/models/vllm/vllm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class VLLMModelConfig(ModelConfig):
max_num_batched_tokens: PositiveInt = 2048 # maximum number of tokens per batch
subfolder: str | None = None
is_async: bool = False # Whether to use the async version or sync version of the model
enforce_eager: bool = False


class VLLMModel(LightevalModel):
Expand Down Expand Up @@ -187,6 +188,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
"seed": int(config.seed),
"max_num_seqs": int(config.max_num_seqs),
"max_num_batched_tokens": int(config.max_num_batched_tokens),
"enforce_eager": bool(config.enforce_eager),
}

if config.quantization is not None:
Expand Down
15 changes: 12 additions & 3 deletions src/lighteval/tasks/default_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -899,15 +899,24 @@ def gpqa_instruct(line, task_name: str = None):
gold_index = random.randint(0, 3)
choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
choices.insert(gold_index, line["Correct Answer"])
query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"])
instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
query = query_template.format(
# Stripping to avoid accidental extra whitespaces, present in GPQA
A=choices[0].strip(),
B=choices[1].strip(),
C=choices[2].strip(),
D=choices[3].strip(),
Question=line["Question"].strip(),
Instruction=instruction,
)

return Doc(
task_name=task_name,
query=query,
choices=LETTER_INDICES[: len(choices)],
gold_index=gold_index,
instruction=query,
instruction=instruction,
)


Expand Down
11 changes: 7 additions & 4 deletions src/lighteval/tasks/extended/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import lighteval.tasks.extended.instruct.belebele as belebele
import lighteval.tasks.extended.instruct.global_mmlu as global_mmlu
import lighteval.tasks.extended.instruct.mgsm as mgsm
import lighteval.tasks.extended.instruct.mmlu_pro as mmlu_pro
from lighteval.utils.imports import can_load_extended_tasks


AVAILABLE_EXTENDED_TASKS_MODULES = [belebele, mmlu_pro, mgsm, global_mmlu]

if can_load_extended_tasks():
import lighteval.tasks.extended.hle.main as hle
import lighteval.tasks.extended.ifeval.main as ifeval
Expand All @@ -32,7 +38,4 @@
import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]

else:
AVAILABLE_EXTENDED_TASKS_MODULES = []
AVAILABLE_EXTENDED_TASKS_MODULES.extend([ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb])
Loading
Loading