[
  {
    "name": "dm_val",
    "task_type": "math",
    "dataset_path": "./data/dm_val.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": -1,
    "max_gen_len": 2048,
    "temperature": 0.7,
    "top_p": 0.8,
    "num_samples_per_prompt": 1
  },
  {
    "name": "aime24",
    "task_type": "math",
    "dataset_path": "./data/aime24.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": -1,
    "max_gen_len": 4096,
    "temperature": 0.7,
    "top_p": 0.8,
    "num_samples_per_prompt": 1
  },
  {
    "name": "aime25",
    "task_type": "math",
    "dataset_path": "./data/aime25.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": -1,
    "max_gen_len": 4096,
    "temperature": 0.7,
    "top_p": 0.8,
    "num_samples_per_prompt": 1
  },
  {
    "name": "amc23",
    "task_type": "math",
    "dataset_path": "./data/amc23.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": -1,
    "max_gen_len": 2048,
    "temperature": 0.7,
    "top_p": 0.8,
    "num_samples_per_prompt": 1
  },
  {
    "name": "math500",
    "task_type": "math",
    "dataset_path": "./data/math500.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": 500,
    "max_gen_len": 2048,
    "temperature": 0.7,
    "top_p": 0.8,
    "num_samples_per_prompt": 1
  },
  {
    "name": "gsm8k",
    "task_type": "math",
    "dataset_path": "./data/gsm8k.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": 500,
    "max_gen_len": 1024,
    "temperature": 0.7,
    "top_p": 0.8,
    "num_samples_per_prompt": 1
  },
  {
    "name": "mmlu_stem",
    "task_type": "qa",
    "dataset_path": "./data/mmlu_stem.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": 500,
    "max_gen_len": 512,
    "temperature": 0.3,
    "top_p": 0.9,
    "num_samples_per_prompt": 1
  },
  {
    "name": "humaneval",
    "task_type": "code",
    "dataset_path": "./data/humaneval.json",
    "is_verifiable": true,
    "metric_type": "accuracy",
    "num_samples": 164,
    "max_gen_len": 1024,
    "temperature": 0.2,
    "top_p": 0.95,
    "num_samples_per_prompt": 1
  }
]