{ "meta": { "n_questions": 100, "total_configs": 100, "unique_llm_calls": 1666, "faiss_llm_calls": 100, "total_time_s": 5113.4 }, "faiss_baseline": { "em": 0.32, "f1": 0.4381 }, "grid_results": [ { "beta": 5.0, "lambda": 0.7, "max_iter": 1, "em": 0.36, "f1": 0.4809, "avg_faiss_overlap": 0.902, "avg_entropy": 7.1163 }, { "beta": 5.0, "lambda": 0.9, "max_iter": 3, "em": 0.36, "f1": 0.4809, "avg_faiss_overlap": 0.912, "avg_entropy": 7.1122 }, { "beta": 10.0, "lambda": 0.7, "max_iter": 1, "em": 0.36, "f1": 0.4809, "avg_faiss_overlap": 0.9, "avg_entropy": 6.8422 }, { "beta": 10.0, "lambda": 0.95, "max_iter": 8, "em": 0.36, "f1": 0.4797, "avg_faiss_overlap": 0.886, "avg_entropy": 6.8941 }, { "beta": 5.0, "lambda": 0.95, "max_iter": 8, "em": 0.35, "f1": 0.4697, "avg_faiss_overlap": 0.886, "avg_entropy": 7.1219 }, { "beta": 5.0, "lambda": 0.95, "max_iter": 3, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.956, "avg_entropy": 7.09 }, { "beta": 5.0, "lambda": 0.95, "max_iter": 5, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.928, "avg_entropy": 7.105 }, { "beta": 10.0, "lambda": 0.9, "max_iter": 1, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.966, "avg_entropy": 6.6138 }, { "beta": 10.0, "lambda": 0.95, "max_iter": 3, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.956, "avg_entropy": 6.6763 }, { "beta": 5.0, "lambda": 0.9, "max_iter": 1, "em": 0.34, "f1": 0.4672, "avg_faiss_overlap": 0.97, "avg_entropy": 7.0815 }, { "beta": 10.0, "lambda": 0.9, "max_iter": 5, "em": 0.34, "f1": 0.4637, "avg_faiss_overlap": 0.856, "avg_entropy": 6.9499 }, { "beta": 5.0, "lambda": 0.5, "max_iter": 1, "em": 0.33, "f1": 0.4627, "avg_faiss_overlap": 0.786, "avg_entropy": 7.1407 }, { "beta": 5.0, "lambda": 0.9, "max_iter": 8, "em": 0.34, "f1": 0.4625, "avg_faiss_overlap": 0.724, "avg_entropy": 7.1479 }, { "beta": 10.0, "lambda": 0.5, "max_iter": 1, "em": 0.33, "f1": 0.4622, "avg_faiss_overlap": 0.808, "avg_entropy": 6.9842 }, { "beta": 100.0, "lambda": 0.5, "max_iter": 1, "em": 0.36, "f1": 0.4608, "avg_faiss_overlap": 0.722, "avg_entropy": 0.0442 }, { "beta": 10.0, "lambda": 0.9, "max_iter": 8, "em": 0.33, "f1": 0.46, "avg_faiss_overlap": 0.744, "avg_entropy": 7.0404 }, { "beta": 5.0, "lambda": 0.8, "max_iter": 1, "em": 0.34, "f1": 0.4592, "avg_faiss_overlap": 0.944, "avg_entropy": 7.1003 }, { "beta": 10.0, "lambda": 0.8, "max_iter": 1, "em": 0.34, "f1": 0.4592, "avg_faiss_overlap": 0.938, "avg_entropy": 6.7407 }, { "beta": 10.0, "lambda": 0.95, "max_iter": 5, "em": 0.34, "f1": 0.4592, "avg_faiss_overlap": 0.928, "avg_entropy": 6.7819 }, { "beta": 5.0, "lambda": 0.95, "max_iter": 1, "em": 0.34, "f1": 0.4556, "avg_faiss_overlap": 0.984, "avg_entropy": 7.0709 }, { "beta": 10.0, "lambda": 0.95, "max_iter": 1, "em": 0.34, "f1": 0.4556, "avg_faiss_overlap": 0.984, "avg_entropy": 6.5398 }, { "beta": 20.0, "lambda": 0.9, "max_iter": 1, "em": 0.34, "f1": 0.4552, "avg_faiss_overlap": 0.966, "avg_entropy": 3.5526 }, { "beta": 20.0, "lambda": 0.5, "max_iter": 1, "em": 0.34, "f1": 0.4548, "avg_faiss_overlap": 0.796, "avg_entropy": 4.0138 }, { "beta": 5.0, "lambda": 0.8, "max_iter": 3, "em": 0.32, "f1": 0.4527, "avg_faiss_overlap": 0.8, "avg_entropy": 7.14 }, { "beta": 10.0, "lambda": 0.8, "max_iter": 3, "em": 0.32, "f1": 0.4527, "avg_faiss_overlap": 0.8, "avg_entropy": 6.9964 }, { "beta": 50.0, "lambda": 0.8, "max_iter": 3, "em": 0.34, "f1": 0.4524, "avg_faiss_overlap": 0.74, "avg_entropy": 0.1516 }, { "beta": 20.0, "lambda": 0.95, "max_iter": 1, "em": 0.34, "f1": 0.4511, "avg_faiss_overlap": 0.98, "avg_entropy": 3.5011 }, { "beta": 10.0, "lambda": 0.9, "max_iter": 3, "em": 0.33, "f1": 0.4509, "avg_faiss_overlap": 0.918, "avg_entropy": 6.828 }, { "beta": 50.0, "lambda": 0.7, "max_iter": 5, "em": 0.34, "f1": 0.4509, "avg_faiss_overlap": 0.492, "avg_entropy": 0.0065 }, { "beta": 50.0, "lambda": 0.5, "max_iter": 3, "em": 0.35, "f1": 0.4501, "avg_faiss_overlap": 0.472, "avg_entropy": 0.0137 }, { "beta": 20.0, "lambda": 0.8, "max_iter": 3, "em": 0.33, "f1": 0.4498, "avg_faiss_overlap": 0.798, "avg_entropy": 4.0296 }, { "beta": 50.0, "lambda": 0.5, "max_iter": 1, "em": 0.34, "f1": 0.4496, "avg_faiss_overlap": 0.752, "avg_entropy": 0.3624 }, { "beta": 20.0, "lambda": 0.95, "max_iter": 3, "em": 0.32, "f1": 0.4494, "avg_faiss_overlap": 0.946, "avg_entropy": 3.5994 }, { "beta": 5.0, "lambda": 0.9, "max_iter": 5, "em": 0.32, "f1": 0.4487, "avg_faiss_overlap": 0.842, "avg_entropy": 7.1313 }, { "beta": 50.0, "lambda": 0.95, "max_iter": 1, "em": 0.34, "f1": 0.4486, "avg_faiss_overlap": 0.974, "avg_entropy": 0.677 }, { "beta": 100.0, "lambda": 0.95, "max_iter": 1, "em": 0.34, "f1": 0.4464, "avg_faiss_overlap": 0.97, "avg_entropy": 0.2081 }, { "beta": 50.0, "lambda": 0.8, "max_iter": 8, "em": 0.33, "f1": 0.4459, "avg_faiss_overlap": 0.482, "avg_entropy": 0.001 }, { "beta": 100.0, "lambda": 0.8, "max_iter": 3, "em": 0.34, "f1": 0.4458, "avg_faiss_overlap": 0.704, "avg_entropy": 0.0034 }, { "beta": 100.0, "lambda": 0.8, "max_iter": 1, "em": 0.33, "f1": 0.4441, "avg_faiss_overlap": 0.878, "avg_entropy": 0.0927 }, { "beta": 20.0, "lambda": 0.7, "max_iter": 3, "em": 0.33, "f1": 0.4433, "avg_faiss_overlap": 0.676, "avg_entropy": 4.2538 }, { "beta": 50.0, "lambda": 0.95, "max_iter": 8, "em": 0.32, "f1": 0.4431, "avg_faiss_overlap": 0.808, "avg_entropy": 0.2083 }, { "beta": 100.0, "lambda": 0.95, "max_iter": 8, "em": 0.32, "f1": 0.4431, "avg_faiss_overlap": 0.794, "avg_entropy": 0.0019 }, { "beta": 50.0, "lambda": 0.8, "max_iter": 1, "em": 0.32, "f1": 0.443, "avg_faiss_overlap": 0.894, "avg_entropy": 0.5053 }, { "beta": 100.0, "lambda": 0.95, "max_iter": 3, "em": 0.33, "f1": 0.4427, "avg_faiss_overlap": 0.9, "avg_entropy": 0.0754 }, { "beta": 10.0, "lambda": 0.7, "max_iter": 3, "em": 0.31, "f1": 0.4426, "avg_faiss_overlap": 0.654, "avg_entropy": 7.0699 }, { "beta": 20.0, "lambda": 0.95, "max_iter": 8, "em": 0.33, "f1": 0.442, "avg_faiss_overlap": 0.878, "avg_entropy": 3.8365 }, { "beta": 50.0, "lambda": 0.9, "max_iter": 1, "em": 0.32, "f1": 0.4419, "avg_faiss_overlap": 0.946, "avg_entropy": 0.6043 }, { "beta": 50.0, "lambda": 0.7, "max_iter": 1, "em": 0.32, "f1": 0.4415, "avg_faiss_overlap": 0.836, "avg_entropy": 0.4413 }, { "beta": 100.0, "lambda": 0.7, "max_iter": 1, "em": 0.32, "f1": 0.4415, "avg_faiss_overlap": 0.824, "avg_entropy": 0.069 }, { "beta": 20.0, "lambda": 0.7, "max_iter": 1, "em": 0.32, "f1": 0.4413, "avg_faiss_overlap": 0.888, "avg_entropy": 3.7761 }, { "beta": 50.0, "lambda": 0.95, "max_iter": 5, "em": 0.32, "f1": 0.4407, "avg_faiss_overlap": 0.87, "avg_entropy": 0.3877 }, { "beta": 50.0, "lambda": 0.9, "max_iter": 8, "em": 0.34, "f1": 0.4402, "avg_faiss_overlap": 0.666, "avg_entropy": 0.0306 }, { "beta": 100.0, "lambda": 0.9, "max_iter": 1, "em": 0.32, "f1": 0.4397, "avg_faiss_overlap": 0.932, "avg_entropy": 0.1471 }, { "beta": 100.0, "lambda": 0.95, "max_iter": 5, "em": 0.32, "f1": 0.4391, "avg_faiss_overlap": 0.858, "avg_entropy": 0.019 }, { "beta": 50.0, "lambda": 0.8, "max_iter": 5, "em": 0.32, "f1": 0.439, "avg_faiss_overlap": 0.592, "avg_entropy": 0.024 }, { "beta": 50.0, "lambda": 0.5, "max_iter": 8, "em": 0.33, "f1": 0.4381, "avg_faiss_overlap": 0.41, "avg_entropy": 0.0 }, { "beta": 50.0, "lambda": 0.95, "max_iter": 3, "em": 0.32, "f1": 0.4377, "avg_faiss_overlap": 0.912, "avg_entropy": 0.5215 }, { "beta": 20.0, "lambda": 0.9, "max_iter": 5, "em": 0.32, "f1": 0.437, "avg_faiss_overlap": 0.846, "avg_entropy": 3.9311 }, { "beta": 100.0, "lambda": 0.7, "max_iter": 5, "em": 0.32, "f1": 0.4359, "avg_faiss_overlap": 0.474, "avg_entropy": 0.0 }, { "beta": 100.0, "lambda": 0.8, "max_iter": 8, "em": 0.32, "f1": 0.4359, "avg_faiss_overlap": 0.472, "avg_entropy": 0.0 }, { "beta": 20.0, "lambda": 0.8, "max_iter": 5, "em": 0.32, "f1": 0.4356, "avg_faiss_overlap": 0.658, "avg_entropy": 4.2886 }, { "beta": 100.0, "lambda": 0.5, "max_iter": 3, "em": 0.33, "f1": 0.4351, "avg_faiss_overlap": 0.456, "avg_entropy": 0.0 }, { "beta": 20.0, "lambda": 0.8, "max_iter": 1, "em": 0.31, "f1": 0.4349, "avg_faiss_overlap": 0.924, "avg_entropy": 3.6613 }, { "beta": 50.0, "lambda": 0.9, "max_iter": 3, "em": 0.31, "f1": 0.4344, "avg_faiss_overlap": 0.842, "avg_entropy": 0.3574 }, { "beta": 20.0, "lambda": 0.9, "max_iter": 3, "em": 0.31, "f1": 0.4313, "avg_faiss_overlap": 0.9, "avg_entropy": 3.7492 }, { "beta": 20.0, "lambda": 0.9, "max_iter": 8, "em": 0.31, "f1": 0.4302, "avg_faiss_overlap": 0.748, "avg_entropy": 4.1579 }, { "beta": 20.0, "lambda": 0.95, "max_iter": 5, "em": 0.31, "f1": 0.4299, "avg_faiss_overlap": 0.91, "avg_entropy": 3.6963 }, { "beta": 50.0, "lambda": 0.7, "max_iter": 3, "em": 0.32, "f1": 0.4299, "avg_faiss_overlap": 0.614, "avg_entropy": 0.0708 }, { "beta": 50.0, "lambda": 0.9, "max_iter": 5, "em": 0.32, "f1": 0.4291, "avg_faiss_overlap": 0.778, "avg_entropy": 0.1599 }, { "beta": 100.0, "lambda": 0.5, "max_iter": 8, "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.408, "avg_entropy": 0.0 }, { "beta": 50.0, "lambda": 0.7, "max_iter": 8, "em": 0.33, "f1": 0.4257, "avg_faiss_overlap": 0.426, "avg_entropy": 0.0 }, { "beta": 10.0, "lambda": 0.8, "max_iter": 5, "em": 0.29, "f1": 0.4249, "avg_faiss_overlap": 0.632, "avg_entropy": 7.0765 }, { "beta": 100.0, "lambda": 0.9, "max_iter": 3, "em": 0.29, "f1": 0.4244, "avg_faiss_overlap": 0.828, "avg_entropy": 0.019 }, { "beta": 50.0, "lambda": 0.5, "max_iter": 5, "em": 0.32, "f1": 0.4225, "avg_faiss_overlap": 0.42, "avg_entropy": 0.0 }, { "beta": 100.0, "lambda": 0.8, "max_iter": 5, "em": 0.31, "f1": 0.4165, "avg_faiss_overlap": 0.562, "avg_entropy": 0.0 }, { "beta": 100.0, "lambda": 0.7, "max_iter": 8, "em": 0.32, "f1": 0.4157, "avg_faiss_overlap": 0.422, "avg_entropy": 0.0 }, { "beta": 100.0, "lambda": 0.9, "max_iter": 8, "em": 0.32, "f1": 0.4152, "avg_faiss_overlap": 0.636, "avg_entropy": 0.0 }, { "beta": 100.0, "lambda": 0.9, "max_iter": 5, "em": 0.3, "f1": 0.4141, "avg_faiss_overlap": 0.764, "avg_entropy": 0.0014 }, { "beta": 20.0, "lambda": 0.7, "max_iter": 5, "em": 0.3, "f1": 0.4127, "avg_faiss_overlap": 0.498, "avg_entropy": 4.4813 }, { "beta": 100.0, "lambda": 0.5, "max_iter": 5, "em": 0.31, "f1": 0.4125, "avg_faiss_overlap": 0.416, "avg_entropy": 0.0 }, { "beta": 5.0, "lambda": 0.8, "max_iter": 5, "em": 0.27, "f1": 0.4054, "avg_faiss_overlap": 0.614, "avg_entropy": 7.1555 }, { "beta": 5.0, "lambda": 0.7, "max_iter": 3, "em": 0.27, "f1": 0.4051, "avg_faiss_overlap": 0.64, "avg_entropy": 7.1544 }, { "beta": 100.0, "lambda": 0.7, "max_iter": 3, "em": 0.3, "f1": 0.4049, "avg_faiss_overlap": 0.582, "avg_entropy": 0.0002 }, { "beta": 20.0, "lambda": 0.7, "max_iter": 8, "em": 0.31, "f1": 0.3988, "avg_faiss_overlap": 0.28, "avg_entropy": 4.5698 }, { "beta": 20.0, "lambda": 0.5, "max_iter": 5, "em": 0.32, "f1": 0.3946, "avg_faiss_overlap": 0.258, "avg_entropy": 4.6045 }, { "beta": 20.0, "lambda": 0.8, "max_iter": 8, "em": 0.28, "f1": 0.3927, "avg_faiss_overlap": 0.48, "avg_entropy": 4.4867 }, { "beta": 20.0, "lambda": 0.5, "max_iter": 3, "em": 0.29, "f1": 0.3925, "avg_faiss_overlap": 0.452, "avg_entropy": 4.5183 }, { "beta": 10.0, "lambda": 0.7, "max_iter": 5, "em": 0.3, "f1": 0.379, "avg_faiss_overlap": 0.334, "avg_entropy": 7.112 }, { "beta": 10.0, "lambda": 0.8, "max_iter": 8, "em": 0.28, "f1": 0.3659, "avg_faiss_overlap": 0.318, "avg_entropy": 7.1124 }, { "beta": 10.0, "lambda": 0.5, "max_iter": 3, "em": 0.28, "f1": 0.353, "avg_faiss_overlap": 0.222, "avg_entropy": 7.1169 }, { "beta": 5.0, "lambda": 0.5, "max_iter": 3, "em": 0.27, "f1": 0.3436, "avg_faiss_overlap": 0.156, "avg_entropy": 7.1645 }, { "beta": 20.0, "lambda": 0.5, "max_iter": 8, "em": 0.25, "f1": 0.3298, "avg_faiss_overlap": 0.186, "avg_entropy": 4.565 }, { "beta": 5.0, "lambda": 0.7, "max_iter": 5, "em": 0.24, "f1": 0.3274, "avg_faiss_overlap": 0.278, "avg_entropy": 7.1633 }, { "beta": 5.0, "lambda": 0.8, "max_iter": 8, "em": 0.24, "f1": 0.3274, "avg_faiss_overlap": 0.276, "avg_entropy": 7.1634 }, { "beta": 10.0, "lambda": 0.7, "max_iter": 8, "em": 0.15, "f1": 0.1995, "avg_faiss_overlap": 0.044, "avg_entropy": 7.123 }, { "beta": 10.0, "lambda": 0.5, "max_iter": 5, "em": 0.15, "f1": 0.1983, "avg_faiss_overlap": 0.022, "avg_entropy": 7.1239 }, { "beta": 5.0, "lambda": 0.5, "max_iter": 5, "em": 0.14, "f1": 0.1895, "avg_faiss_overlap": 0.018, "avg_entropy": 7.166 }, { "beta": 5.0, "lambda": 0.5, "max_iter": 8, "em": 0.14, "f1": 0.1877, "avg_faiss_overlap": 0.002, "avg_entropy": 7.1661 }, { "beta": 5.0, "lambda": 0.7, "max_iter": 8, "em": 0.12, "f1": 0.1824, "avg_faiss_overlap": 0.034, "avg_entropy": 7.1658 }, { "beta": 10.0, "lambda": 0.5, "max_iter": 8, "em": 0.12, "f1": 0.1766, "avg_faiss_overlap": 0.002, "avg_entropy": 7.1239 } ], "best_config": { "beta": 5.0, "lambda": 0.7, "max_iter": 1, "em": 0.36, "f1": 0.4809, "avg_faiss_overlap": 0.902, "avg_entropy": 7.1163 }, "top10": [ { "beta": 5.0, "lambda": 0.7, "max_iter": 1, "em": 0.36, "f1": 0.4809, "avg_faiss_overlap": 0.902, "avg_entropy": 7.1163 }, { "beta": 5.0, "lambda": 0.9, "max_iter": 3, "em": 0.36, "f1": 0.4809, "avg_faiss_overlap": 0.912, "avg_entropy": 7.1122 }, { "beta": 10.0, "lambda": 0.7, "max_iter": 1, "em": 0.36, "f1": 0.4809, "avg_faiss_overlap": 0.9, "avg_entropy": 6.8422 }, { "beta": 10.0, "lambda": 0.95, "max_iter": 8, "em": 0.36, "f1": 0.4797, "avg_faiss_overlap": 0.886, "avg_entropy": 6.8941 }, { "beta": 5.0, "lambda": 0.95, "max_iter": 8, "em": 0.35, "f1": 0.4697, "avg_faiss_overlap": 0.886, "avg_entropy": 7.1219 }, { "beta": 5.0, "lambda": 0.95, "max_iter": 3, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.956, "avg_entropy": 7.09 }, { "beta": 5.0, "lambda": 0.95, "max_iter": 5, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.928, "avg_entropy": 7.105 }, { "beta": 10.0, "lambda": 0.9, "max_iter": 1, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.966, "avg_entropy": 6.6138 }, { "beta": 10.0, "lambda": 0.95, "max_iter": 3, "em": 0.35, "f1": 0.4692, "avg_faiss_overlap": 0.956, "avg_entropy": 6.6763 }, { "beta": 5.0, "lambda": 0.9, "max_iter": 1, "em": 0.34, "f1": 0.4672, "avg_faiss_overlap": 0.97, "avg_entropy": 7.0815 } ] }