{ "meta": { "n_questions": 100, "total_configs": 105, "unique_llm_calls": 1379, "total_time_s": 4571.0 }, "faiss_baseline": { "em": 0.32, "f1": 0.4381 }, "grid_results": [ { "config": "\u03b2=20.0_iter=1_standard", "em": 0.38, "f1": 0.4691, "avg_faiss_overlap": 0.48, "avg_entropy": 4.5305 }, { "config": "\u03b2=50.0_iter=1_standard", "em": 0.36, "f1": 0.4565, "avg_faiss_overlap": 0.508, "avg_entropy": 0.3196 }, { "config": "\u03b2=20.0_iter=1_residual_0.9", "em": 0.34, "f1": 0.4552, "avg_faiss_overlap": 0.966, "avg_entropy": 3.5526 }, { "config": "\u03b2=20.0_iter=2_residual_0.95", "em": 0.34, "f1": 0.4552, "avg_faiss_overlap": 0.966, "avg_entropy": 3.5503 }, { "config": "\u03b2=500.0_iter=1_residual_0.9", "em": 0.36, "f1": 0.4545, "avg_faiss_overlap": 0.692, "avg_entropy": 0.0074 }, { "config": "\u03b2=50.0_iter=1_normalized", "em": 0.37, "f1": 0.4539, "avg_faiss_overlap": 0.464, "avg_entropy": 1.7333 }, { "config": "\u03b2=500.0_iter=1_residual_0.95", "em": 0.36, "f1": 0.4536, "avg_faiss_overlap": 0.748, "avg_entropy": 0.013 }, { "config": "\u03b2=20.0_iter=1_residual_0.95", "em": 0.34, "f1": 0.4511, "avg_faiss_overlap": 0.98, "avg_entropy": 3.5011 }, { "config": "\u03b2=50.0_iter=2_normalized", "em": 0.37, "f1": 0.4498, "avg_faiss_overlap": 0.38, "avg_entropy": 1.0954 }, { "config": "\u03b2=20.0_iter=3_residual_0.95", "em": 0.32, "f1": 0.4494, "avg_faiss_overlap": 0.946, "avg_entropy": 3.5994 }, { "config": "\u03b2=50.0_iter=1_residual_0.95", "em": 0.34, "f1": 0.4486, "avg_faiss_overlap": 0.974, "avg_entropy": 0.677 }, { "config": "\u03b2=100.0_iter=1_residual_0.95", "em": 0.34, "f1": 0.4464, "avg_faiss_overlap": 0.97, "avg_entropy": 0.2081 }, { "config": "\u03b2=200.0_iter=1_residual_0.95", "em": 0.34, "f1": 0.4464, "avg_faiss_overlap": 0.968, "avg_entropy": 0.0722 }, { "config": "\u03b2=100.0_iter=1_normalized", "em": 0.35, "f1": 0.446, "avg_faiss_overlap": 0.508, "avg_entropy": 0.1139 }, { "config": "\u03b2=500.0_iter=2_residual_0.95", "em": 0.35, "f1": 0.4445, "avg_faiss_overlap": 0.692, "avg_entropy": 0.0037 }, { "config": "\u03b2=50.0_iter=2_residual_0.9", "em": 0.33, "f1": 0.4441, "avg_faiss_overlap": 0.886, "avg_entropy": 0.4749 }, { "config": "\u03b2=100.0_iter=2_residual_0.9", "em": 0.33, "f1": 0.4441, "avg_faiss_overlap": 0.878, "avg_entropy": 0.0623 }, { "config": "\u03b2=50.0_iter=8_residual_0.95", "em": 0.32, "f1": 0.4431, "avg_faiss_overlap": 0.808, "avg_entropy": 0.2083 }, { "config": "\u03b2=100.0_iter=8_residual_0.95", "em": 0.32, "f1": 0.4431, "avg_faiss_overlap": 0.794, "avg_entropy": 0.0019 }, { "config": "\u03b2=100.0_iter=3_residual_0.95", "em": 0.33, "f1": 0.4427, "avg_faiss_overlap": 0.9, "avg_entropy": 0.0754 }, { "config": "\u03b2=20.0_iter=8_residual_0.95", "em": 0.33, "f1": 0.442, "avg_faiss_overlap": 0.878, "avg_entropy": 3.8365 }, { "config": "\u03b2=50.0_iter=1_residual_0.9", "em": 0.32, "f1": 0.4419, "avg_faiss_overlap": 0.946, "avg_entropy": 0.6043 }, { "config": "\u03b2=50.0_iter=2_residual_0.95", "em": 0.32, "f1": 0.4419, "avg_faiss_overlap": 0.944, "avg_entropy": 0.5941 }, { "config": "\u03b2=50.0_iter=5_residual_0.95", "em": 0.32, "f1": 0.4407, "avg_faiss_overlap": 0.87, "avg_entropy": 0.3877 }, { "config": "\u03b2=200.0_iter=1_normalized", "em": 0.33, "f1": 0.4404, "avg_faiss_overlap": 0.484, "avg_entropy": 0.0109 }, { "config": "\u03b2=500.0_iter=5_residual_0.95", "em": 0.35, "f1": 0.4404, "avg_faiss_overlap": 0.546, "avg_entropy": 0.0 }, { "config": "\u03b2=50.0_iter=8_residual_0.9", "em": 0.34, "f1": 0.4402, "avg_faiss_overlap": 0.666, "avg_entropy": 0.0306 }, { "config": "\u03b2=100.0_iter=1_standard", "em": 0.33, "f1": 0.44, "avg_faiss_overlap": 0.464, "avg_entropy": 0.0196 }, { "config": "\u03b2=100.0_iter=1_residual_0.9", "em": 0.32, "f1": 0.4397, "avg_faiss_overlap": 0.932, "avg_entropy": 0.1471 }, { "config": "\u03b2=100.0_iter=2_residual_0.95", "em": 0.32, "f1": 0.4397, "avg_faiss_overlap": 0.932, "avg_entropy": 0.1268 }, { "config": "\u03b2=100.0_iter=5_residual_0.95", "em": 0.32, "f1": 0.4391, "avg_faiss_overlap": 0.858, "avg_entropy": 0.019 }, { "config": "\u03b2=20.0_iter=0_standard", "em": 0.32, "f1": 0.4381, "avg_faiss_overlap": 1.0, "avg_entropy": 3.452 }, { "config": "\u03b2=50.0_iter=0_standard", "em": 0.32, "f1": 0.4381, "avg_faiss_overlap": 1.0, "avg_entropy": 0.7723 }, { "config": "\u03b2=50.0_iter=5_standard", "em": 0.33, "f1": 0.4381, "avg_faiss_overlap": 0.408, "avg_entropy": 0.0 }, { "config": "\u03b2=50.0_iter=8_standard", "em": 0.33, "f1": 0.4381, "avg_faiss_overlap": 0.408, "avg_entropy": 0.0 }, { "config": "\u03b2=50.0_iter=8_normalized", "em": 0.34, "f1": 0.4381, "avg_faiss_overlap": 0.358, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=0_standard", "em": 0.32, "f1": 0.4381, "avg_faiss_overlap": 1.0, "avg_entropy": 0.3128 }, { "config": "\u03b2=100.0_iter=2_normalized", "em": 0.33, "f1": 0.4381, "avg_faiss_overlap": 0.422, "avg_entropy": 0.0143 }, { "config": "\u03b2=100.0_iter=3_normalized", "em": 0.33, "f1": 0.4381, "avg_faiss_overlap": 0.412, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=5_normalized", "em": 0.33, "f1": 0.4381, "avg_faiss_overlap": 0.41, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=8_normalized", "em": 0.33, "f1": 0.4381, "avg_faiss_overlap": 0.41, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=0_standard", "em": 0.32, "f1": 0.4381, "avg_faiss_overlap": 1.0, "avg_entropy": 0.1535 }, { "config": "\u03b2=50.0_iter=3_residual_0.95", "em": 0.32, "f1": 0.4377, "avg_faiss_overlap": 0.912, "avg_entropy": 0.5215 }, { "config": "\u03b2=20.0_iter=5_residual_0.9", "em": 0.32, "f1": 0.437, "avg_faiss_overlap": 0.846, "avg_entropy": 3.9311 }, { "config": "\u03b2=20.0_iter=2_residual_0.9", "em": 0.31, "f1": 0.4349, "avg_faiss_overlap": 0.926, "avg_entropy": 3.6521 }, { "config": "\u03b2=200.0_iter=1_residual_0.9", "em": 0.32, "f1": 0.4347, "avg_faiss_overlap": 0.928, "avg_entropy": 0.0466 }, { "config": "\u03b2=200.0_iter=2_residual_0.95", "em": 0.32, "f1": 0.4347, "avg_faiss_overlap": 0.928, "avg_entropy": 0.0274 }, { "config": "\u03b2=50.0_iter=3_residual_0.9", "em": 0.31, "f1": 0.4344, "avg_faiss_overlap": 0.842, "avg_entropy": 0.3574 }, { "config": "\u03b2=200.0_iter=2_residual_0.9", "em": 0.32, "f1": 0.4341, "avg_faiss_overlap": 0.876, "avg_entropy": 0.008 }, { "config": "\u03b2=200.0_iter=5_residual_0.95", "em": 0.31, "f1": 0.4341, "avg_faiss_overlap": 0.852, "avg_entropy": 0.0005 }, { "config": "\u03b2=200.0_iter=1_standard", "em": 0.33, "f1": 0.4331, "avg_faiss_overlap": 0.43, "avg_entropy": 0.0061 }, { "config": "\u03b2=200.0_iter=3_residual_0.95", "em": 0.32, "f1": 0.4327, "avg_faiss_overlap": 0.898, "avg_entropy": 0.0082 }, { "config": "\u03b2=20.0_iter=3_residual_0.9", "em": 0.31, "f1": 0.4313, "avg_faiss_overlap": 0.9, "avg_entropy": 3.7492 }, { "config": "\u03b2=500.0_iter=3_residual_0.95", "em": 0.35, "f1": 0.4312, "avg_faiss_overlap": 0.636, "avg_entropy": 0.0 }, { "config": "\u03b2=20.0_iter=8_residual_0.9", "em": 0.31, "f1": 0.4302, "avg_faiss_overlap": 0.748, "avg_entropy": 4.1579 }, { "config": "\u03b2=20.0_iter=5_residual_0.95", "em": 0.31, "f1": 0.4299, "avg_faiss_overlap": 0.91, "avg_entropy": 3.6963 }, { "config": "\u03b2=50.0_iter=3_normalized", "em": 0.33, "f1": 0.4291, "avg_faiss_overlap": 0.368, "avg_entropy": 0.6769 }, { "config": "\u03b2=50.0_iter=5_residual_0.9", "em": 0.32, "f1": 0.4291, "avg_faiss_overlap": 0.778, "avg_entropy": 0.1599 }, { "config": "\u03b2=50.0_iter=3_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.41, "avg_entropy": 0.0016 }, { "config": "\u03b2=50.0_iter=5_normalized", "em": 0.33, "f1": 0.4281, "avg_faiss_overlap": 0.356, "avg_entropy": 0.1417 }, { "config": "\u03b2=100.0_iter=3_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=5_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=8_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=2_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.408, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=2_normalized", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.408, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=3_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=3_normalized", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=5_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=5_normalized", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=8_standard", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=8_normalized", "em": 0.32, "f1": 0.4281, "avg_faiss_overlap": 0.406, "avg_entropy": 0.0 }, { "config": "\u03b2=50.0_iter=2_standard", "em": 0.32, "f1": 0.4265, "avg_faiss_overlap": 0.422, "avg_entropy": 0.0481 }, { "config": "\u03b2=500.0_iter=3_residual_0.9", "em": 0.33, "f1": 0.4265, "avg_faiss_overlap": 0.51, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=3_residual_0.9", "em": 0.29, "f1": 0.4244, "avg_faiss_overlap": 0.828, "avg_entropy": 0.019 }, { "config": "\u03b2=200.0_iter=3_residual_0.9", "em": 0.29, "f1": 0.4244, "avg_faiss_overlap": 0.824, "avg_entropy": 0.0021 }, { "config": "\u03b2=500.0_iter=0_standard", "em": 0.33, "f1": 0.4236, "avg_faiss_overlap": 0.798, "avg_entropy": 0.0746 }, { "config": "\u03b2=200.0_iter=8_residual_0.95", "em": 0.3, "f1": 0.4231, "avg_faiss_overlap": 0.786, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=2_standard", "em": 0.31, "f1": 0.4181, "avg_faiss_overlap": 0.41, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=8_residual_0.9", "em": 0.32, "f1": 0.4152, "avg_faiss_overlap": 0.636, "avg_entropy": 0.0 }, { "config": "\u03b2=200.0_iter=8_residual_0.9", "em": 0.32, "f1": 0.4152, "avg_faiss_overlap": 0.634, "avg_entropy": 0.0 }, { "config": "\u03b2=100.0_iter=5_residual_0.9", "em": 0.3, "f1": 0.4141, "avg_faiss_overlap": 0.764, "avg_entropy": 0.0014 }, { "config": "\u03b2=500.0_iter=2_residual_0.9", "em": 0.32, "f1": 0.4098, "avg_faiss_overlap": 0.584, "avg_entropy": 0.0001 }, { "config": "\u03b2=200.0_iter=5_residual_0.9", "em": 0.29, "f1": 0.4041, "avg_faiss_overlap": 0.754, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=5_residual_0.9", "em": 0.31, "f1": 0.3949, "avg_faiss_overlap": 0.34, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=8_residual_0.95", "em": 0.29, "f1": 0.3839, "avg_faiss_overlap": 0.424, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=8_residual_0.9", "em": 0.27, "f1": 0.3743, "avg_faiss_overlap": 0.228, "avg_entropy": 0.0 }, { "config": "\u03b2=20.0_iter=2_standard", "em": 0.29, "f1": 0.3713, "avg_faiss_overlap": 0.248, "avg_entropy": 4.6996 }, { "config": "\u03b2=500.0_iter=1_normalized", "em": 0.27, "f1": 0.3693, "avg_faiss_overlap": 0.236, "avg_entropy": 0.0018 }, { "config": "\u03b2=500.0_iter=1_standard", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.22, "avg_entropy": 0.0003 }, { "config": "\u03b2=500.0_iter=2_standard", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=2_normalized", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=3_standard", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=3_normalized", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=5_standard", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=5_normalized", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=8_standard", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=500.0_iter=8_normalized", "em": 0.25, "f1": 0.35, "avg_faiss_overlap": 0.202, "avg_entropy": 0.0 }, { "config": "\u03b2=20.0_iter=1_normalized", "em": 0.27, "f1": 0.3367, "avg_faiss_overlap": 0.14, "avg_entropy": 6.5894 }, { "config": "\u03b2=20.0_iter=3_standard", "em": 0.27, "f1": 0.332, "avg_faiss_overlap": 0.184, "avg_entropy": 4.6947 }, { "config": "\u03b2=20.0_iter=5_standard", "em": 0.24, "f1": 0.3116, "avg_faiss_overlap": 0.162, "avg_entropy": 4.6875 }, { "config": "\u03b2=20.0_iter=8_standard", "em": 0.23, "f1": 0.3016, "avg_faiss_overlap": 0.16, "avg_entropy": 4.684 }, { "config": "\u03b2=20.0_iter=2_normalized", "em": 0.18, "f1": 0.2255, "avg_faiss_overlap": 0.044, "avg_entropy": 6.4837 }, { "config": "\u03b2=20.0_iter=8_normalized", "em": 0.14, "f1": 0.2221, "avg_faiss_overlap": 0.02, "avg_entropy": 6.3573 }, { "config": "\u03b2=20.0_iter=3_normalized", "em": 0.14, "f1": 0.2155, "avg_faiss_overlap": 0.024, "avg_entropy": 6.4174 }, { "config": "\u03b2=20.0_iter=5_normalized", "em": 0.13, "f1": 0.1961, "avg_faiss_overlap": 0.02, "avg_entropy": 6.3707 } ], "best_config": { "config": "\u03b2=20.0_iter=1_standard", "em": 0.38, "f1": 0.4691, "avg_faiss_overlap": 0.48, "avg_entropy": 4.5305 }, "top10": [ { "config": "\u03b2=20.0_iter=1_standard", "em": 0.38, "f1": 0.4691, "avg_faiss_overlap": 0.48, "avg_entropy": 4.5305 }, { "config": "\u03b2=50.0_iter=1_standard", "em": 0.36, "f1": 0.4565, "avg_faiss_overlap": 0.508, "avg_entropy": 0.3196 }, { "config": "\u03b2=20.0_iter=1_residual_0.9", "em": 0.34, "f1": 0.4552, "avg_faiss_overlap": 0.966, "avg_entropy": 3.5526 }, { "config": "\u03b2=20.0_iter=2_residual_0.95", "em": 0.34, "f1": 0.4552, "avg_faiss_overlap": 0.966, "avg_entropy": 3.5503 }, { "config": "\u03b2=500.0_iter=1_residual_0.9", "em": 0.36, "f1": 0.4545, "avg_faiss_overlap": 0.692, "avg_entropy": 0.0074 }, { "config": "\u03b2=50.0_iter=1_normalized", "em": 0.37, "f1": 0.4539, "avg_faiss_overlap": 0.464, "avg_entropy": 1.7333 }, { "config": "\u03b2=500.0_iter=1_residual_0.95", "em": 0.36, "f1": 0.4536, "avg_faiss_overlap": 0.748, "avg_entropy": 0.013 }, { "config": "\u03b2=20.0_iter=1_residual_0.95", "em": 0.34, "f1": 0.4511, "avg_faiss_overlap": 0.98, "avg_entropy": 3.5011 }, { "config": "\u03b2=50.0_iter=2_normalized", "em": 0.37, "f1": 0.4498, "avg_faiss_overlap": 0.38, "avg_entropy": 1.0954 }, { "config": "\u03b2=20.0_iter=3_residual_0.95", "em": 0.32, "f1": 0.4494, "avg_faiss_overlap": 0.946, "avg_entropy": 3.5994 } ] }