diff options
Diffstat (limited to 'data/processed/residual_grid_results.json')
| -rw-r--r-- | data/processed/residual_grid_results.json | 1016 |
1 files changed, 1016 insertions, 0 deletions
diff --git a/data/processed/residual_grid_results.json b/data/processed/residual_grid_results.json new file mode 100644 index 0000000..c5b6d28 --- /dev/null +++ b/data/processed/residual_grid_results.json @@ -0,0 +1,1016 @@ +{ + "meta": { + "n_questions": 100, + "total_configs": 100, + "unique_llm_calls": 1666, + "faiss_llm_calls": 100, + "total_time_s": 5113.4 + }, + "faiss_baseline": { + "em": 0.32, + "f1": 0.4381 + }, + "grid_results": [ + { + "beta": 5.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.36, + "f1": 0.4809, + "avg_faiss_overlap": 0.902, + "avg_entropy": 7.1163 + }, + { + "beta": 5.0, + "lambda": 0.9, + "max_iter": 3, + "em": 0.36, + "f1": 0.4809, + "avg_faiss_overlap": 0.912, + "avg_entropy": 7.1122 + }, + { + "beta": 10.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.36, + "f1": 0.4809, + "avg_faiss_overlap": 0.9, + "avg_entropy": 6.8422 + }, + { + "beta": 10.0, + "lambda": 0.95, + "max_iter": 8, + "em": 0.36, + "f1": 0.4797, + "avg_faiss_overlap": 0.886, + "avg_entropy": 6.8941 + }, + { + "beta": 5.0, + "lambda": 0.95, + "max_iter": 8, + "em": 0.35, + "f1": 0.4697, + "avg_faiss_overlap": 0.886, + "avg_entropy": 7.1219 + }, + { + "beta": 5.0, + "lambda": 0.95, + "max_iter": 3, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.956, + "avg_entropy": 7.09 + }, + { + "beta": 5.0, + "lambda": 0.95, + "max_iter": 5, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.928, + "avg_entropy": 7.105 + }, + { + "beta": 10.0, + "lambda": 0.9, + "max_iter": 1, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.966, + "avg_entropy": 6.6138 + }, + { + "beta": 10.0, + "lambda": 0.95, + "max_iter": 3, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.956, + "avg_entropy": 6.6763 + }, + { + "beta": 5.0, + "lambda": 0.9, + "max_iter": 1, + "em": 0.34, + "f1": 0.4672, + "avg_faiss_overlap": 0.97, + "avg_entropy": 7.0815 + }, + { + "beta": 10.0, + "lambda": 0.9, + "max_iter": 5, + "em": 0.34, + "f1": 0.4637, + "avg_faiss_overlap": 0.856, + "avg_entropy": 6.9499 + }, + { + "beta": 5.0, + "lambda": 0.5, + "max_iter": 1, + "em": 0.33, + "f1": 0.4627, + "avg_faiss_overlap": 0.786, + "avg_entropy": 7.1407 + }, + { + "beta": 5.0, + "lambda": 0.9, + "max_iter": 8, + "em": 0.34, + "f1": 0.4625, + "avg_faiss_overlap": 0.724, + "avg_entropy": 7.1479 + }, + { + "beta": 10.0, + "lambda": 0.5, + "max_iter": 1, + "em": 0.33, + "f1": 0.4622, + "avg_faiss_overlap": 0.808, + "avg_entropy": 6.9842 + }, + { + "beta": 100.0, + "lambda": 0.5, + "max_iter": 1, + "em": 0.36, + "f1": 0.4608, + "avg_faiss_overlap": 0.722, + "avg_entropy": 0.0442 + }, + { + "beta": 10.0, + "lambda": 0.9, + "max_iter": 8, + "em": 0.33, + "f1": 0.46, + "avg_faiss_overlap": 0.744, + "avg_entropy": 7.0404 + }, + { + "beta": 5.0, + "lambda": 0.8, + "max_iter": 1, + "em": 0.34, + "f1": 0.4592, + "avg_faiss_overlap": 0.944, + "avg_entropy": 7.1003 + }, + { + "beta": 10.0, + "lambda": 0.8, + "max_iter": 1, + "em": 0.34, + "f1": 0.4592, + "avg_faiss_overlap": 0.938, + "avg_entropy": 6.7407 + }, + { + "beta": 10.0, + "lambda": 0.95, + "max_iter": 5, + "em": 0.34, + "f1": 0.4592, + "avg_faiss_overlap": 0.928, + "avg_entropy": 6.7819 + }, + { + "beta": 5.0, + "lambda": 0.95, + "max_iter": 1, + "em": 0.34, + "f1": 0.4556, + "avg_faiss_overlap": 0.984, + "avg_entropy": 7.0709 + }, + { + "beta": 10.0, + "lambda": 0.95, + "max_iter": 1, + "em": 0.34, + "f1": 0.4556, + "avg_faiss_overlap": 0.984, + "avg_entropy": 6.5398 + }, + { + "beta": 20.0, + "lambda": 0.9, + "max_iter": 1, + "em": 0.34, + "f1": 0.4552, + "avg_faiss_overlap": 0.966, + "avg_entropy": 3.5526 + }, + { + "beta": 20.0, + "lambda": 0.5, + "max_iter": 1, + "em": 0.34, + "f1": 0.4548, + "avg_faiss_overlap": 0.796, + "avg_entropy": 4.0138 + }, + { + "beta": 5.0, + "lambda": 0.8, + "max_iter": 3, + "em": 0.32, + "f1": 0.4527, + "avg_faiss_overlap": 0.8, + "avg_entropy": 7.14 + }, + { + "beta": 10.0, + "lambda": 0.8, + "max_iter": 3, + "em": 0.32, + "f1": 0.4527, + "avg_faiss_overlap": 0.8, + "avg_entropy": 6.9964 + }, + { + "beta": 50.0, + "lambda": 0.8, + "max_iter": 3, + "em": 0.34, + "f1": 0.4524, + "avg_faiss_overlap": 0.74, + "avg_entropy": 0.1516 + }, + { + "beta": 20.0, + "lambda": 0.95, + "max_iter": 1, + "em": 0.34, + "f1": 0.4511, + "avg_faiss_overlap": 0.98, + "avg_entropy": 3.5011 + }, + { + "beta": 10.0, + "lambda": 0.9, + "max_iter": 3, + "em": 0.33, + "f1": 0.4509, + "avg_faiss_overlap": 0.918, + "avg_entropy": 6.828 + }, + { + "beta": 50.0, + "lambda": 0.7, + "max_iter": 5, + "em": 0.34, + "f1": 0.4509, + "avg_faiss_overlap": 0.492, + "avg_entropy": 0.0065 + }, + { + "beta": 50.0, + "lambda": 0.5, + "max_iter": 3, + "em": 0.35, + "f1": 0.4501, + "avg_faiss_overlap": 0.472, + "avg_entropy": 0.0137 + }, + { + "beta": 20.0, + "lambda": 0.8, + "max_iter": 3, + "em": 0.33, + "f1": 0.4498, + "avg_faiss_overlap": 0.798, + "avg_entropy": 4.0296 + }, + { + "beta": 50.0, + "lambda": 0.5, + "max_iter": 1, + "em": 0.34, + "f1": 0.4496, + "avg_faiss_overlap": 0.752, + "avg_entropy": 0.3624 + }, + { + "beta": 20.0, + "lambda": 0.95, + "max_iter": 3, + "em": 0.32, + "f1": 0.4494, + "avg_faiss_overlap": 0.946, + "avg_entropy": 3.5994 + }, + { + "beta": 5.0, + "lambda": 0.9, + "max_iter": 5, + "em": 0.32, + "f1": 0.4487, + "avg_faiss_overlap": 0.842, + "avg_entropy": 7.1313 + }, + { + "beta": 50.0, + "lambda": 0.95, + "max_iter": 1, + "em": 0.34, + "f1": 0.4486, + "avg_faiss_overlap": 0.974, + "avg_entropy": 0.677 + }, + { + "beta": 100.0, + "lambda": 0.95, + "max_iter": 1, + "em": 0.34, + "f1": 0.4464, + "avg_faiss_overlap": 0.97, + "avg_entropy": 0.2081 + }, + { + "beta": 50.0, + "lambda": 0.8, + "max_iter": 8, + "em": 0.33, + "f1": 0.4459, + "avg_faiss_overlap": 0.482, + "avg_entropy": 0.001 + }, + { + "beta": 100.0, + "lambda": 0.8, + "max_iter": 3, + "em": 0.34, + "f1": 0.4458, + "avg_faiss_overlap": 0.704, + "avg_entropy": 0.0034 + }, + { + "beta": 100.0, + "lambda": 0.8, + "max_iter": 1, + "em": 0.33, + "f1": 0.4441, + "avg_faiss_overlap": 0.878, + "avg_entropy": 0.0927 + }, + { + "beta": 20.0, + "lambda": 0.7, + "max_iter": 3, + "em": 0.33, + "f1": 0.4433, + "avg_faiss_overlap": 0.676, + "avg_entropy": 4.2538 + }, + { + "beta": 50.0, + "lambda": 0.95, + "max_iter": 8, + "em": 0.32, + "f1": 0.4431, + "avg_faiss_overlap": 0.808, + "avg_entropy": 0.2083 + }, + { + "beta": 100.0, + "lambda": 0.95, + "max_iter": 8, + "em": 0.32, + "f1": 0.4431, + "avg_faiss_overlap": 0.794, + "avg_entropy": 0.0019 + }, + { + "beta": 50.0, + "lambda": 0.8, + "max_iter": 1, + "em": 0.32, + "f1": 0.443, + "avg_faiss_overlap": 0.894, + "avg_entropy": 0.5053 + }, + { + "beta": 100.0, + "lambda": 0.95, + "max_iter": 3, + "em": 0.33, + "f1": 0.4427, + "avg_faiss_overlap": 0.9, + "avg_entropy": 0.0754 + }, + { + "beta": 10.0, + "lambda": 0.7, + "max_iter": 3, + "em": 0.31, + "f1": 0.4426, + "avg_faiss_overlap": 0.654, + "avg_entropy": 7.0699 + }, + { + "beta": 20.0, + "lambda": 0.95, + "max_iter": 8, + "em": 0.33, + "f1": 0.442, + "avg_faiss_overlap": 0.878, + "avg_entropy": 3.8365 + }, + { + "beta": 50.0, + "lambda": 0.9, + "max_iter": 1, + "em": 0.32, + "f1": 0.4419, + "avg_faiss_overlap": 0.946, + "avg_entropy": 0.6043 + }, + { + "beta": 50.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.32, + "f1": 0.4415, + "avg_faiss_overlap": 0.836, + "avg_entropy": 0.4413 + }, + { + "beta": 100.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.32, + "f1": 0.4415, + "avg_faiss_overlap": 0.824, + "avg_entropy": 0.069 + }, + { + "beta": 20.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.32, + "f1": 0.4413, + "avg_faiss_overlap": 0.888, + "avg_entropy": 3.7761 + }, + { + "beta": 50.0, + "lambda": 0.95, + "max_iter": 5, + "em": 0.32, + "f1": 0.4407, + "avg_faiss_overlap": 0.87, + "avg_entropy": 0.3877 + }, + { + "beta": 50.0, + "lambda": 0.9, + "max_iter": 8, + "em": 0.34, + "f1": 0.4402, + "avg_faiss_overlap": 0.666, + "avg_entropy": 0.0306 + }, + { + "beta": 100.0, + "lambda": 0.9, + "max_iter": 1, + "em": 0.32, + "f1": 0.4397, + "avg_faiss_overlap": 0.932, + "avg_entropy": 0.1471 + }, + { + "beta": 100.0, + "lambda": 0.95, + "max_iter": 5, + "em": 0.32, + "f1": 0.4391, + "avg_faiss_overlap": 0.858, + "avg_entropy": 0.019 + }, + { + "beta": 50.0, + "lambda": 0.8, + "max_iter": 5, + "em": 0.32, + "f1": 0.439, + "avg_faiss_overlap": 0.592, + "avg_entropy": 0.024 + }, + { + "beta": 50.0, + "lambda": 0.5, + "max_iter": 8, + "em": 0.33, + "f1": 0.4381, + "avg_faiss_overlap": 0.41, + "avg_entropy": 0.0 + }, + { + "beta": 50.0, + "lambda": 0.95, + "max_iter": 3, + "em": 0.32, + "f1": 0.4377, + "avg_faiss_overlap": 0.912, + "avg_entropy": 0.5215 + }, + { + "beta": 20.0, + "lambda": 0.9, + "max_iter": 5, + "em": 0.32, + "f1": 0.437, + "avg_faiss_overlap": 0.846, + "avg_entropy": 3.9311 + }, + { + "beta": 100.0, + "lambda": 0.7, + "max_iter": 5, + "em": 0.32, + "f1": 0.4359, + "avg_faiss_overlap": 0.474, + "avg_entropy": 0.0 + }, + { + "beta": 100.0, + "lambda": 0.8, + "max_iter": 8, + "em": 0.32, + "f1": 0.4359, + "avg_faiss_overlap": 0.472, + "avg_entropy": 0.0 + }, + { + "beta": 20.0, + "lambda": 0.8, + "max_iter": 5, + "em": 0.32, + "f1": 0.4356, + "avg_faiss_overlap": 0.658, + "avg_entropy": 4.2886 + }, + { + "beta": 100.0, + "lambda": 0.5, + "max_iter": 3, + "em": 0.33, + "f1": 0.4351, + "avg_faiss_overlap": 0.456, + "avg_entropy": 0.0 + }, + { + "beta": 20.0, + "lambda": 0.8, + "max_iter": 1, + "em": 0.31, + "f1": 0.4349, + "avg_faiss_overlap": 0.924, + "avg_entropy": 3.6613 + }, + { + "beta": 50.0, + "lambda": 0.9, + "max_iter": 3, + "em": 0.31, + "f1": 0.4344, + "avg_faiss_overlap": 0.842, + "avg_entropy": 0.3574 + }, + { + "beta": 20.0, + "lambda": 0.9, + "max_iter": 3, + "em": 0.31, + "f1": 0.4313, + "avg_faiss_overlap": 0.9, + "avg_entropy": 3.7492 + }, + { + "beta": 20.0, + "lambda": 0.9, + "max_iter": 8, + "em": 0.31, + "f1": 0.4302, + "avg_faiss_overlap": 0.748, + "avg_entropy": 4.1579 + }, + { + "beta": 20.0, + "lambda": 0.95, + "max_iter": 5, + "em": 0.31, + "f1": 0.4299, + "avg_faiss_overlap": 0.91, + "avg_entropy": 3.6963 + }, + { + "beta": 50.0, + "lambda": 0.7, + "max_iter": 3, + "em": 0.32, + "f1": 0.4299, + "avg_faiss_overlap": 0.614, + "avg_entropy": 0.0708 + }, + { + "beta": 50.0, + "lambda": 0.9, + "max_iter": 5, + "em": 0.32, + "f1": 0.4291, + "avg_faiss_overlap": 0.778, + "avg_entropy": 0.1599 + }, + { + "beta": 100.0, + "lambda": 0.5, + "max_iter": 8, + "em": 0.32, + "f1": 0.4281, + "avg_faiss_overlap": 0.408, + "avg_entropy": 0.0 + }, + { + "beta": 50.0, + "lambda": 0.7, + "max_iter": 8, + "em": 0.33, + "f1": 0.4257, + "avg_faiss_overlap": 0.426, + "avg_entropy": 0.0 + }, + { + "beta": 10.0, + "lambda": 0.8, + "max_iter": 5, + "em": 0.29, + "f1": 0.4249, + "avg_faiss_overlap": 0.632, + "avg_entropy": 7.0765 + }, + { + "beta": 100.0, + "lambda": 0.9, + "max_iter": 3, + "em": 0.29, + "f1": 0.4244, + "avg_faiss_overlap": 0.828, + "avg_entropy": 0.019 + }, + { + "beta": 50.0, + "lambda": 0.5, + "max_iter": 5, + "em": 0.32, + "f1": 0.4225, + "avg_faiss_overlap": 0.42, + "avg_entropy": 0.0 + }, + { + "beta": 100.0, + "lambda": 0.8, + "max_iter": 5, + "em": 0.31, + "f1": 0.4165, + "avg_faiss_overlap": 0.562, + "avg_entropy": 0.0 + }, + { + "beta": 100.0, + "lambda": 0.7, + "max_iter": 8, + "em": 0.32, + "f1": 0.4157, + "avg_faiss_overlap": 0.422, + "avg_entropy": 0.0 + }, + { + "beta": 100.0, + "lambda": 0.9, + "max_iter": 8, + "em": 0.32, + "f1": 0.4152, + "avg_faiss_overlap": 0.636, + "avg_entropy": 0.0 + }, + { + "beta": 100.0, + "lambda": 0.9, + "max_iter": 5, + "em": 0.3, + "f1": 0.4141, + "avg_faiss_overlap": 0.764, + "avg_entropy": 0.0014 + }, + { + "beta": 20.0, + "lambda": 0.7, + "max_iter": 5, + "em": 0.3, + "f1": 0.4127, + "avg_faiss_overlap": 0.498, + "avg_entropy": 4.4813 + }, + { + "beta": 100.0, + "lambda": 0.5, + "max_iter": 5, + "em": 0.31, + "f1": 0.4125, + "avg_faiss_overlap": 0.416, + "avg_entropy": 0.0 + }, + { + "beta": 5.0, + "lambda": 0.8, + "max_iter": 5, + "em": 0.27, + "f1": 0.4054, + "avg_faiss_overlap": 0.614, + "avg_entropy": 7.1555 + }, + { + "beta": 5.0, + "lambda": 0.7, + "max_iter": 3, + "em": 0.27, + "f1": 0.4051, + "avg_faiss_overlap": 0.64, + "avg_entropy": 7.1544 + }, + { + "beta": 100.0, + "lambda": 0.7, + "max_iter": 3, + "em": 0.3, + "f1": 0.4049, + "avg_faiss_overlap": 0.582, + "avg_entropy": 0.0002 + }, + { + "beta": 20.0, + "lambda": 0.7, + "max_iter": 8, + "em": 0.31, + "f1": 0.3988, + "avg_faiss_overlap": 0.28, + "avg_entropy": 4.5698 + }, + { + "beta": 20.0, + "lambda": 0.5, + "max_iter": 5, + "em": 0.32, + "f1": 0.3946, + "avg_faiss_overlap": 0.258, + "avg_entropy": 4.6045 + }, + { + "beta": 20.0, + "lambda": 0.8, + "max_iter": 8, + "em": 0.28, + "f1": 0.3927, + "avg_faiss_overlap": 0.48, + "avg_entropy": 4.4867 + }, + { + "beta": 20.0, + "lambda": 0.5, + "max_iter": 3, + "em": 0.29, + "f1": 0.3925, + "avg_faiss_overlap": 0.452, + "avg_entropy": 4.5183 + }, + { + "beta": 10.0, + "lambda": 0.7, + "max_iter": 5, + "em": 0.3, + "f1": 0.379, + "avg_faiss_overlap": 0.334, + "avg_entropy": 7.112 + }, + { + "beta": 10.0, + "lambda": 0.8, + "max_iter": 8, + "em": 0.28, + "f1": 0.3659, + "avg_faiss_overlap": 0.318, + "avg_entropy": 7.1124 + }, + { + "beta": 10.0, + "lambda": 0.5, + "max_iter": 3, + "em": 0.28, + "f1": 0.353, + "avg_faiss_overlap": 0.222, + "avg_entropy": 7.1169 + }, + { + "beta": 5.0, + "lambda": 0.5, + "max_iter": 3, + "em": 0.27, + "f1": 0.3436, + "avg_faiss_overlap": 0.156, + "avg_entropy": 7.1645 + }, + { + "beta": 20.0, + "lambda": 0.5, + "max_iter": 8, + "em": 0.25, + "f1": 0.3298, + "avg_faiss_overlap": 0.186, + "avg_entropy": 4.565 + }, + { + "beta": 5.0, + "lambda": 0.7, + "max_iter": 5, + "em": 0.24, + "f1": 0.3274, + "avg_faiss_overlap": 0.278, + "avg_entropy": 7.1633 + }, + { + "beta": 5.0, + "lambda": 0.8, + "max_iter": 8, + "em": 0.24, + "f1": 0.3274, + "avg_faiss_overlap": 0.276, + "avg_entropy": 7.1634 + }, + { + "beta": 10.0, + "lambda": 0.7, + "max_iter": 8, + "em": 0.15, + "f1": 0.1995, + "avg_faiss_overlap": 0.044, + "avg_entropy": 7.123 + }, + { + "beta": 10.0, + "lambda": 0.5, + "max_iter": 5, + "em": 0.15, + "f1": 0.1983, + "avg_faiss_overlap": 0.022, + "avg_entropy": 7.1239 + }, + { + "beta": 5.0, + "lambda": 0.5, + "max_iter": 5, + "em": 0.14, + "f1": 0.1895, + "avg_faiss_overlap": 0.018, + "avg_entropy": 7.166 + }, + { + "beta": 5.0, + "lambda": 0.5, + "max_iter": 8, + "em": 0.14, + "f1": 0.1877, + "avg_faiss_overlap": 0.002, + "avg_entropy": 7.1661 + }, + { + "beta": 5.0, + "lambda": 0.7, + "max_iter": 8, + "em": 0.12, + "f1": 0.1824, + "avg_faiss_overlap": 0.034, + "avg_entropy": 7.1658 + }, + { + "beta": 10.0, + "lambda": 0.5, + "max_iter": 8, + "em": 0.12, + "f1": 0.1766, + "avg_faiss_overlap": 0.002, + "avg_entropy": 7.1239 + } + ], + "best_config": { + "beta": 5.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.36, + "f1": 0.4809, + "avg_faiss_overlap": 0.902, + "avg_entropy": 7.1163 + }, + "top10": [ + { + "beta": 5.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.36, + "f1": 0.4809, + "avg_faiss_overlap": 0.902, + "avg_entropy": 7.1163 + }, + { + "beta": 5.0, + "lambda": 0.9, + "max_iter": 3, + "em": 0.36, + "f1": 0.4809, + "avg_faiss_overlap": 0.912, + "avg_entropy": 7.1122 + }, + { + "beta": 10.0, + "lambda": 0.7, + "max_iter": 1, + "em": 0.36, + "f1": 0.4809, + "avg_faiss_overlap": 0.9, + "avg_entropy": 6.8422 + }, + { + "beta": 10.0, + "lambda": 0.95, + "max_iter": 8, + "em": 0.36, + "f1": 0.4797, + "avg_faiss_overlap": 0.886, + "avg_entropy": 6.8941 + }, + { + "beta": 5.0, + "lambda": 0.95, + "max_iter": 8, + "em": 0.35, + "f1": 0.4697, + "avg_faiss_overlap": 0.886, + "avg_entropy": 7.1219 + }, + { + "beta": 5.0, + "lambda": 0.95, + "max_iter": 3, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.956, + "avg_entropy": 7.09 + }, + { + "beta": 5.0, + "lambda": 0.95, + "max_iter": 5, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.928, + "avg_entropy": 7.105 + }, + { + "beta": 10.0, + "lambda": 0.9, + "max_iter": 1, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.966, + "avg_entropy": 6.6138 + }, + { + "beta": 10.0, + "lambda": 0.95, + "max_iter": 3, + "em": 0.35, + "f1": 0.4692, + "avg_faiss_overlap": 0.956, + "avg_entropy": 6.6763 + }, + { + "beta": 5.0, + "lambda": 0.9, + "max_iter": 1, + "em": 0.34, + "f1": 0.4672, + "avg_faiss_overlap": 0.97, + "avg_entropy": 7.0815 + } + ] +}
\ No newline at end of file |
