diff options
Diffstat (limited to 'runs/seeds_sweep.log')
| -rw-r--r-- | runs/seeds_sweep.log | 234 |
1 files changed, 234 insertions, 0 deletions
diff --git a/runs/seeds_sweep.log b/runs/seeds_sweep.log new file mode 100644 index 0000000..7e1b08f --- /dev/null +++ b/runs/seeds_sweep.log @@ -0,0 +1,234 @@ +host=timan1.cs.illinois.edu gpu=0 start=2026-06-16T11:50:28-05:00 +===== seed=0 full none ===== +ep20 solve_rate=0.000 mean_conflicts=114.06 +ep40 solve_rate=0.163 mean_conflicts=61.27 +ep60 solve_rate=0.410 mean_conflicts=17.81 +ep80 solve_rate=0.610 mean_conflicts=4.32 +ep100 solve_rate=0.440 mean_conflicts=16.00 +ep120 solve_rate=0.287 mean_conflicts=19.54 +ep140 solve_rate=0.090 mean_conflicts=59.83 +ep150 solve_rate=0.063 mean_conflicts=70.46 +[color_full_none_n50_k3_p0.2_T3_ns3_s0] best solve_rate=0.61 mean_conflicts=4.323 @ep80 (155.3s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_none_n50_k3_p0.2_T3_ns3_s0.pt +[full] LE n=300 fail_rate=0.39 | lambda1 SOLVED mean -0.3206 (n=183) | UNSOLVED mean +0.0485 (n=117) | sep=+0.3692 | AUROC(fail|lambda1)=0.919 | mean_lambda1=-0.1767 +[pe=none s0] deterministic solve_rate = 0.627 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.900 0.773 0.640 0.595 0.883 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_none_n50_k3_p0.2_T3_ns3_s0.json +===== seed=0 full rwse ===== +ep20 solve_rate=0.000 mean_conflicts=51.22 +ep40 solve_rate=0.083 mean_conflicts=55.16 +ep60 solve_rate=0.203 mean_conflicts=58.38 +ep80 solve_rate=0.350 mean_conflicts=37.23 +ep100 solve_rate=0.383 mean_conflicts=31.48 +ep120 solve_rate=0.203 mean_conflicts=53.56 +ep140 solve_rate=0.273 mean_conflicts=37.79 +ep150 solve_rate=0.280 mean_conflicts=36.98 +[color_full_rwse_n50_k3_p0.2_T3_ns3_s0] best solve_rate=0.3833 mean_conflicts=31.483 @ep100 (153.3s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_rwse_n50_k3_p0.2_T3_ns3_s0.pt +[full] LE n=300 fail_rate=0.62 | lambda1 SOLVED mean -0.1834 (n=115) | UNSOLVED mean +0.0100 (n=185) | sep=+0.1934 | AUROC(fail|lambda1)=0.817 | mean_lambda1=-0.0641 +[pe=rwse s0] deterministic solve_rate = 0.407 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.740 0.587 0.493 0.426 0.808 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_rwse_n50_k3_p0.2_T3_ns3_s0.json +===== seed=0 1step none ===== +ep20 solve_rate=0.000 mean_conflicts=44.63 +ep40 solve_rate=0.123 mean_conflicts=19.01 +ep60 solve_rate=0.200 mean_conflicts=18.33 +ep80 solve_rate=0.067 mean_conflicts=42.81 +ep100 solve_rate=0.027 mean_conflicts=42.48 +ep120 solve_rate=0.087 mean_conflicts=23.53 +ep140 solve_rate=0.100 mean_conflicts=10.46 +ep150 solve_rate=0.127 mean_conflicts=18.32 +[color_1step_none_n50_k3_p0.2_T3_ns3_s0] best solve_rate=0.2 mean_conflicts=18.333 @ep60 (127.1s) + wrote /home/yurenh2/rrog/runs/ckpt_color_1step_none_n50_k3_p0.2_T3_ns3_s0.pt +[1step] LE n=300 fail_rate=0.80 | lambda1 SOLVED mean -0.2978 (n=60) | UNSOLVED mean -0.1209 (n=240) | sep=+0.1769 | AUROC(fail|lambda1)=0.770 | mean_lambda1=-0.1563 +===== seed=1 full none ===== +ep20 solve_rate=0.000 mean_conflicts=76.94 +ep40 solve_rate=0.070 mean_conflicts=41.69 +ep60 solve_rate=0.207 mean_conflicts=33.02 +ep80 solve_rate=0.453 mean_conflicts=11.75 +ep100 solve_rate=0.203 mean_conflicts=21.24 +ep120 solve_rate=0.163 mean_conflicts=19.60 +ep140 solve_rate=0.067 mean_conflicts=41.63 +ep150 solve_rate=0.027 mean_conflicts=50.77 +[color_full_none_n50_k3_p0.2_T3_ns3_s1] best solve_rate=0.4533 mean_conflicts=11.753 @ep80 (151.0s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_none_n50_k3_p0.2_T3_ns3_s1.pt +[full] LE n=300 fail_rate=0.55 | lambda1 SOLVED mean -0.3182 (n=136) | UNSOLVED mean -0.0426 (n=164) | sep=+0.2757 | AUROC(fail|lambda1)=0.864 | mean_lambda1=-0.1675 +[pe=none s1] deterministic solve_rate = 0.480 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.740 0.593 0.440 0.456 0.869 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_none_n50_k3_p0.2_T3_ns3_s1.json +===== seed=1 full rwse ===== +ep20 solve_rate=0.000 mean_conflicts=115.59 +ep40 solve_rate=0.100 mean_conflicts=91.86 +ep60 solve_rate=0.327 mean_conflicts=34.60 +ep80 solve_rate=0.097 mean_conflicts=88.34 +ep100 solve_rate=0.140 mean_conflicts=56.41 +ep120 solve_rate=0.500 mean_conflicts=16.20 +ep140 solve_rate=0.537 mean_conflicts=13.68 +ep150 solve_rate=0.500 mean_conflicts=16.04 +[color_full_rwse_n50_k3_p0.2_T3_ns3_s1] best solve_rate=0.5367 mean_conflicts=13.68 @ep140 (152.1s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_rwse_n50_k3_p0.2_T3_ns3_s1.pt +[full] LE n=300 fail_rate=0.46 | lambda1 SOLVED mean +0.0219 (n=161) | UNSOLVED mean +0.1773 (n=139) | sep=+0.1554 | AUROC(fail|lambda1)=0.774 | mean_lambda1=+0.0939 +[pe=rwse s1] deterministic solve_rate = 0.547 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.947 0.727 0.553 0.507 0.814 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_rwse_n50_k3_p0.2_T3_ns3_s1.json +===== seed=1 1step none ===== +ep20 solve_rate=0.000 mean_conflicts=47.30 +ep40 solve_rate=0.157 mean_conflicts=34.06 +ep60 solve_rate=0.297 mean_conflicts=7.36 +ep80 solve_rate=0.333 mean_conflicts=8.70 +ep100 solve_rate=0.193 mean_conflicts=13.99 +ep120 solve_rate=0.233 mean_conflicts=6.94 +ep140 solve_rate=0.227 mean_conflicts=6.85 +ep150 solve_rate=0.320 mean_conflicts=5.31 +[color_1step_none_n50_k3_p0.2_T3_ns3_s1] best solve_rate=0.3333 mean_conflicts=8.7 @ep80 (123.3s) + wrote /home/yurenh2/rrog/runs/ckpt_color_1step_none_n50_k3_p0.2_T3_ns3_s1.pt +[1step] LE n=300 fail_rate=0.67 | lambda1 SOLVED mean -0.2773 (n=100) | UNSOLVED mean +0.0189 (n=200) | sep=+0.2963 | AUROC(fail|lambda1)=0.846 | mean_lambda1=-0.0798 +===== seed=2 full none ===== +ep20 solve_rate=0.000 mean_conflicts=137.74 +ep40 solve_rate=0.087 mean_conflicts=44.48 +ep60 solve_rate=0.140 mean_conflicts=42.89 +ep80 solve_rate=0.413 mean_conflicts=16.23 +ep100 solve_rate=0.123 mean_conflicts=60.71 +ep120 solve_rate=0.027 mean_conflicts=58.64 +ep140 solve_rate=0.017 mean_conflicts=79.05 +ep150 solve_rate=0.013 mean_conflicts=83.54 +[color_full_none_n50_k3_p0.2_T3_ns3_s2] best solve_rate=0.4133 mean_conflicts=16.23 @ep80 (154.4s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_none_n50_k3_p0.2_T3_ns3_s2.pt +[full] LE n=300 fail_rate=0.59 | lambda1 SOLVED mean -0.2913 (n=124) | UNSOLVED mean -0.0796 (n=176) | sep=+0.2117 | AUROC(fail|lambda1)=0.830 | mean_lambda1=-0.1671 +[pe=none s2] deterministic solve_rate = 0.447 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.773 0.620 0.467 0.435 0.821 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_none_n50_k3_p0.2_T3_ns3_s2.json +===== seed=2 full rwse ===== +ep20 solve_rate=0.000 mean_conflicts=89.84 +ep40 solve_rate=0.110 mean_conflicts=36.78 +ep60 solve_rate=0.400 mean_conflicts=9.39 +ep80 solve_rate=0.560 mean_conflicts=3.40 +ep100 solve_rate=0.463 mean_conflicts=23.90 +ep120 solve_rate=0.550 mean_conflicts=16.37 +ep140 solve_rate=0.557 mean_conflicts=23.72 +ep150 solve_rate=0.510 mean_conflicts=34.19 +[color_full_rwse_n50_k3_p0.2_T3_ns3_s2] best solve_rate=0.56 mean_conflicts=3.4 @ep80 (151.8s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_rwse_n50_k3_p0.2_T3_ns3_s2.pt +[full] LE n=300 fail_rate=0.44 | lambda1 SOLVED mean -0.2328 (n=168) | UNSOLVED mean +0.0707 (n=132) | sep=+0.3034 | AUROC(fail|lambda1)=0.883 | mean_lambda1=-0.0993 +[pe=rwse s2] deterministic solve_rate = 0.600 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.893 0.787 0.573 0.556 0.873 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_rwse_n50_k3_p0.2_T3_ns3_s2.json +===== seed=2 1step none ===== +ep20 solve_rate=0.000 mean_conflicts=38.00 +ep40 solve_rate=0.153 mean_conflicts=18.17 +ep60 solve_rate=0.327 mean_conflicts=7.32 +ep80 solve_rate=0.320 mean_conflicts=5.31 +ep100 solve_rate=0.357 mean_conflicts=4.67 +ep120 solve_rate=0.293 mean_conflicts=7.30 +ep140 solve_rate=0.273 mean_conflicts=7.96 +ep150 solve_rate=0.227 mean_conflicts=10.33 +[color_1step_none_n50_k3_p0.2_T3_ns3_s2] best solve_rate=0.3567 mean_conflicts=4.67 @ep100 (131.6s) + wrote /home/yurenh2/rrog/runs/ckpt_color_1step_none_n50_k3_p0.2_T3_ns3_s2.pt +[1step] LE n=300 fail_rate=0.64 | lambda1 SOLVED mean -0.2153 (n=107) | UNSOLVED mean +0.0916 (n=193) | sep=+0.3069 | AUROC(fail|lambda1)=0.867 | mean_lambda1=-0.0179 +===== seed=3 full none ===== +ep20 solve_rate=0.000 mean_conflicts=62.81 +ep40 solve_rate=0.207 mean_conflicts=33.51 +ep60 solve_rate=0.460 mean_conflicts=15.68 +ep80 solve_rate=0.033 mean_conflicts=114.30 +ep100 solve_rate=0.000 mean_conflicts=82.87 +ep120 solve_rate=0.000 mean_conflicts=85.01 +ep140 solve_rate=0.000 mean_conflicts=77.61 +ep150 solve_rate=0.000 mean_conflicts=79.58 +[color_full_none_n50_k3_p0.2_T3_ns3_s3] best solve_rate=0.46 mean_conflicts=15.68 @ep60 (151.6s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_none_n50_k3_p0.2_T3_ns3_s3.pt +[full] LE n=300 fail_rate=0.54 | lambda1 SOLVED mean -0.4216 (n=138) | UNSOLVED mean -0.0826 (n=162) | sep=+0.3390 | AUROC(fail|lambda1)=0.886 | mean_lambda1=-0.2385 +[pe=none s3] deterministic solve_rate = 0.493 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.733 0.587 0.493 0.472 0.856 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_none_n50_k3_p0.2_T3_ns3_s3.json +===== seed=3 full rwse ===== +ep20 solve_rate=0.000 mean_conflicts=139.52 +ep40 solve_rate=0.043 mean_conflicts=135.26 +ep60 solve_rate=0.313 mean_conflicts=32.37 +ep80 solve_rate=0.370 mean_conflicts=14.78 +ep100 solve_rate=0.490 mean_conflicts=12.46 +ep120 solve_rate=0.067 mean_conflicts=89.87 +ep140 solve_rate=0.137 mean_conflicts=53.29 +ep150 solve_rate=0.127 mean_conflicts=46.23 +[color_full_rwse_n50_k3_p0.2_T3_ns3_s3] best solve_rate=0.49 mean_conflicts=12.46 @ep100 (152.1s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_rwse_n50_k3_p0.2_T3_ns3_s3.pt +[full] LE n=300 fail_rate=0.51 | lambda1 SOLVED mean -0.0438 (n=147) | UNSOLVED mean +0.1618 (n=153) | sep=+0.2056 | AUROC(fail|lambda1)=0.830 | mean_lambda1=+0.0610 +[pe=rwse s3] deterministic solve_rate = 0.507 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.893 0.727 0.473 0.497 0.823 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_rwse_n50_k3_p0.2_T3_ns3_s3.json +===== seed=3 1step none ===== +ep20 solve_rate=0.000 mean_conflicts=54.09 +ep40 solve_rate=0.103 mean_conflicts=17.25 +ep60 solve_rate=0.187 mean_conflicts=28.48 +ep80 solve_rate=0.120 mean_conflicts=29.85 +ep100 solve_rate=0.200 mean_conflicts=16.34 +ep120 solve_rate=0.337 mean_conflicts=7.23 +ep140 solve_rate=0.157 mean_conflicts=16.67 +ep150 solve_rate=0.087 mean_conflicts=19.67 +[color_1step_none_n50_k3_p0.2_T3_ns3_s3] best solve_rate=0.3367 mean_conflicts=7.227 @ep120 (124.3s) + wrote /home/yurenh2/rrog/runs/ckpt_color_1step_none_n50_k3_p0.2_T3_ns3_s3.pt +[1step] LE n=300 fail_rate=0.66 | lambda1 SOLVED mean -0.0124 (n=101) | UNSOLVED mean +0.1694 (n=199) | sep=+0.1818 | AUROC(fail|lambda1)=0.786 | mean_lambda1=+0.1082 +===== seed=4 full none ===== +ep20 solve_rate=0.000 mean_conflicts=67.14 +ep40 solve_rate=0.200 mean_conflicts=24.37 +ep60 solve_rate=0.347 mean_conflicts=27.20 +ep80 solve_rate=0.127 mean_conflicts=41.29 +ep100 solve_rate=0.180 mean_conflicts=35.82 +ep120 solve_rate=0.073 mean_conflicts=66.69 +ep140 solve_rate=0.070 mean_conflicts=74.02 +ep150 solve_rate=0.073 mean_conflicts=81.13 +[color_full_none_n50_k3_p0.2_T3_ns3_s4] best solve_rate=0.3467 mean_conflicts=27.197 @ep60 (149.8s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_none_n50_k3_p0.2_T3_ns3_s4.pt +[full] LE n=300 fail_rate=0.65 | lambda1 SOLVED mean -0.4008 (n=104) | UNSOLVED mean -0.1440 (n=196) | sep=+0.2567 | AUROC(fail|lambda1)=0.842 | mean_lambda1=-0.2330 +[pe=none s4] deterministic solve_rate = 0.353 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.547 0.427 0.373 0.358 0.835 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_none_n50_k3_p0.2_T3_ns3_s4.json +===== seed=4 full rwse ===== +ep20 solve_rate=0.000 mean_conflicts=56.06 +ep40 solve_rate=0.050 mean_conflicts=55.24 +ep60 solve_rate=0.303 mean_conflicts=39.93 +ep80 solve_rate=0.357 mean_conflicts=15.34 +ep100 solve_rate=0.547 mean_conflicts=6.19 +ep120 solve_rate=0.440 mean_conflicts=18.82 +ep140 solve_rate=0.240 mean_conflicts=41.55 +ep150 solve_rate=0.233 mean_conflicts=46.02 +[color_full_rwse_n50_k3_p0.2_T3_ns3_s4] best solve_rate=0.5467 mean_conflicts=6.193 @ep100 (154.2s) + wrote /home/yurenh2/rrog/runs/ckpt_color_full_rwse_n50_k3_p0.2_T3_ns3_s4.pt +[full] LE n=300 fail_rate=0.45 | lambda1 SOLVED mean -0.1742 (n=164) | UNSOLVED mean +0.1084 (n=136) | sep=+0.2827 | AUROC(fail|lambda1)=0.880 | mean_lambda1=-0.0461 +[pe=rwse s4] deterministic solve_rate = 0.547 (n=150, K=16) + sigma pass@K lam-sel random perRoll AUROC(s|-lam) + 0.2 0.940 0.853 0.493 0.557 0.868 + wrote /home/yurenh2/rrog/runs/ptrm_color_full_rwse_n50_k3_p0.2_T3_ns3_s4.json +===== seed=4 1step none ===== +ep20 solve_rate=0.000 mean_conflicts=63.11 +ep40 solve_rate=0.187 mean_conflicts=11.62 +ep60 solve_rate=0.237 mean_conflicts=22.39 +ep80 solve_rate=0.200 mean_conflicts=24.78 +ep100 solve_rate=0.183 mean_conflicts=15.49 +ep120 solve_rate=0.207 mean_conflicts=12.25 +ep140 solve_rate=0.093 mean_conflicts=21.66 +ep150 solve_rate=0.067 mean_conflicts=30.98 +[color_1step_none_n50_k3_p0.2_T3_ns3_s4] best solve_rate=0.2367 mean_conflicts=22.393 @ep60 (120.8s) + wrote /home/yurenh2/rrog/runs/ckpt_color_1step_none_n50_k3_p0.2_T3_ns3_s4.pt +[1step] LE n=300 fail_rate=0.76 | lambda1 SOLVED mean -0.2996 (n=71) | UNSOLVED mean -0.0523 (n=229) | sep=+0.2473 | AUROC(fail|lambda1)=0.835 | mean_lambda1=-0.1108 +===== AGGREGATE ===== +=== best solve_rate (deterministic, EMA) === + ('1step', 'none'): 0.293±0.062 (n=5) + ('full', 'none'): 0.457±0.087 (n=5) + ('full', 'rwse'): 0.503±0.064 (n=5) +=== LE AUROC(fail|lambda1) === + ('1step', 'none'): 0.821±0.036 (n=5) + ('full', 'none'): 0.868±0.032 (n=5) + ('full', 'rwse'): 0.837±0.041 (n=5) +=== PTRM sigma=0.2 === + ('full', 'none'): det 0.480±0.088 (n=5) | pass@K 0.739±0.113 (n=5) | lambda-sel 0.600±0.110 (n=5) | AUROC 0.853±0.023 (n=5) + ('full', 'rwse'): det 0.521±0.065 (n=5) | pass@K 0.883±0.075 (n=5) | lambda-sel 0.736±0.088 (n=5) | AUROC 0.837±0.028 (n=5) +done=2026-06-16T12:41:20-05:00 |
