diff options
Diffstat (limited to 'research/flossing/initial_perturb_robustness')
14 files changed, 250 insertions, 0 deletions
diff --git a/research/flossing/initial_perturb_robustness/plots/initial_perturb_robustness_combined.csv b/research/flossing/initial_perturb_robustness/plots/initial_perturb_robustness_combined.csv new file mode 100644 index 0000000..12fdf18 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/plots/initial_perturb_robustness_combined.csv @@ -0,0 +1,28 @@ +label,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,correct_count_q10,correct_count_q50,correct_count_q90,zero_frac,full_frac
+trm_baseline_best,0.0,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8658000230789185,0.9502986073493958,0.8658000230789185,0.8658000230789185,6.926400184631348,2.7269365787506104,0.0,8.0,8.0,0.13420000672340393,0.8658000230789185
+trm_baseline_best,3e-05,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.868399977684021,0.9510218501091003,0.9455999732017517,0.769599974155426,6.947199821472168,2.309980869293213,3.0,8.0,8.0,0.0544000007212162,0.769599974155426
+trm_baseline_best,0.0001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8675500154495239,0.9506199955940247,0.9485999941825867,0.769599974155426,6.940400123596191,2.3122386932373047,2.0,8.0,8.0,0.05139999836683273,0.769599974155426
+trm_baseline_best,0.0003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8685500025749207,0.9512001872062683,0.9476000070571899,0.7698000073432922,6.948400020599365,2.310267925262451,3.0,8.0,8.0,0.052400000393390656,0.7698000073432922
+trm_baseline_best,0.001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8694999814033508,0.9514188766479492,0.9488000273704529,0.7712000012397766,6.955999851226807,2.289206027984619,3.0,8.0,8.0,0.05119999870657921,0.7712000012397766
+trm_baseline_best,0.003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8676750063896179,0.950739860534668,0.9448000192642212,0.7666000127792358,6.941400051116943,2.311615467071533,3.0,8.0,8.0,0.0551999993622303,0.7666000127792358
+trm_baseline_best,0.01,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669000267982483,0.9505268335342407,0.9452000260353088,0.7635999917984009,6.935200214385986,2.3034324645996094,3.0,8.0,8.0,0.05480000004172325,0.7635999917984009
+trm_baseline_best,0.03,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8665000200271606,0.9502792954444885,0.9503999948501587,0.7573999762535095,6.932000160217285,2.2814416885375977,3.0,8.0,8.0,0.04960000142455101,0.7573999762535095
+trm_baseline_best,0.1,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8664500117301941,0.9503796696662903,0.9553999900817871,0.7558000087738037,6.931600093841553,2.2636523246765137,3.0,8.0,8.0,0.044599998742341995,0.7558000087738037
+trm_multi4_best,0.0,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8920000195503235,0.9585950970649719,0.8920000195503235,0.8920000195503235,7.136000156402588,2.4830431938171387,0.0,8.0,8.0,0.1080000028014183,0.8920000195503235
+trm_multi4_best,3e-05,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8941249847412109,0.9596672654151917,0.9577999711036682,0.8095999956130981,7.1529998779296875,2.0940847396850586,4.0,8.0,8.0,0.0421999990940094,0.8095999956130981
+trm_multi4_best,0.0001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8919249773025513,0.9588295817375183,0.9574000239372253,0.8040000200271606,7.13539981842041,2.107004165649414,4.0,8.0,8.0,0.04259999841451645,0.8040000200271606
+trm_multi4_best,0.0003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9595761895179749,0.9570000171661377,0.8118000030517578,7.1519999504089355,2.106584072113037,4.0,8.0,8.0,0.0430000014603138,0.8118000030517578
+trm_multi4_best,0.001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8934500217437744,0.9593700170516968,0.9535999894142151,0.8091999888420105,7.147600173950195,2.1107852458953857,4.0,8.0,8.0,0.04639999940991402,0.8091999888420105
+trm_multi4_best,0.003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8934749960899353,0.9593802690505981,0.9593999981880188,0.8101999759674072,7.147799968719482,2.1013221740722656,4.0,8.0,8.0,0.0406000018119812,0.8101999759674072
+trm_multi4_best,0.01,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8938249945640564,0.9593154191970825,0.9599999785423279,0.7993999719619751,7.150599956512451,2.065894365310669,4.0,8.0,8.0,0.03999999910593033,0.7993999719619751
+trm_multi4_best,0.03,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8929749727249146,0.9591527581214905,0.9606000185012817,0.7986000180244446,7.143799781799316,2.0669593811035156,4.0,8.0,8.0,0.039400000125169754,0.7986000180244446
+trm_multi4_best,0.1,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8917250037193298,0.9587355256080627,0.9639999866485596,0.79339998960495,7.133800029754639,2.062788724899292,4.0,8.0,8.0,0.035999998450279236,0.79339998960495
+trm_multi4_final,0.0,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8452000021934509,0.9396741390228271,0.8452000021934509,0.8452000021934509,6.761600017547607,2.8937113285064697,0.0,8.0,8.0,0.15479999780654907,0.8452000021934509
+trm_multi4_final,3e-05,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.840499997138977,0.9371709823608398,0.9114000201225281,0.7591999769210815,6.723999977111816,2.6249237060546875,1.0,8.0,8.0,0.08860000222921371,0.7591999769210815
+trm_multi4_final,0.0001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8407999873161316,0.9373589754104614,0.9136000275611877,0.7577999830245972,6.726399898529053,2.612344264984131,1.0,8.0,8.0,0.08640000224113464,0.7577999830245972
+trm_multi4_final,0.0003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8399249911308289,0.9371064305305481,0.9092000126838684,0.7572000026702881,6.719399929046631,2.6245501041412354,1.0,8.0,8.0,0.09080000221729279,0.7572000026702881
+trm_multi4_final,0.001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8385249972343445,0.9361668229103088,0.9089999794960022,0.7588000297546387,6.708199977874756,2.6401236057281494,1.0,8.0,8.0,0.09099999815225601,0.7588000297546387
+trm_multi4_final,0.003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8406999707221985,0.9372812509536743,0.9165999889373779,0.753000020980835,6.725599765777588,2.6047465801239014,1.0,8.0,8.0,0.08340000361204147,0.753000020980835
+trm_multi4_final,0.01,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.838100016117096,0.9362147450447083,0.9132000207901001,0.7477999925613403,6.704800128936768,2.612366199493408,1.0,8.0,8.0,0.0868000015616417,0.7477999925613403
+trm_multi4_final,0.03,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8380249738693237,0.9364882707595825,0.9197999835014343,0.7378000020980835,6.70419979095459,2.575946807861328,1.0,8.0,8.0,0.08020000159740448,0.7378000020980835
+trm_multi4_final,0.1,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.841949999332428,0.9376598596572876,0.9305999875068665,0.7305999994277954,6.735599994659424,2.5063304901123047,2.0,8.0,8.0,0.06939999759197235,0.7305999994277954
diff --git a/research/flossing/initial_perturb_robustness/plots/initial_perturb_robustness_conditional.csv b/research/flossing/initial_perturb_robustness/plots/initial_perturb_robustness_conditional.csv new file mode 100644 index 0000000..b7bf3a1 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/plots/initial_perturb_robustness_conditional.csv @@ -0,0 +1,28 @@ +label,sigma,clean_acc,n_clean_success,n_clean_fail,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail
+baseline best,0.0,0.8658,4329,671,1.0,1.0,0.0,0.0
+baseline best,2.9999999242136255e-05,0.8658,4329,671,0.9607010857010857,0.8833448833448834,0.2729135618479881,0.6140089418777943
+baseline best,9.999999747378752e-05,0.8658,4329,671,0.959026334026334,0.8838068838068838,0.27738450074515647,0.6363636363636364
+baseline best,0.0003000000142492354,0.8658,4329,671,0.9602102102102102,0.8824208824208825,0.27719821162444114,0.6274217585692996
+baseline best,0.0010000000474974513,0.8658,4329,671,0.9605855855855856,0.8854238854238854,0.2818554396423249,0.6348733233979136
+baseline best,0.003000000026077032,0.8658,4329,671,0.9599214599214599,0.8791868791868792,0.2725409836065574,0.6080476900149031
+baseline best,0.009999999776482582,0.8658,4329,671,0.9584488334488335,0.8768768768768769,0.27626676602086436,0.6005961251862891
+baseline best,0.029999999329447746,0.8658,4329,671,0.9548972048972049,0.86994686994687,0.29619970193740686,0.6482861400894188
+baseline best,0.10000000149011612,0.8658,4329,671,0.9538865788865789,0.8655578655578655,0.3023472429210134,0.6795827123695977
+multi4 best,0.0,0.892,4460,540,1.0,1.0,0.0,0.0
+multi4 best,2.9999999242136255e-05,0.892,4460,540,0.9675728699551569,0.9042600896860986,0.2875,0.6314814814814815
+multi4 best,9.999999747378752e-05,0.892,4460,540,0.965695067264574,0.897982062780269,0.2826388888888889,0.6222222222222222
+multi4 best,0.0003000000142492354,0.892,4460,540,0.9668721973094171,0.905829596412556,0.29212962962962963,0.6296296296296297
+multi4 best,0.0010000000474974513,0.892,4460,540,0.9665078475336323,0.9013452914798207,0.2900462962962963,0.5981481481481481
+multi4 best,0.003000000026077032,0.892,4460,540,0.9664798206278027,0.9035874439461884,0.29050925925925924,0.6481481481481481
+multi4 best,0.009999999776482582,0.892,4460,540,0.965695067264574,0.8917040358744395,0.3002314814814815,0.6425925925925926
+multi4 best,0.029999999329447746,0.892,4460,540,0.9631726457399103,0.8905829596412556,0.31319444444444444,0.6555555555555556
+multi4 best,0.10000000149011612,0.892,4460,540,0.9617432735426009,0.8847533632286996,0.31342592592592594,0.6833333333333333
+multi4 final,0.0,0.8452,4226,774,1.0,1.0,0.0,0.0
+multi4 final,2.9999999242136255e-05,0.8452,4226,774,0.9613700899195456,0.8946994794131566,0.18055555555555555,0.4483204134366925
+multi4 final,9.999999747378752e-05,0.8452,4226,774,0.959979886417416,0.8928064363464269,0.19008397932816537,0.4625322997416021
+multi4 final,0.0003000000142492354,0.8452,4226,774,0.9602756743965926,0.8916232844297208,0.18281653746770027,0.4405684754521964
+multi4 final,0.0010000000474974513,0.8452,4226,774,0.9595362044486512,0.8937529578797918,0.17781007751937986,0.4418604651162791
+multi4 final,0.003000000026077032,0.8452,4226,774,0.9595362044486512,0.8868906767628963,0.19186046511627908,0.4844961240310077
+multi4 final,0.009999999776482582,0.8452,4226,774,0.9556613819214387,0.8797917652626597,0.19622093023255813,0.46511627906976744
+multi4 final,0.029999999329447746,0.8452,4226,774,0.9528513961192617,0.8684335068622812,0.21107881136950904,0.5064599483204134
+multi4 final,0.10000000149011612,0.8452,4226,774,0.949420255560814,0.8563653573118788,0.25516795865633074,0.5762273901808785
diff --git a/research/flossing/initial_perturb_robustness/smoke_baseline.meta.json b/research/flossing/initial_perturb_robustness/smoke_baseline.meta.json new file mode 100644 index 0000000..099f473 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/smoke_baseline.meta.json @@ -0,0 +1,22 @@ +{ + "args": { + "batch_size": 8, + "ckpt_name": "step_58590", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro", + "label": "trm_baseline_smoke", + "n_samples": 16, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/initial_perturb_robustness/smoke_baseline", + "perturb": "both", + "rollouts": 2, + "seed": 20260605, + "sigmas": "0,0.001" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 16, + "sigmas": [ + 0.0, + 0.001 + ] +}
\ No newline at end of file diff --git a/research/flossing/initial_perturb_robustness/smoke_baseline.summary.csv b/research/flossing/initial_perturb_robustness/smoke_baseline.summary.csv new file mode 100644 index 0000000..c5fd40c --- /dev/null +++ b/research/flossing/initial_perturb_robustness/smoke_baseline.summary.csv @@ -0,0 +1,3 @@ +label,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,correct_count_q10,correct_count_q50,correct_count_q90,zero_frac,full_frac
+trm_baseline_smoke,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9537037014961243,0.875,0.875,1.75,0.6614378094673157,1.0,2.0,2.0,0.125,0.875
+trm_baseline_smoke,0.001,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8125,0.9309413433074951,0.875,0.75,1.625,0.6959705352783203,0.5,2.0,2.0,0.125,0.75
diff --git a/research/flossing/initial_perturb_robustness/smoke_baseline_b32k8.meta.json b/research/flossing/initial_perturb_robustness/smoke_baseline_b32k8.meta.json new file mode 100644 index 0000000..00b7c1c --- /dev/null +++ b/research/flossing/initial_perturb_robustness/smoke_baseline_b32k8.meta.json @@ -0,0 +1,21 @@ +{ + "args": { + "batch_size": 32, + "ckpt_name": "step_58590", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro", + "label": "trm_baseline_b32k8_smoke", + "n_samples": 32, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/initial_perturb_robustness/smoke_baseline_b32k8", + "perturb": "both", + "rollouts": 8, + "seed": 20260605, + "sigmas": "0" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 32, + "sigmas": [ + 0.0 + ] +}
\ No newline at end of file diff --git a/research/flossing/initial_perturb_robustness/smoke_baseline_b32k8.summary.csv b/research/flossing/initial_perturb_robustness/smoke_baseline_b32k8.summary.csv new file mode 100644 index 0000000..e7ca899 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/smoke_baseline_b32k8.summary.csv @@ -0,0 +1,2 @@ +label,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,correct_count_q10,correct_count_q50,correct_count_q90,zero_frac,full_frac
+trm_baseline_b32k8_smoke,0.0,32,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.90625,0.9637345671653748,0.90625,0.90625,7.25,2.3318448066711426,8.0,8.0,8.0,0.09375,0.90625
diff --git a/research/flossing/initial_perturb_robustness/smoke_plots/initial_perturb_robustness_combined.csv b/research/flossing/initial_perturb_robustness/smoke_plots/initial_perturb_robustness_combined.csv new file mode 100644 index 0000000..f7c3ccc --- /dev/null +++ b/research/flossing/initial_perturb_robustness/smoke_plots/initial_perturb_robustness_combined.csv @@ -0,0 +1,4 @@ +label,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,correct_count_q10,correct_count_q50,correct_count_q90,zero_frac,full_frac
+trm_baseline_smoke,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9537037014961243,0.875,0.875,1.75,0.6614378094673157,1.0,2.0,2.0,0.125,0.875
+trm_baseline_smoke,0.001,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8125,0.9309413433074951,0.875,0.75,1.625,0.6959705352783203,0.5,2.0,2.0,0.125,0.75
+trm_baseline_b32k8_smoke,0.0,32,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.90625,0.9637345671653748,0.90625,0.90625,7.25,2.3318448066711426,8.0,8.0,8.0,0.09375,0.90625
diff --git a/research/flossing/initial_perturb_robustness/trm_baseline_best_step58590_n5000_k8.meta.json b/research/flossing/initial_perturb_robustness/trm_baseline_best_step58590_n5000_k8.meta.json new file mode 100644 index 0000000..06f0b35 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/trm_baseline_best_step58590_n5000_k8.meta.json @@ -0,0 +1,29 @@ +{ + "args": { + "batch_size": 32, + "ckpt_name": "step_58590", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro", + "label": "trm_baseline_best", + "n_samples": 5000, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/initial_perturb_robustness/trm_baseline_best_step58590_n5000_k8", + "perturb": "both", + "rollouts": 8, + "seed": 20260605, + "sigmas": "0,0.00003,0.0001,0.0003,0.001,0.003,0.01,0.03,0.1" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 5000, + "sigmas": [ + 0.0, + 3e-05, + 0.0001, + 0.0003, + 0.001, + 0.003, + 0.01, + 0.03, + 0.1 + ] +}
\ No newline at end of file diff --git a/research/flossing/initial_perturb_robustness/trm_baseline_best_step58590_n5000_k8.summary.csv b/research/flossing/initial_perturb_robustness/trm_baseline_best_step58590_n5000_k8.summary.csv new file mode 100644 index 0000000..94b9d4a --- /dev/null +++ b/research/flossing/initial_perturb_robustness/trm_baseline_best_step58590_n5000_k8.summary.csv @@ -0,0 +1,10 @@ +label,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,correct_count_q10,correct_count_q50,correct_count_q90,zero_frac,full_frac
+trm_baseline_best,0.0,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8658000230789185,0.9502986073493958,0.8658000230789185,0.8658000230789185,6.926400184631348,2.7269365787506104,0.0,8.0,8.0,0.13420000672340393,0.8658000230789185
+trm_baseline_best,3e-05,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.868399977684021,0.9510218501091003,0.9455999732017517,0.769599974155426,6.947199821472168,2.309980869293213,3.0,8.0,8.0,0.0544000007212162,0.769599974155426
+trm_baseline_best,0.0001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8675500154495239,0.9506199955940247,0.9485999941825867,0.769599974155426,6.940400123596191,2.3122386932373047,2.0,8.0,8.0,0.05139999836683273,0.769599974155426
+trm_baseline_best,0.0003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8685500025749207,0.9512001872062683,0.9476000070571899,0.7698000073432922,6.948400020599365,2.310267925262451,3.0,8.0,8.0,0.052400000393390656,0.7698000073432922
+trm_baseline_best,0.001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8694999814033508,0.9514188766479492,0.9488000273704529,0.7712000012397766,6.955999851226807,2.289206027984619,3.0,8.0,8.0,0.05119999870657921,0.7712000012397766
+trm_baseline_best,0.003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8676750063896179,0.950739860534668,0.9448000192642212,0.7666000127792358,6.941400051116943,2.311615467071533,3.0,8.0,8.0,0.0551999993622303,0.7666000127792358
+trm_baseline_best,0.01,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669000267982483,0.9505268335342407,0.9452000260353088,0.7635999917984009,6.935200214385986,2.3034324645996094,3.0,8.0,8.0,0.05480000004172325,0.7635999917984009
+trm_baseline_best,0.03,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8665000200271606,0.9502792954444885,0.9503999948501587,0.7573999762535095,6.932000160217285,2.2814416885375977,3.0,8.0,8.0,0.04960000142455101,0.7573999762535095
+trm_baseline_best,0.1,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8664500117301941,0.9503796696662903,0.9553999900817871,0.7558000087738037,6.931600093841553,2.2636523246765137,3.0,8.0,8.0,0.044599998742341995,0.7558000087738037
diff --git a/research/flossing/initial_perturb_robustness/trm_multi4_best_step35805_n5000_k8.meta.json b/research/flossing/initial_perturb_robustness/trm_multi4_best_step35805_n5000_k8.meta.json new file mode 100644 index 0000000..ec77dd2 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/trm_multi4_best_step35805_n5000_k8.meta.json @@ -0,0 +1,29 @@ +{ + "args": { + "batch_size": 32, + "ckpt_name": "step_35805", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro", + "label": "trm_multi4_best", + "n_samples": 5000, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/initial_perturb_robustness/trm_multi4_best_step35805_n5000_k8", + "perturb": "both", + "rollouts": 8, + "seed": 20260605, + "sigmas": "0,0.00003,0.0001,0.0003,0.001,0.003,0.01,0.03,0.1" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 5000, + "sigmas": [ + 0.0, + 3e-05, + 0.0001, + 0.0003, + 0.001, + 0.003, + 0.01, + 0.03, + 0.1 + ] +}
\ No newline at end of file diff --git a/research/flossing/initial_perturb_robustness/trm_multi4_best_step35805_n5000_k8.summary.csv b/research/flossing/initial_perturb_robustness/trm_multi4_best_step35805_n5000_k8.summary.csv new file mode 100644 index 0000000..0f352ca --- /dev/null +++ b/research/flossing/initial_perturb_robustness/trm_multi4_best_step35805_n5000_k8.summary.csv @@ -0,0 +1,10 @@ +label,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,correct_count_q10,correct_count_q50,correct_count_q90,zero_frac,full_frac
+trm_multi4_best,0.0,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8920000195503235,0.9585950970649719,0.8920000195503235,0.8920000195503235,7.136000156402588,2.4830431938171387,0.0,8.0,8.0,0.1080000028014183,0.8920000195503235
+trm_multi4_best,3e-05,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8941249847412109,0.9596672654151917,0.9577999711036682,0.8095999956130981,7.1529998779296875,2.0940847396850586,4.0,8.0,8.0,0.0421999990940094,0.8095999956130981
+trm_multi4_best,0.0001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8919249773025513,0.9588295817375183,0.9574000239372253,0.8040000200271606,7.13539981842041,2.107004165649414,4.0,8.0,8.0,0.04259999841451645,0.8040000200271606
+trm_multi4_best,0.0003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9595761895179749,0.9570000171661377,0.8118000030517578,7.1519999504089355,2.106584072113037,4.0,8.0,8.0,0.0430000014603138,0.8118000030517578
+trm_multi4_best,0.001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8934500217437744,0.9593700170516968,0.9535999894142151,0.8091999888420105,7.147600173950195,2.1107852458953857,4.0,8.0,8.0,0.04639999940991402,0.8091999888420105
+trm_multi4_best,0.003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8934749960899353,0.9593802690505981,0.9593999981880188,0.8101999759674072,7.147799968719482,2.1013221740722656,4.0,8.0,8.0,0.0406000018119812,0.8101999759674072
+trm_multi4_best,0.01,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8938249945640564,0.9593154191970825,0.9599999785423279,0.7993999719619751,7.150599956512451,2.065894365310669,4.0,8.0,8.0,0.03999999910593033,0.7993999719619751
+trm_multi4_best,0.03,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8929749727249146,0.9591527581214905,0.9606000185012817,0.7986000180244446,7.143799781799316,2.0669593811035156,4.0,8.0,8.0,0.039400000125169754,0.7986000180244446
+trm_multi4_best,0.1,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8917250037193298,0.9587355256080627,0.9639999866485596,0.79339998960495,7.133800029754639,2.062788724899292,4.0,8.0,8.0,0.035999998450279236,0.79339998960495
diff --git a/research/flossing/initial_perturb_robustness/trm_multi4_final_step65100_n5000_k8.meta.json b/research/flossing/initial_perturb_robustness/trm_multi4_final_step65100_n5000_k8.meta.json new file mode 100644 index 0000000..b4fd2a1 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/trm_multi4_final_step65100_n5000_k8.meta.json @@ -0,0 +1,29 @@ +{ + "args": { + "batch_size": 32, + "ckpt_name": "step_65100", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro", + "label": "trm_multi4_final", + "n_samples": 5000, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/initial_perturb_robustness/trm_multi4_final_step65100_n5000_k8", + "perturb": "both", + "rollouts": 8, + "seed": 20260605, + "sigmas": "0,0.00003,0.0001,0.0003,0.001,0.003,0.01,0.03,0.1" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 5000, + "sigmas": [ + 0.0, + 3e-05, + 0.0001, + 0.0003, + 0.001, + 0.003, + 0.01, + 0.03, + 0.1 + ] +}
\ No newline at end of file diff --git a/research/flossing/initial_perturb_robustness/trm_multi4_final_step65100_n5000_k8.summary.csv b/research/flossing/initial_perturb_robustness/trm_multi4_final_step65100_n5000_k8.summary.csv new file mode 100644 index 0000000..b4387d7 --- /dev/null +++ b/research/flossing/initial_perturb_robustness/trm_multi4_final_step65100_n5000_k8.summary.csv @@ -0,0 +1,10 @@ +label,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,correct_count_q10,correct_count_q50,correct_count_q90,zero_frac,full_frac
+trm_multi4_final,0.0,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8452000021934509,0.9396741390228271,0.8452000021934509,0.8452000021934509,6.761600017547607,2.8937113285064697,0.0,8.0,8.0,0.15479999780654907,0.8452000021934509
+trm_multi4_final,3e-05,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.840499997138977,0.9371709823608398,0.9114000201225281,0.7591999769210815,6.723999977111816,2.6249237060546875,1.0,8.0,8.0,0.08860000222921371,0.7591999769210815
+trm_multi4_final,0.0001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8407999873161316,0.9373589754104614,0.9136000275611877,0.7577999830245972,6.726399898529053,2.612344264984131,1.0,8.0,8.0,0.08640000224113464,0.7577999830245972
+trm_multi4_final,0.0003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8399249911308289,0.9371064305305481,0.9092000126838684,0.7572000026702881,6.719399929046631,2.6245501041412354,1.0,8.0,8.0,0.09080000221729279,0.7572000026702881
+trm_multi4_final,0.001,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8385249972343445,0.9361668229103088,0.9089999794960022,0.7588000297546387,6.708199977874756,2.6401236057281494,1.0,8.0,8.0,0.09099999815225601,0.7588000297546387
+trm_multi4_final,0.003,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8406999707221985,0.9372812509536743,0.9165999889373779,0.753000020980835,6.725599765777588,2.6047465801239014,1.0,8.0,8.0,0.08340000361204147,0.753000020980835
+trm_multi4_final,0.01,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.838100016117096,0.9362147450447083,0.9132000207901001,0.7477999925613403,6.704800128936768,2.612366199493408,1.0,8.0,8.0,0.0868000015616417,0.7477999925613403
+trm_multi4_final,0.03,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8380249738693237,0.9364882707595825,0.9197999835014343,0.7378000020980835,6.70419979095459,2.575946807861328,1.0,8.0,8.0,0.08020000159740448,0.7378000020980835
+trm_multi4_final,0.1,5000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.841949999332428,0.9376598596572876,0.9305999875068665,0.7305999994277954,6.735599994659424,2.5063304901123047,2.0,8.0,8.0,0.06939999759197235,0.7305999994277954
#!/usr/bin/env bash
# research/flossing/initial_perturb_robustness/watch_and_plot.sh
#
# Blocks until the three background runs whose PIDs are recorded under
# logs/<run>.pid have exited, then runs the plotting script over their
# summary CSVs and writes a snapshot of GPU status into the plots directory.
#
# Optional environment overrides (defaults are the original fixed values):
#   RRM_ROOT            repository root to cd into
#   RRM_PYTHON          python interpreter used to run the plotting script
#   WATCH_POLL_SECONDS  seconds to sleep between liveness checks
set -euo pipefail

readonly ROOT="${RRM_ROOT:-/home/yurenh2/rrm}"
readonly PY="${RRM_PYTHON:-/home/yurenh2/miniconda3/envs/rrm/bin/python}"
readonly POLL_SECONDS="${WATCH_POLL_SECONDS:-60}"
readonly OUT_BASE=research/flossing/initial_perturb_robustness
readonly PLOTS_DIR="${OUT_BASE}/plots"

# One entry per run; the PID file and the summary CSV are both derived from
# this prefix so the two lists cannot drift apart.
readonly RUNS=(
  trm_baseline_best_step58590_n5000_k8
  trm_multi4_best_step35805_n5000_k8
  trm_multi4_final_step65100_n5000_k8
)

# Print a message to stderr and abort.
die() {
  printf 'watch_and_plot: %s\n' "$*" >&2
  exit 1
}

#######################################
# Wait until the process named in a PID file is no longer alive.
# Arguments: $1 - path to a file containing a single numeric PID
# Outputs:   "watch ..." / "done ..." progress lines on stdout
# Returns:   0 once the process is gone; exits non-zero on a bad PID file
#######################################
wait_for_pidfile() {
  local pf=$1
  local pid

  [[ -r "${pf}" ]] || die "PID file missing or unreadable: ${pf}"
  pid=$(<"${pf}")
  pid=${pid//[[:space:]]/}
  # Without this check an empty or garbage PID makes `kill -0` fail at once
  # (stderr is silenced below), so the run would be reported as finished.
  [[ "${pid}" =~ ^[0-9]+$ ]] || die "PID file ${pf} does not contain a numeric PID"

  echo "watch ${pf}: ${pid}"
  # Signal 0 delivers nothing; it only checks that the PID can be signalled.
  while kill -0 "${pid}" 2>/dev/null; do
    sleep "${POLL_SECONDS}"
  done
  echo "done ${pf}: ${pid}"
}

main() {
  local run summary
  local -a summaries=()

  cd "${ROOT}" || die "cannot cd to ${ROOT}"

  for run in "${RUNS[@]}"; do
    wait_for_pidfile "${OUT_BASE}/logs/${run}.pid"
  done

  # A process having exited does not mean it succeeded: refuse to plot when a
  # run left no (or an empty) summary behind.
  for run in "${RUNS[@]}"; do
    summary="${OUT_BASE}/${run}.summary.csv"
    [[ -s "${summary}" ]] || die "summary missing or empty: ${summary}"
    summaries+=("${summary}")
  done

  # Ensure the redirect target below has a directory to land in.
  mkdir -p -- "${PLOTS_DIR}"

  "${PY}" research/flossing/plot_initial_perturb_robustness.py \
    --summaries "${summaries[@]}" \
    --out-dir "${PLOTS_DIR}"

  nvidia-smi \
    --query-gpu=index,memory.used,memory.total,utilization.gpu \
    --format=csv,noheader,nounits \
    > "${PLOTS_DIR}/final_gpu_status.txt"
}

main "$@"
