From 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Sat, 13 Jun 2026 12:35:36 -0500 Subject: rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 --- .../plots/late_perturb_robustness_combined.csv | 91 ++++++++++++++++++++++ .../smoke_baseline.meta.json | 27 +++++++ .../smoke_baseline.summary.csv | 5 ++ .../smoke_baseline_expandedclean.meta.json | 27 +++++++ .../smoke_baseline_expandedclean.summary.csv | 5 ++ .../smoke_baseline_fastclean.meta.json | 27 +++++++ .../smoke_baseline_fastclean.summary.csv | 5 ++ .../late_perturb_robustness_combined.csv | 5 ++ ...baseline_best_step58590_n3000_k8_late.meta.json | 34 ++++++++ ...seline_best_step58590_n3000_k8_late.summary.csv | 31 ++++++++ ...m_multi4_best_step35805_n3000_k8_late.meta.json | 34 ++++++++ ...multi4_best_step35805_n3000_k8_late.summary.csv | 31 ++++++++ ..._multi4_final_step65100_n3000_k8_late.meta.json | 34 ++++++++ ...ulti4_final_step65100_n3000_k8_late.summary.csv | 31 ++++++++ .../late_perturb_robustness/watch_and_plot.sh | 32 ++++++++ 15 files changed, 419 insertions(+) create mode 100644 research/flossing/late_perturb_robustness/plots/late_perturb_robustness_combined.csv create mode 100644 research/flossing/late_perturb_robustness/smoke_baseline.meta.json create mode 100644 research/flossing/late_perturb_robustness/smoke_baseline.summary.csv create mode 100644 research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.meta.json create mode 100644 research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.summary.csv create mode 100644 research/flossing/late_perturb_robustness/smoke_baseline_fastclean.meta.json create mode 100644 research/flossing/late_perturb_robustness/smoke_baseline_fastclean.summary.csv create mode 100644 research/flossing/late_perturb_robustness/smoke_plots/late_perturb_robustness_combined.csv create mode 100644 research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.meta.json create mode 100644 research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.summary.csv create mode 100644 research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.meta.json create mode 100644 research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.summary.csv create mode 100644 research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.meta.json create mode 100644 research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.summary.csv create mode 100755 research/flossing/late_perturb_robustness/watch_and_plot.sh (limited to 'research/flossing/late_perturb_robustness') diff --git a/research/flossing/late_perturb_robustness/plots/late_perturb_robustness_combined.csv b/research/flossing/late_perturb_robustness/plots/late_perturb_robustness_combined.csv new file mode 100644 index 0000000..144b4c5 --- /dev/null +++ b/research/flossing/late_perturb_robustness/plots/late_perturb_robustness_combined.csv @@ -0,0 +1,91 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +trm_baseline_best,0,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,0,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9504892826080322,0.9483333230018616,0.7639999985694885,6.935999870300293,2.303454875946045,0.05166666582226753,0.7639999985694885,0.8669999837875366,0.9578046798706055,0.875048041343689,0.2750626504421234,0.6315789222717285 +trm_baseline_best,0,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8684166669845581,0.9509722590446472,0.9456666707992554,0.7683333158493042,6.947333335876465,2.3008460998535156,0.05433333292603493,0.7683333158493042,0.8669999837875366,0.9602556824684143,0.8800461292266846,0.2697368562221527,0.6090225577354431 +trm_baseline_best,0,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9504145383834839,0.9463333487510681,0.765333354473114,6.935999870300293,2.3080809116363525,0.05366666615009308,0.765333354473114,0.8669999837875366,0.9579008221626282,0.8785082697868347,0.2744360864162445,0.6190476417541504 +trm_baseline_best,0,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8675000071525574,0.9505621790885925,0.9496666789054871,0.7616666555404663,6.940000057220459,2.2861320972442627,0.050333332270383835,0.7616666555404663,0.8669999837875366,0.9567474126815796,0.8723567724227905,0.2857142984867096,0.6390977501869202 +trm_baseline_best,0,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8682083487510681,0.9508457183837891,0.9506666660308838,0.7593333125114441,6.945666790008545,2.265108108520508,0.04933333396911621,0.7593333125114441,0.8669999837875366,0.956122636795044,0.8696655035018921,0.2951127886772156,0.6441102623939514 +trm_baseline_best,4,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,4,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8666250109672546,0.9501558542251587,0.9350000023841858,0.8009999990463257,6.933000087738037,2.4135406017303467,0.06499999761581421,0.8009999990463257,0.8669999837875366,0.9687620401382446,0.9211841821670532,0.20081453025341034,0.5263158082962036 +trm_baseline_best,4,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8679583072662354,0.950590968132019,0.9359999895095825,0.7996666431427002,6.943666458129883,2.404542922973633,0.06400000303983688,0.7996666431427002,0.8669999837875366,0.970588207244873,0.9200307726860046,0.19893483817577362,0.5338345766067505 +trm_baseline_best,4,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8680416941642761,0.9508261680603027,0.9399999976158142,0.8013333082199097,6.944333553314209,2.3910739421844482,0.05999999865889549,0.8013333082199097,0.8669999837875366,0.9685697555541992,0.9227220416069031,0.21271929144859314,0.5664160251617432 +trm_baseline_best,4,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8660833239555359,0.9499126076698303,0.934333324432373,0.7953333258628845,6.928666591644287,2.413485288619995,0.06566666811704636,0.7953333258628845,0.8669999837875366,0.967608630657196,0.915032684803009,0.20426064729690552,0.523809552192688 +trm_baseline_best,4,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8659999966621399,0.9500890970230103,0.9403333067893982,0.7973333597183228,6.927999973297119,2.399336576461792,0.05966666713356972,0.7973333597183228,0.8669999837875366,0.9667435884475708,0.9161860942840576,0.20927318930625916,0.5664160251617432 +trm_baseline_best,8,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,8,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669583201408386,0.9507402181625366,0.9226666688919067,0.8306666612625122,6.935666561126709,2.5147950649261475,0.07733333110809326,0.8306666612625122,0.8669999837875366,0.9784217476844788,0.9573240876197815,0.14035087823867798,0.4436090290546417 +trm_baseline_best,8,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8678333163261414,0.9507989287376404,0.9246666431427002,0.831333339214325,6.942666530609131,2.5057361125946045,0.07533333450555801,0.831333339214325,0.8669999837875366,0.9795271158218384,0.9588619470596313,0.13972431421279907,0.451127827167511 +trm_baseline_best,8,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8676666617393494,0.9505447745323181,0.9229999780654907,0.8306666612625122,6.941333293914795,2.51287841796875,0.07699999958276749,0.8306666612625122,0.8669999837875366,0.980199933052063,0.9573240876197815,0.13408520817756653,0.4411027431488037 +trm_baseline_best,8,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8682083487510681,0.950954258441925,0.9236666560173035,0.8286666870117188,6.945666790008545,2.4972081184387207,0.07633333653211594,0.8286666870117188,0.8669999837875366,0.9788542985916138,0.9554017782211304,0.14692983031272888,0.448621541261673 +trm_baseline_best,8,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8687083125114441,0.9511321783065796,0.9286666512489319,0.824999988079071,6.949666500091553,2.46991229057312,0.07133333384990692,0.824999988079071,0.8669999837875366,0.9784698486328125,0.9504036903381348,0.15319548547267914,0.48120301961898804 +trm_baseline_best,12,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,12,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8673333525657654,0.9506542682647705,0.8926666378974915,0.856333315372467,6.938666820526123,2.641887903213501,0.10733333230018616,0.856333315372467,0.8669999837875366,0.9928392767906189,0.9873125553131104,0.049185462296009064,0.2005012482404709 +trm_baseline_best,12,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8668749928474426,0.9505452513694763,0.8896666765213013,0.8560000061988831,6.934999942779541,2.651435136795044,0.11033333092927933,0.8560000061988831,0.8669999837875366,0.9924548268318176,0.9869281053543091,0.048245612531900406,0.18045112490653992 +trm_baseline_best,12,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669583201408386,0.9506661891937256,0.8913333415985107,0.8556666374206543,6.935666561126709,2.6479289531707764,0.10866666585206985,0.8556666374206543,0.8669999837875366,0.991782009601593,0.9869281053543091,0.05325814709067345,0.19548872113227844 +trm_baseline_best,12,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8671666383743286,0.9506459832191467,0.8989999890327454,0.8539999723434448,6.937333106994629,2.629715919494629,0.10100000351667404,0.8539999723434448,0.8669999837875366,0.991733968257904,0.985005795955658,0.055137842893600464,0.24561403691768646 +trm_baseline_best,12,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8677916526794434,0.950908362865448,0.9043333530426025,0.8516666889190674,6.942333221435547,2.602372169494629,0.09566666930913925,0.8516666889190674,0.8669999837875366,0.9901480078697205,0.9823144674301147,0.07017543911933899,0.2882205545902252 +trm_baseline_best,15,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9506445527076721,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9506717920303345,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9506852030754089,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8670833110809326,0.9507201910018921,0.8673333525657654,0.8669999837875366,6.936666488647461,2.7151405811309814,0.1326666623353958,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0006265664123930037,0.002506265649572015 +trm_baseline_best,15,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8670833110809326,0.9508404731750488,0.8686666488647461,0.8656666874885559,6.936666488647461,2.7110862731933594,0.1313333362340927,0.8656666874885559,0.8669999837875366,0.9997597336769104,0.9984621405601501,0.0021929824724793434,0.01253132801502943 +trm_multi4_best,0,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,0,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8957083225250244,0.9601898789405823,0.9583333134651184,0.8119999766349792,7.165666580200195,2.081879138946533,0.0416666679084301,0.8119999766349792,0.8939999938011169,0.9684004187583923,0.9030573964118958,0.28262579441070557,0.6194968819618225 +trm_multi4_best,0,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8957916498184204,0.9602490067481995,0.9606666564941406,0.8130000233650208,7.166333198547363,2.062037706375122,0.03933333232998848,0.8130000233650208,0.8939999938011169,0.9666293859481812,0.9038031101226807,0.2983490526676178,0.6352201104164124 +trm_multi4_best,0,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8947083353996277,0.9597618579864502,0.9599999785423279,0.8063333630561829,7.1576666831970215,2.0646567344665527,0.03999999910593033,0.8063333630561829,0.8939999938011169,0.9667226076126099,0.8963460326194763,0.28734275698661804,0.6320754885673523 +trm_multi4_best,0,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8953750133514404,0.9600813984870911,0.9603333473205566,0.8009999990463257,7.163000106811523,2.049495220184326,0.03966666758060455,0.8009999990463257,0.8939999938011169,0.9666759967803955,0.8918717503547668,0.29402515292167664,0.6446540951728821 +trm_multi4_best,0,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8950416445732117,0.9601003527641296,0.9629999995231628,0.7983333468437195,7.160333156585693,2.0425376892089844,0.03700000047683716,0.7983333468437195,0.8939999938011169,0.9635066986083984,0.8851603269577026,0.3176100552082062,0.6666666865348816 +trm_multi4_best,4,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,4,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8946250081062317,0.9599500894546509,0.9473333358764648,0.8433333039283752,7.1570000648498535,2.1906659603118896,0.052666667848825455,0.8433333039283752,0.8939999938011169,0.9753448963165283,0.9392244815826416,0.21383647620677948,0.5251572132110596 +trm_multi4_best,4,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8962500095367432,0.9606265425682068,0.9523333311080933,0.843999981880188,7.170000076293945,2.1701996326446533,0.04766666516661644,0.843999981880188,0.8939999938011169,0.976416826248169,0.9410887360572815,0.22012577950954437,0.5691823959350586 +trm_multi4_best,4,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8963750004768372,0.9604902863502502,0.9483333230018616,0.8446666598320007,7.171000003814697,2.171119213104248,0.05166666582226753,0.8446666598320007,0.8939999938011169,0.9760906100273132,0.9410887360572815,0.22405660152435303,0.5314465165138245 +trm_multi4_best,4,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8968333601951599,0.9606810808181763,0.9506666660308838,0.8403333425521851,7.174666881561279,2.1587400436401367,0.04933333396911621,0.8403333425521851,0.8939999938011169,0.9759973883628845,0.9377330541610718,0.2291666716337204,0.544025182723999 +trm_multi4_best,4,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8944583535194397,0.9597752094268799,0.9526666402816772,0.8326666951179504,7.155666828155518,2.1656641960144043,0.047333333641290665,0.8326666951179504,0.8939999938011169,0.9737602472305298,0.9284116625785828,0.22562892735004425,0.5628930926322937 +trm_multi4_best,8,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,8,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8962083458900452,0.9605055451393127,0.9399999976158142,0.8696666955947876,7.169666767120361,2.2701423168182373,0.05999999865889549,0.8696666955947876,0.8939999938011169,0.9840604066848755,0.9709172248840332,0.15526729822158813,0.45597484707832336 +trm_multi4_best,8,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8970000147819519,0.9608615636825562,0.9416666626930237,0.8696666955947876,7.176000118255615,2.254556179046631,0.05833333358168602,0.8696666955947876,0.8939999938011169,0.9844798445701599,0.9709172248840332,0.1591981202363968,0.46855345368385315 +trm_multi4_best,8,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8955416679382324,0.9602766633033752,0.9399999976158142,0.8673333525657654,7.164333343505859,2.2744951248168945,0.05999999865889549,0.8673333525657654,0.8939999938011169,0.9835943579673767,0.9690529704093933,0.1529088020324707,0.45597484707832336 +trm_multi4_best,8,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8949999809265137,0.9600179195404053,0.9436666369438171,0.8650000095367432,7.159999847412109,2.2588493824005127,0.056333333253860474,0.8650000095367432,0.8939999938011169,0.9836875200271606,0.9668158292770386,0.14701257646083832,0.47484275698661804 +trm_multi4_best,8,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8958333134651184,0.9602567553520203,0.9449999928474426,0.8643333315849304,7.166666507720947,2.251863479614258,0.054999999701976776,0.8643333315849304,0.8939999938011169,0.9831282496452332,0.9656972289085388,0.1595911979675293,0.49685534834861755 +trm_multi4_best,12,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,12,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8956666588783264,0.9604059457778931,0.9129999876022339,0.887333333492279,7.165333271026611,2.3898391723632812,0.08699999749660492,0.887333333492279,0.8939999938011169,0.9954324960708618,0.9925428628921509,0.05424528196454048,0.19182389974594116 +trm_multi4_best,12,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8966249823570251,0.9608678221702576,0.9193333387374878,0.887333333492279,7.172999858856201,2.3672215938568115,0.0806666687130928,0.887333333492279,0.8939999938011169,0.995712161064148,0.9925428628921509,0.06092767417430878,0.24528302252292633 +trm_multi4_best,12,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8974583148956299,0.9608935713768005,0.9213333129882812,0.887666642665863,7.179666519165039,2.3568453788757324,0.07866666465997696,0.887666642665863,0.8939999938011169,0.9956655502319336,0.9925428628921509,0.069182388484478,0.26729559898376465 +trm_multi4_best,12,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8966249823570251,0.9606748819351196,0.9223333597183228,0.8870000243186951,7.172999858856201,2.3598878383636475,0.07766667008399963,0.8870000243186951,0.8939999938011169,0.9950596690177917,0.9921700358390808,0.06643081456422806,0.276729553937912 +trm_multi4_best,12,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8972083330154419,0.9610205292701721,0.9259999990463257,0.8849999904632568,7.177666664123535,2.338397264480591,0.07400000095367432,0.8849999904632568,0.8939999938011169,0.9935682415962219,0.9899328947067261,0.08451257646083832,0.31446540355682373 +trm_multi4_best,15,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9598419666290283,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9597874879837036,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9598518013954163,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8941249847412109,0.9598509073257446,0.8946666717529297,0.8939999938011169,7.1529998779296875,2.4601335525512695,0.10533333569765091,0.8939999938011169,0.8939999938011169,1.0,1.0,0.001179245300590992,0.006289307959377766 +trm_multi4_best,15,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8942499756813049,0.9599588513374329,0.8956666588783264,0.893666684627533,7.1539998054504395,2.456070899963379,0.10433333367109299,0.893666684627533,0.8939999938011169,0.9999067783355713,0.9996271729469299,0.003144653979688883,0.015723271295428276 +trm_multi4_final,0,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,0,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8319166898727417,0.9341394901275635,0.9086666703224182,0.7463333606719971,6.655333518981934,2.654029369354248,0.09133332967758179,0.7463333606719971,0.8286666870117188,0.9579645991325378,0.8942075371742249,0.22227627038955688,0.4902723729610443 +trm_multi4_final,0,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8315416574478149,0.93404221534729,0.9143333435058594,0.7419999837875366,6.6523332595825195,2.6464054584503174,0.08566666394472122,0.7419999837875366,0.8286666870117188,0.9564561247825623,0.8897827863693237,0.22738327085971832,0.5252918004989624 +trm_multi4_final,0,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8301666378974915,0.9333322644233704,0.9143333435058594,0.7356666922569275,6.641333103179932,2.639954090118408,0.08566666394472122,0.7356666922569275,0.8286666870117188,0.9536906480789185,0.879324197769165,0.23273345828056335,0.5272373557090759 +trm_multi4_final,0,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8319166898727417,0.934055507183075,0.9213333129882812,0.7239999771118164,6.655333518981934,2.594970464706421,0.07866666465997696,0.7239999771118164,0.8286666870117188,0.9528358578681946,0.8688656687736511,0.2470817118883133,0.5661478638648987 +trm_multi4_final,0,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8354583382606506,0.9354007244110107,0.9269999861717224,0.7246666550636292,6.683666706085205,2.5519142150878906,0.0729999989271164,0.7246666550636292,0.8286666870117188,0.9477574229240417,0.861625075340271,0.29231518507003784,0.5992217659950256 +trm_multi4_final,4,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,4,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8335833549499512,0.9348888397216797,0.8993333578109741,0.7756666541099548,6.668666839599609,2.732560157775879,0.10066666454076767,0.7756666541099548,0.8286666870117188,0.970585286617279,0.9336283206939697,0.17096303403377533,0.43968871235847473 +trm_multi4_final,4,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8352500200271606,0.9354547262191772,0.9020000100135803,0.7749999761581421,6.682000160217285,2.7150585651397705,0.09799999743700027,0.7749999761581421,0.8286666870117188,0.971238911151886,0.9328238368034363,0.17752918601036072,0.4494163393974304 +trm_multi4_final,4,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8327916860580444,0.9343785643577576,0.8989999890327454,0.7720000147819519,6.6623334884643555,2.7240254878997803,0.10100000351667404,0.7720000147819519,0.8286666870117188,0.9690265655517578,0.9292035102844238,0.17388132214546204,0.4319066107273102 +trm_multi4_final,4,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8316666483879089,0.9340242743492126,0.8986666798591614,0.7730000019073486,6.6533331871032715,2.7423510551452637,0.10133333504199982,0.7730000019073486,0.8286666870117188,0.9677695035934448,0.9267899990081787,0.17339494824409485,0.4319066107273102 +trm_multi4_final,4,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8324583172798157,0.9344562888145447,0.8989999890327454,0.7680000066757202,6.659666538238525,2.7232038974761963,0.10100000351667404,0.7680000066757202,0.8286666870117188,0.9691271185874939,0.9239742755889893,0.1714494228363037,0.4319066107273102 +trm_multi4_final,8,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,8,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8332499861717224,0.934550940990448,0.8849999904632568,0.8013333082199097,6.665999889373779,2.8155124187469482,0.11500000208616257,0.8013333082199097,0.8286666870117188,0.9811444282531738,0.9662107825279236,0.11794747412204742,0.344357967376709 +trm_multi4_final,8,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8323749899864197,0.9341363906860352,0.8830000162124634,0.8016666769981384,6.658999919891357,2.8219470977783203,0.11699999868869781,0.8016666769981384,0.8286666870117188,0.9805410504341125,0.967015266418457,0.1157587543129921,0.3346303403377533 +trm_multi4_final,8,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8352916836738586,0.9354629516601562,0.8856666684150696,0.8016666769981384,6.682333469390869,2.7959651947021484,0.11433333158493042,0.8016666769981384,0.8286666870117188,0.9822506308555603,0.9666130542755127,0.12451361864805222,0.344357967376709 +trm_multi4_final,8,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8335000276565552,0.9347721338272095,0.890333354473114,0.7993333339691162,6.668000221252441,2.803647994995117,0.10966666787862778,0.7993333339691162,0.8286666870117188,0.9798873662948608,0.9625905156135559,0.1254863739013672,0.3696497976779938 +trm_multi4_final,8,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8331666588783264,0.9347808957099915,0.8916666507720947,0.7960000038146973,6.665333271026611,2.7949953079223633,0.10833333432674408,0.7960000038146973,0.8286666870117188,0.9781777858734131,0.9593724608421326,0.13180933892726898,0.38910505175590515 +trm_multi4_final,12,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,12,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8307916522026062,0.9338425993919373,0.8569999933242798,0.8209999799728394,6.64633321762085,2.936537742614746,0.14300000667572021,0.8209999799728394,0.8286666870117188,0.9936142563819885,0.9907481670379639,0.043287936598062515,0.1750972718000412 +trm_multi4_final,12,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8307499885559082,0.9337176084518433,0.8603333234786987,0.8209999799728394,6.645999908447266,2.9328060150146484,0.13966666162014008,0.8209999799728394,0.8286666870117188,0.9935137033462524,0.9907481670379639,0.04353112727403641,0.18871594965457916 +trm_multi4_final,12,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8307499885559082,0.9339007139205933,0.8583333492279053,0.8203333616256714,6.645999908447266,2.93530535697937,0.14166666567325592,0.8203333616256714,0.8286666870117188,0.993463397026062,0.9899436831474304,0.0437743179500103,0.17898832261562347 +trm_multi4_final,12,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.831166684627533,0.9338744282722473,0.8616666793823242,0.8190000057220459,6.649333477020264,2.921477794647217,0.13833333551883698,0.8190000057220459,0.8286666870117188,0.9930108785629272,0.9883346557617188,0.04839494079351425,0.20428015291690826 +trm_multi4_final,12,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8309583067893982,0.9341686964035034,0.8683333396911621,0.8149999976158142,6.6476664543151855,2.9015274047851562,0.1316666603088379,0.8149999976158142,0.8286666870117188,0.9908487796783447,0.9835076332092285,0.057636186480522156,0.24319066107273102 +trm_multi4_final,15,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,15,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9330503940582275,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,15,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8287083506584167,0.9331271052360535,0.8289999961853027,0.8286666870117188,6.629666805267334,3.0137219429016113,0.17100000381469727,0.8286666870117188,0.8286666870117188,1.0,1.0,0.00024319066142197698,0.0019455252913758159 +trm_multi4_final,15,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286250233650208,0.9330225586891174,0.8289999961853027,0.828000009059906,6.629000186920166,3.0135293006896973,0.17100000381469727,0.828000009059906,0.8286666870117188,0.9998994469642639,0.9991955161094666,0.00024319066142197698,0.0019455252913758159 +trm_multi4_final,15,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8287500143051147,0.9331192970275879,0.8306666612625122,0.8276666402816772,6.630000114440918,3.009169340133667,0.1693333387374878,0.8276666402816772,0.8286666870117188,0.999698281288147,0.9987932443618774,0.0019455252913758159,0.011673151515424252 +trm_multi4_final,15,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8292916417121887,0.9331137537956238,0.8330000042915344,0.8276666402816772,6.63433313369751,3.0008811950683594,0.16699999570846558,0.8276666402816772,0.8286666870117188,0.9995977282524109,0.9987932443618774,0.0055933850817382336,0.02529182843863964 diff --git a/research/flossing/late_perturb_robustness/smoke_baseline.meta.json b/research/flossing/late_perturb_robustness/smoke_baseline.meta.json new file mode 100644 index 0000000..3ceb2d3 --- /dev/null +++ b/research/flossing/late_perturb_robustness/smoke_baseline.meta.json @@ -0,0 +1,27 @@ +{ + "args": { + "batch_size": 8, + "ckpt_name": "step_58590", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro", + "label": "smoke_baseline", + "n_samples": 16, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/late_perturb_robustness/smoke_baseline", + "perturb": "both", + "perturb_afters": "0,8", + "rollouts": 2, + "seed": 20260606, + "sigmas": "0,0.01" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 16, + "perturb_afters": [ + 0, + 8 + ], + "sigmas": [ + 0.0, + 0.01 + ] +} \ No newline at end of file diff --git a/research/flossing/late_perturb_robustness/smoke_baseline.summary.csv b/research/flossing/late_perturb_robustness/smoke_baseline.summary.csv new file mode 100644 index 0000000..ec4de26 --- /dev/null +++ b/research/flossing/late_perturb_robustness/smoke_baseline.summary.csv @@ -0,0 +1,5 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +smoke_baseline,0,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.875,1.0,1.0,0.0,0.0 +smoke_baseline,0,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.9375,0.9834104776382446,1.0,0.875,1.875,0.33071890473365784,0.0,0.875,0.875,0.9642857313156128,0.9285714030265808,0.75,1.0 +smoke_baseline,8,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.875,1.0,1.0,0.0,0.0 +smoke_baseline,8,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.90625,0.96875,0.9375,0.875,1.8125,0.5266343355178833,0.0625,0.875,0.875,1.0,1.0,0.25,0.5 diff --git a/research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.meta.json b/research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.meta.json new file mode 100644 index 0000000..4f308b6 --- /dev/null +++ b/research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.meta.json @@ -0,0 +1,27 @@ +{ + "args": { + "batch_size": 8, + "ckpt_name": "step_58590", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro", + "label": "smoke_baseline_expandedclean", + "n_samples": 16, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/late_perturb_robustness/smoke_baseline_expandedclean", + "perturb": "both", + "perturb_afters": "0,8", + "rollouts": 2, + "seed": 20260606, + "sigmas": "0,0.01" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 16, + "perturb_afters": [ + 0, + 8 + ], + "sigmas": [ + 0.0, + 0.01 + ] +} \ No newline at end of file diff --git a/research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.summary.csv b/research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.summary.csv new file mode 100644 index 0000000..ffe22cf --- /dev/null +++ b/research/flossing/late_perturb_robustness/smoke_baseline_expandedclean.summary.csv @@ -0,0 +1,5 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +smoke_baseline_expandedclean,0,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.875,1.0,1.0,0.0,0.0 +smoke_baseline_expandedclean,0,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.9375,0.9834104776382446,1.0,0.875,1.875,0.33071890473365784,0.0,0.875,0.875,0.9642857313156128,0.9285714030265808,0.75,1.0 +smoke_baseline_expandedclean,8,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.875,1.0,1.0,0.0,0.0 +smoke_baseline_expandedclean,8,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.90625,0.96875,0.9375,0.875,1.8125,0.5266343355178833,0.0625,0.875,0.875,1.0,1.0,0.25,0.5 diff --git a/research/flossing/late_perturb_robustness/smoke_baseline_fastclean.meta.json b/research/flossing/late_perturb_robustness/smoke_baseline_fastclean.meta.json new file mode 100644 index 0000000..9665371 --- /dev/null +++ b/research/flossing/late_perturb_robustness/smoke_baseline_fastclean.meta.json @@ -0,0 +1,27 @@ +{ + "args": { + "batch_size": 8, + "ckpt_name": "step_58590", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro", + "label": "smoke_baseline_fastclean", + "n_samples": 16, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/late_perturb_robustness/smoke_baseline_fastclean", + "perturb": "both", + "perturb_afters": "0,8", + "rollouts": 2, + "seed": 20260606, + "sigmas": "0,0.01" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 16, + "perturb_afters": [ + 0, + 8 + ], + "sigmas": [ + 0.0, + 0.01 + ] +} \ No newline at end of file diff --git a/research/flossing/late_perturb_robustness/smoke_baseline_fastclean.summary.csv b/research/flossing/late_perturb_robustness/smoke_baseline_fastclean.summary.csv new file mode 100644 index 0000000..fb4e3a6 --- /dev/null +++ b/research/flossing/late_perturb_robustness/smoke_baseline_fastclean.summary.csv @@ -0,0 +1,5 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +smoke_baseline_fastclean,0,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.9375,0.9333333373069763,0.9333333373069763,0.0,0.0 +smoke_baseline_fastclean,0,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.9375,0.9834104776382446,1.0,0.875,1.875,0.33071890473365784,0.0,0.875,0.9375,0.9333333373069763,0.8666666746139526,1.0,1.0 +smoke_baseline_fastclean,8,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.9375,0.9333333373069763,0.9333333373069763,0.0,0.0 +smoke_baseline_fastclean,8,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.90625,0.96875,0.9375,0.875,1.8125,0.5266343355178833,0.0625,0.875,0.9375,0.9666666388511658,0.9333333373069763,0.0,0.0 diff --git a/research/flossing/late_perturb_robustness/smoke_plots/late_perturb_robustness_combined.csv b/research/flossing/late_perturb_robustness/smoke_plots/late_perturb_robustness_combined.csv new file mode 100644 index 0000000..ec4de26 --- /dev/null +++ b/research/flossing/late_perturb_robustness/smoke_plots/late_perturb_robustness_combined.csv @@ -0,0 +1,5 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +smoke_baseline,0,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.875,1.0,1.0,0.0,0.0 +smoke_baseline,0,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.9375,0.9834104776382446,1.0,0.875,1.875,0.33071890473365784,0.0,0.875,0.875,0.9642857313156128,0.9285714030265808,0.75,1.0 +smoke_baseline,8,0.0,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.875,0.9614197611808777,0.875,0.875,1.75,0.6614378094673157,0.125,0.875,0.875,1.0,1.0,0.0,0.0 +smoke_baseline,8,0.01,16,2,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.90625,0.96875,0.9375,0.875,1.8125,0.5266343355178833,0.0625,0.875,0.875,1.0,1.0,0.25,0.5 diff --git a/research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.meta.json b/research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.meta.json new file mode 100644 index 0000000..6911972 --- /dev/null +++ b/research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.meta.json @@ -0,0 +1,34 @@ +{ + "args": { + "batch_size": 32, + "ckpt_name": "step_58590", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro", + "label": "trm_baseline_best", + "n_samples": 3000, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late", + "perturb": "both", + "perturb_afters": "0,4,8,12,15", + "rollouts": 8, + "seed": 20260606, + "sigmas": "0,0.001,0.003,0.01,0.03,0.1" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 3000, + "perturb_afters": [ + 0, + 4, + 8, + 12, + 15 + ], + "sigmas": [ + 0.0, + 0.001, + 0.003, + 0.01, + 0.03, + 0.1 + ] +} \ No newline at end of file diff --git a/research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.summary.csv b/research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.summary.csv new file mode 100644 index 0000000..9f376be --- /dev/null +++ b/research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.summary.csv @@ -0,0 +1,31 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +trm_baseline_best,0,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,0,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9504892826080322,0.9483333230018616,0.7639999985694885,6.935999870300293,2.303454875946045,0.05166666582226753,0.7639999985694885,0.8669999837875366,0.9578046798706055,0.875048041343689,0.2750626504421234,0.6315789222717285 +trm_baseline_best,0,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8684166669845581,0.9509722590446472,0.9456666707992554,0.7683333158493042,6.947333335876465,2.3008460998535156,0.05433333292603493,0.7683333158493042,0.8669999837875366,0.9602556824684143,0.8800461292266846,0.2697368562221527,0.6090225577354431 +trm_baseline_best,0,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9504145383834839,0.9463333487510681,0.765333354473114,6.935999870300293,2.3080809116363525,0.05366666615009308,0.765333354473114,0.8669999837875366,0.9579008221626282,0.8785082697868347,0.2744360864162445,0.6190476417541504 +trm_baseline_best,0,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8675000071525574,0.9505621790885925,0.9496666789054871,0.7616666555404663,6.940000057220459,2.2861320972442627,0.050333332270383835,0.7616666555404663,0.8669999837875366,0.9567474126815796,0.8723567724227905,0.2857142984867096,0.6390977501869202 +trm_baseline_best,0,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8682083487510681,0.9508457183837891,0.9506666660308838,0.7593333125114441,6.945666790008545,2.265108108520508,0.04933333396911621,0.7593333125114441,0.8669999837875366,0.956122636795044,0.8696655035018921,0.2951127886772156,0.6441102623939514 +trm_baseline_best,4,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,4,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8666250109672546,0.9501558542251587,0.9350000023841858,0.8009999990463257,6.933000087738037,2.4135406017303467,0.06499999761581421,0.8009999990463257,0.8669999837875366,0.9687620401382446,0.9211841821670532,0.20081453025341034,0.5263158082962036 +trm_baseline_best,4,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8679583072662354,0.950590968132019,0.9359999895095825,0.7996666431427002,6.943666458129883,2.404542922973633,0.06400000303983688,0.7996666431427002,0.8669999837875366,0.970588207244873,0.9200307726860046,0.19893483817577362,0.5338345766067505 +trm_baseline_best,4,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8680416941642761,0.9508261680603027,0.9399999976158142,0.8013333082199097,6.944333553314209,2.3910739421844482,0.05999999865889549,0.8013333082199097,0.8669999837875366,0.9685697555541992,0.9227220416069031,0.21271929144859314,0.5664160251617432 +trm_baseline_best,4,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8660833239555359,0.9499126076698303,0.934333324432373,0.7953333258628845,6.928666591644287,2.413485288619995,0.06566666811704636,0.7953333258628845,0.8669999837875366,0.967608630657196,0.915032684803009,0.20426064729690552,0.523809552192688 +trm_baseline_best,4,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8659999966621399,0.9500890970230103,0.9403333067893982,0.7973333597183228,6.927999973297119,2.399336576461792,0.05966666713356972,0.7973333597183228,0.8669999837875366,0.9667435884475708,0.9161860942840576,0.20927318930625916,0.5664160251617432 +trm_baseline_best,8,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,8,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669583201408386,0.9507402181625366,0.9226666688919067,0.8306666612625122,6.935666561126709,2.5147950649261475,0.07733333110809326,0.8306666612625122,0.8669999837875366,0.9784217476844788,0.9573240876197815,0.14035087823867798,0.4436090290546417 +trm_baseline_best,8,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8678333163261414,0.9507989287376404,0.9246666431427002,0.831333339214325,6.942666530609131,2.5057361125946045,0.07533333450555801,0.831333339214325,0.8669999837875366,0.9795271158218384,0.9588619470596313,0.13972431421279907,0.451127827167511 +trm_baseline_best,8,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8676666617393494,0.9505447745323181,0.9229999780654907,0.8306666612625122,6.941333293914795,2.51287841796875,0.07699999958276749,0.8306666612625122,0.8669999837875366,0.980199933052063,0.9573240876197815,0.13408520817756653,0.4411027431488037 +trm_baseline_best,8,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8682083487510681,0.950954258441925,0.9236666560173035,0.8286666870117188,6.945666790008545,2.4972081184387207,0.07633333653211594,0.8286666870117188,0.8669999837875366,0.9788542985916138,0.9554017782211304,0.14692983031272888,0.448621541261673 +trm_baseline_best,8,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8687083125114441,0.9511321783065796,0.9286666512489319,0.824999988079071,6.949666500091553,2.46991229057312,0.07133333384990692,0.824999988079071,0.8669999837875366,0.9784698486328125,0.9504036903381348,0.15319548547267914,0.48120301961898804 +trm_baseline_best,12,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,12,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8673333525657654,0.9506542682647705,0.8926666378974915,0.856333315372467,6.938666820526123,2.641887903213501,0.10733333230018616,0.856333315372467,0.8669999837875366,0.9928392767906189,0.9873125553131104,0.049185462296009064,0.2005012482404709 +trm_baseline_best,12,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8668749928474426,0.9505452513694763,0.8896666765213013,0.8560000061988831,6.934999942779541,2.651435136795044,0.11033333092927933,0.8560000061988831,0.8669999837875366,0.9924548268318176,0.9869281053543091,0.048245612531900406,0.18045112490653992 +trm_baseline_best,12,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669583201408386,0.9506661891937256,0.8913333415985107,0.8556666374206543,6.935666561126709,2.6479289531707764,0.10866666585206985,0.8556666374206543,0.8669999837875366,0.991782009601593,0.9869281053543091,0.05325814709067345,0.19548872113227844 +trm_baseline_best,12,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8671666383743286,0.9506459832191467,0.8989999890327454,0.8539999723434448,6.937333106994629,2.629715919494629,0.10100000351667404,0.8539999723434448,0.8669999837875366,0.991733968257904,0.985005795955658,0.055137842893600464,0.24561403691768646 +trm_baseline_best,12,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8677916526794434,0.950908362865448,0.9043333530426025,0.8516666889190674,6.942333221435547,2.602372169494629,0.09566666930913925,0.8516666889190674,0.8669999837875366,0.9901480078697205,0.9823144674301147,0.07017543911933899,0.2882205545902252 +trm_baseline_best,15,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.950645923614502,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9506445527076721,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9506717920303345,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8669999837875366,0.9506852030754089,0.8669999837875366,0.8669999837875366,6.935999870300293,2.71659779548645,0.13300000131130219,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0,0.0 +trm_baseline_best,15,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8670833110809326,0.9507201910018921,0.8673333525657654,0.8669999837875366,6.936666488647461,2.7151405811309814,0.1326666623353958,0.8669999837875366,0.8669999837875366,1.0,1.0,0.0006265664123930037,0.002506265649572015 +trm_baseline_best,15,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro,step_58590,both,gaussian,0.8670833110809326,0.9508404731750488,0.8686666488647461,0.8656666874885559,6.936666488647461,2.7110862731933594,0.1313333362340927,0.8656666874885559,0.8669999837875366,0.9997597336769104,0.9984621405601501,0.0021929824724793434,0.01253132801502943 diff --git a/research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.meta.json b/research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.meta.json new file mode 100644 index 0000000..4fb15fb --- /dev/null +++ b/research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.meta.json @@ -0,0 +1,34 @@ +{ + "args": { + "batch_size": 32, + "ckpt_name": "step_35805", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro", + "label": "trm_multi4_best", + "n_samples": 3000, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late", + "perturb": "both", + "perturb_afters": "0,4,8,12,15", + "rollouts": 8, + "seed": 20260606, + "sigmas": "0,0.001,0.003,0.01,0.03,0.1" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 3000, + "perturb_afters": [ + 0, + 4, + 8, + 12, + 15 + ], + "sigmas": [ + 0.0, + 0.001, + 0.003, + 0.01, + 0.03, + 0.1 + ] +} \ No newline at end of file diff --git a/research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.summary.csv b/research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.summary.csv new file mode 100644 index 0000000..01a8526 --- /dev/null +++ b/research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.summary.csv @@ -0,0 +1,31 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +trm_multi4_best,0,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,0,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8957083225250244,0.9601898789405823,0.9583333134651184,0.8119999766349792,7.165666580200195,2.081879138946533,0.0416666679084301,0.8119999766349792,0.8939999938011169,0.9684004187583923,0.9030573964118958,0.28262579441070557,0.6194968819618225 +trm_multi4_best,0,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8957916498184204,0.9602490067481995,0.9606666564941406,0.8130000233650208,7.166333198547363,2.062037706375122,0.03933333232998848,0.8130000233650208,0.8939999938011169,0.9666293859481812,0.9038031101226807,0.2983490526676178,0.6352201104164124 +trm_multi4_best,0,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8947083353996277,0.9597618579864502,0.9599999785423279,0.8063333630561829,7.1576666831970215,2.0646567344665527,0.03999999910593033,0.8063333630561829,0.8939999938011169,0.9667226076126099,0.8963460326194763,0.28734275698661804,0.6320754885673523 +trm_multi4_best,0,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8953750133514404,0.9600813984870911,0.9603333473205566,0.8009999990463257,7.163000106811523,2.049495220184326,0.03966666758060455,0.8009999990463257,0.8939999938011169,0.9666759967803955,0.8918717503547668,0.29402515292167664,0.6446540951728821 +trm_multi4_best,0,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8950416445732117,0.9601003527641296,0.9629999995231628,0.7983333468437195,7.160333156585693,2.0425376892089844,0.03700000047683716,0.7983333468437195,0.8939999938011169,0.9635066986083984,0.8851603269577026,0.3176100552082062,0.6666666865348816 +trm_multi4_best,4,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,4,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8946250081062317,0.9599500894546509,0.9473333358764648,0.8433333039283752,7.1570000648498535,2.1906659603118896,0.052666667848825455,0.8433333039283752,0.8939999938011169,0.9753448963165283,0.9392244815826416,0.21383647620677948,0.5251572132110596 +trm_multi4_best,4,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8962500095367432,0.9606265425682068,0.9523333311080933,0.843999981880188,7.170000076293945,2.1701996326446533,0.04766666516661644,0.843999981880188,0.8939999938011169,0.976416826248169,0.9410887360572815,0.22012577950954437,0.5691823959350586 +trm_multi4_best,4,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8963750004768372,0.9604902863502502,0.9483333230018616,0.8446666598320007,7.171000003814697,2.171119213104248,0.05166666582226753,0.8446666598320007,0.8939999938011169,0.9760906100273132,0.9410887360572815,0.22405660152435303,0.5314465165138245 +trm_multi4_best,4,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8968333601951599,0.9606810808181763,0.9506666660308838,0.8403333425521851,7.174666881561279,2.1587400436401367,0.04933333396911621,0.8403333425521851,0.8939999938011169,0.9759973883628845,0.9377330541610718,0.2291666716337204,0.544025182723999 +trm_multi4_best,4,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8944583535194397,0.9597752094268799,0.9526666402816772,0.8326666951179504,7.155666828155518,2.1656641960144043,0.047333333641290665,0.8326666951179504,0.8939999938011169,0.9737602472305298,0.9284116625785828,0.22562892735004425,0.5628930926322937 +trm_multi4_best,8,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,8,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8962083458900452,0.9605055451393127,0.9399999976158142,0.8696666955947876,7.169666767120361,2.2701423168182373,0.05999999865889549,0.8696666955947876,0.8939999938011169,0.9840604066848755,0.9709172248840332,0.15526729822158813,0.45597484707832336 +trm_multi4_best,8,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8970000147819519,0.9608615636825562,0.9416666626930237,0.8696666955947876,7.176000118255615,2.254556179046631,0.05833333358168602,0.8696666955947876,0.8939999938011169,0.9844798445701599,0.9709172248840332,0.1591981202363968,0.46855345368385315 +trm_multi4_best,8,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8955416679382324,0.9602766633033752,0.9399999976158142,0.8673333525657654,7.164333343505859,2.2744951248168945,0.05999999865889549,0.8673333525657654,0.8939999938011169,0.9835943579673767,0.9690529704093933,0.1529088020324707,0.45597484707832336 +trm_multi4_best,8,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8949999809265137,0.9600179195404053,0.9436666369438171,0.8650000095367432,7.159999847412109,2.2588493824005127,0.056333333253860474,0.8650000095367432,0.8939999938011169,0.9836875200271606,0.9668158292770386,0.14701257646083832,0.47484275698661804 +trm_multi4_best,8,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8958333134651184,0.9602567553520203,0.9449999928474426,0.8643333315849304,7.166666507720947,2.251863479614258,0.054999999701976776,0.8643333315849304,0.8939999938011169,0.9831282496452332,0.9656972289085388,0.1595911979675293,0.49685534834861755 +trm_multi4_best,12,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,12,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8956666588783264,0.9604059457778931,0.9129999876022339,0.887333333492279,7.165333271026611,2.3898391723632812,0.08699999749660492,0.887333333492279,0.8939999938011169,0.9954324960708618,0.9925428628921509,0.05424528196454048,0.19182389974594116 +trm_multi4_best,12,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8966249823570251,0.9608678221702576,0.9193333387374878,0.887333333492279,7.172999858856201,2.3672215938568115,0.0806666687130928,0.887333333492279,0.8939999938011169,0.995712161064148,0.9925428628921509,0.06092767417430878,0.24528302252292633 +trm_multi4_best,12,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8974583148956299,0.9608935713768005,0.9213333129882812,0.887666642665863,7.179666519165039,2.3568453788757324,0.07866666465997696,0.887666642665863,0.8939999938011169,0.9956655502319336,0.9925428628921509,0.069182388484478,0.26729559898376465 +trm_multi4_best,12,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8966249823570251,0.9606748819351196,0.9223333597183228,0.8870000243186951,7.172999858856201,2.3598878383636475,0.07766667008399963,0.8870000243186951,0.8939999938011169,0.9950596690177917,0.9921700358390808,0.06643081456422806,0.276729553937912 +trm_multi4_best,12,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8972083330154419,0.9610205292701721,0.9259999990463257,0.8849999904632568,7.177666664123535,2.338397264480591,0.07400000095367432,0.8849999904632568,0.8939999938011169,0.9935682415962219,0.9899328947067261,0.08451257646083832,0.31446540355682373 +trm_multi4_best,15,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.959773600101471,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9598419666290283,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9597874879837036,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8939999938011169,0.9598518013954163,0.8939999938011169,0.8939999938011169,7.1519999504089355,2.462700843811035,0.10599999874830246,0.8939999938011169,0.8939999938011169,1.0,1.0,0.0,0.0 +trm_multi4_best,15,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8941249847412109,0.9598509073257446,0.8946666717529297,0.8939999938011169,7.1529998779296875,2.4601335525512695,0.10533333569765091,0.8939999938011169,0.8939999938011169,1.0,1.0,0.001179245300590992,0.006289307959377766 +trm_multi4_best,15,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_35805,both,gaussian,0.8942499756813049,0.9599588513374329,0.8956666588783264,0.893666684627533,7.1539998054504395,2.456070899963379,0.10433333367109299,0.893666684627533,0.8939999938011169,0.9999067783355713,0.9996271729469299,0.003144653979688883,0.015723271295428276 diff --git a/research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.meta.json b/research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.meta.json new file mode 100644 index 0000000..1f543a4 --- /dev/null +++ b/research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.meta.json @@ -0,0 +1,34 @@ +{ + "args": { + "batch_size": 32, + "ckpt_name": "step_65100", + "ckpt_root": "/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro", + "label": "trm_multi4_final", + "n_samples": 3000, + "noise_distribution": "gaussian", + "out_prefix": "research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late", + "perturb": "both", + "perturb_afters": "0,4,8,12,15", + "rollouts": 8, + "seed": 20260606, + "sigmas": "0,0.001,0.003,0.01,0.03,0.1" + }, + "config_global_batch_size": 768, + "data_path": "/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000", + "n_samples": 3000, + "perturb_afters": [ + 0, + 4, + 8, + 12, + 15 + ], + "sigmas": [ + 0.0, + 0.001, + 0.003, + 0.01, + 0.03, + 0.1 + ] +} \ No newline at end of file diff --git a/research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.summary.csv b/research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.summary.csv new file mode 100644 index 0000000..b4d80d6 --- /dev/null +++ b/research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.summary.csv @@ -0,0 +1,31 @@ +label,perturb_after,sigma,n_samples,rollouts,ckpt_root,ckpt_name,perturb,noise_distribution,mean_rollout_exact,mean_rollout_token_acc,pass_at_k,all_k,correct_count_mean,correct_count_std,zero_frac,full_frac,clean_acc,retain_mean_on_clean_success,allK_on_clean_success,rescue_mean_on_clean_fail,passK_on_clean_fail +trm_multi4_final,0,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,0,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8319166898727417,0.9341394901275635,0.9086666703224182,0.7463333606719971,6.655333518981934,2.654029369354248,0.09133332967758179,0.7463333606719971,0.8286666870117188,0.9579645991325378,0.8942075371742249,0.22227627038955688,0.4902723729610443 +trm_multi4_final,0,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8315416574478149,0.93404221534729,0.9143333435058594,0.7419999837875366,6.6523332595825195,2.6464054584503174,0.08566666394472122,0.7419999837875366,0.8286666870117188,0.9564561247825623,0.8897827863693237,0.22738327085971832,0.5252918004989624 +trm_multi4_final,0,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8301666378974915,0.9333322644233704,0.9143333435058594,0.7356666922569275,6.641333103179932,2.639954090118408,0.08566666394472122,0.7356666922569275,0.8286666870117188,0.9536906480789185,0.879324197769165,0.23273345828056335,0.5272373557090759 +trm_multi4_final,0,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8319166898727417,0.934055507183075,0.9213333129882812,0.7239999771118164,6.655333518981934,2.594970464706421,0.07866666465997696,0.7239999771118164,0.8286666870117188,0.9528358578681946,0.8688656687736511,0.2470817118883133,0.5661478638648987 +trm_multi4_final,0,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8354583382606506,0.9354007244110107,0.9269999861717224,0.7246666550636292,6.683666706085205,2.5519142150878906,0.0729999989271164,0.7246666550636292,0.8286666870117188,0.9477574229240417,0.861625075340271,0.29231518507003784,0.5992217659950256 +trm_multi4_final,4,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,4,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8335833549499512,0.9348888397216797,0.8993333578109741,0.7756666541099548,6.668666839599609,2.732560157775879,0.10066666454076767,0.7756666541099548,0.8286666870117188,0.970585286617279,0.9336283206939697,0.17096303403377533,0.43968871235847473 +trm_multi4_final,4,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8352500200271606,0.9354547262191772,0.9020000100135803,0.7749999761581421,6.682000160217285,2.7150585651397705,0.09799999743700027,0.7749999761581421,0.8286666870117188,0.971238911151886,0.9328238368034363,0.17752918601036072,0.4494163393974304 +trm_multi4_final,4,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8327916860580444,0.9343785643577576,0.8989999890327454,0.7720000147819519,6.6623334884643555,2.7240254878997803,0.10100000351667404,0.7720000147819519,0.8286666870117188,0.9690265655517578,0.9292035102844238,0.17388132214546204,0.4319066107273102 +trm_multi4_final,4,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8316666483879089,0.9340242743492126,0.8986666798591614,0.7730000019073486,6.6533331871032715,2.7423510551452637,0.10133333504199982,0.7730000019073486,0.8286666870117188,0.9677695035934448,0.9267899990081787,0.17339494824409485,0.4319066107273102 +trm_multi4_final,4,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8324583172798157,0.9344562888145447,0.8989999890327454,0.7680000066757202,6.659666538238525,2.7232038974761963,0.10100000351667404,0.7680000066757202,0.8286666870117188,0.9691271185874939,0.9239742755889893,0.1714494228363037,0.4319066107273102 +trm_multi4_final,8,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,8,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8332499861717224,0.934550940990448,0.8849999904632568,0.8013333082199097,6.665999889373779,2.8155124187469482,0.11500000208616257,0.8013333082199097,0.8286666870117188,0.9811444282531738,0.9662107825279236,0.11794747412204742,0.344357967376709 +trm_multi4_final,8,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8323749899864197,0.9341363906860352,0.8830000162124634,0.8016666769981384,6.658999919891357,2.8219470977783203,0.11699999868869781,0.8016666769981384,0.8286666870117188,0.9805410504341125,0.967015266418457,0.1157587543129921,0.3346303403377533 +trm_multi4_final,8,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8352916836738586,0.9354629516601562,0.8856666684150696,0.8016666769981384,6.682333469390869,2.7959651947021484,0.11433333158493042,0.8016666769981384,0.8286666870117188,0.9822506308555603,0.9666130542755127,0.12451361864805222,0.344357967376709 +trm_multi4_final,8,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8335000276565552,0.9347721338272095,0.890333354473114,0.7993333339691162,6.668000221252441,2.803647994995117,0.10966666787862778,0.7993333339691162,0.8286666870117188,0.9798873662948608,0.9625905156135559,0.1254863739013672,0.3696497976779938 +trm_multi4_final,8,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8331666588783264,0.9347808957099915,0.8916666507720947,0.7960000038146973,6.665333271026611,2.7949953079223633,0.10833333432674408,0.7960000038146973,0.8286666870117188,0.9781777858734131,0.9593724608421326,0.13180933892726898,0.38910505175590515 +trm_multi4_final,12,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,12,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8307916522026062,0.9338425993919373,0.8569999933242798,0.8209999799728394,6.64633321762085,2.936537742614746,0.14300000667572021,0.8209999799728394,0.8286666870117188,0.9936142563819885,0.9907481670379639,0.043287936598062515,0.1750972718000412 +trm_multi4_final,12,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8307499885559082,0.9337176084518433,0.8603333234786987,0.8209999799728394,6.645999908447266,2.9328060150146484,0.13966666162014008,0.8209999799728394,0.8286666870117188,0.9935137033462524,0.9907481670379639,0.04353112727403641,0.18871594965457916 +trm_multi4_final,12,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8307499885559082,0.9339007139205933,0.8583333492279053,0.8203333616256714,6.645999908447266,2.93530535697937,0.14166666567325592,0.8203333616256714,0.8286666870117188,0.993463397026062,0.9899436831474304,0.0437743179500103,0.17898832261562347 +trm_multi4_final,12,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.831166684627533,0.9338744282722473,0.8616666793823242,0.8190000057220459,6.649333477020264,2.921477794647217,0.13833333551883698,0.8190000057220459,0.8286666870117188,0.9930108785629272,0.9883346557617188,0.04839494079351425,0.20428015291690826 +trm_multi4_final,12,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8309583067893982,0.9341686964035034,0.8683333396911621,0.8149999976158142,6.6476664543151855,2.9015274047851562,0.1316666603088379,0.8149999976158142,0.8286666870117188,0.9908487796783447,0.9835076332092285,0.057636186480522156,0.24319066107273102 +trm_multi4_final,15,0.0,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9328684210777283,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,15,0.001,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286666870117188,0.9330503940582275,0.8286666870117188,0.8286666870117188,6.62933349609375,3.014399766921997,0.17133332788944244,0.8286666870117188,0.8286666870117188,1.0,1.0,0.0,0.0 +trm_multi4_final,15,0.003,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8287083506584167,0.9331271052360535,0.8289999961853027,0.8286666870117188,6.629666805267334,3.0137219429016113,0.17100000381469727,0.8286666870117188,0.8286666870117188,1.0,1.0,0.00024319066142197698,0.0019455252913758159 +trm_multi4_final,15,0.01,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8286250233650208,0.9330225586891174,0.8289999961853027,0.828000009059906,6.629000186920166,3.0135293006896973,0.17100000381469727,0.828000009059906,0.8286666870117188,0.9998994469642639,0.9991955161094666,0.00024319066142197698,0.0019455252913758159 +trm_multi4_final,15,0.03,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8287500143051147,0.9331192970275879,0.8306666612625122,0.8276666402816772,6.630000114440918,3.009169340133667,0.1693333387374878,0.8276666402816772,0.8286666870117188,0.999698281288147,0.9987932443618774,0.0019455252913758159,0.011673151515424252 +trm_multi4_final,15,0.1,3000,8,/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro,step_65100,both,gaussian,0.8292916417121887,0.9331137537956238,0.8330000042915344,0.8276666402816772,6.63433313369751,3.0008811950683594,0.16699999570846558,0.8276666402816772,0.8286666870117188,0.9995977282524109,0.9987932443618774,0.0055933850817382336,0.02529182843863964 diff --git a/research/flossing/late_perturb_robustness/watch_and_plot.sh b/research/flossing/late_perturb_robustness/watch_and_plot.sh new file mode 100755 index 0000000..5bb1d8a --- /dev/null +++ b/research/flossing/late_perturb_robustness/watch_and_plot.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=/home/yurenh2/rrm +PY=/home/yurenh2/miniconda3/envs/rrm/bin/python +cd "${ROOT}" + +PIDS=( + research/flossing/late_perturb_robustness/logs/trm_baseline_best_step58590_n3000_k8_late.pid + research/flossing/late_perturb_robustness/logs/trm_multi4_best_step35805_n3000_k8_late.pid + research/flossing/late_perturb_robustness/logs/trm_multi4_final_step65100_n3000_k8_late.pid +) + +for pf in "${PIDS[@]}"; do + pid=$(cat "${pf}") + echo "watch ${pf}: ${pid}" + while kill -0 "${pid}" 2>/dev/null; do + sleep 60 + done + echo "done ${pf}: ${pid}" +done + +"${PY}" research/flossing/plot_late_perturb_robustness.py \ + --summaries \ + research/flossing/late_perturb_robustness/trm_baseline_best_step58590_n3000_k8_late.summary.csv \ + research/flossing/late_perturb_robustness/trm_multi4_best_step35805_n3000_k8_late.summary.csv \ + research/flossing/late_perturb_robustness/trm_multi4_final_step65100_n3000_k8_late.summary.csv \ + --out-dir research/flossing/late_perturb_robustness/plots \ + --slice-sigma 0.1 + +nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits \ + > research/flossing/late_perturb_robustness/plots/final_gpu_status.txt -- cgit v1.2.3