From 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Sat, 13 Jun 2026 12:35:36 -0500 Subject: rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 --- .../multi4_eval_compare/hrm_baseline_eval.csv | 11 ++++++ .../hrm_honly_step26040_eval.csv | 2 ++ .../multi4_eval_compare/hrm_matched_compare.csv | 6 ++++ .../hrm_multi4_complete_eval.csv | 11 ++++++ .../multi4_eval_compare/hrm_multi4_eval.csv | 6 ++++ .../hrm_multi4_horizon_sweep_768.csv | 41 ++++++++++++++++++++++ .../multi4_eval_compare/trm_baseline_eval.csv | 11 ++++++ ...ti4_step65100_det_n10000_seed20260602.meta.json | 15 ++++++++ ...4_step65100_det_n10000_seed20260602.summary.csv | 2 ++ .../multi4_eval_compare/trm_matched_compare.csv | 2 ++ .../multi4_eval_compare/trm_multi4_eval.csv | 2 ++ .../multi4_eval_compare/trm_multi4_eval_full.csv | 11 ++++++ .../trm_official_gbs768_eval.csv | 11 ++++++ .../trm_official_gbs768_multi4_eval.csv | 21 +++++++++++ .../trm_official_gbs768_multi4_step16275_eval.csv | 2 ++ .../multi4_eval_compare/wallclock_eval.csv | 30 ++++++++++++++++ 16 files changed, 184 insertions(+) create mode 100644 research/flossing/multi4_eval_compare/hrm_baseline_eval.csv create mode 100644 research/flossing/multi4_eval_compare/hrm_honly_step26040_eval.csv create mode 100644 research/flossing/multi4_eval_compare/hrm_matched_compare.csv create mode 100644 research/flossing/multi4_eval_compare/hrm_multi4_complete_eval.csv create mode 100644 research/flossing/multi4_eval_compare/hrm_multi4_eval.csv create mode 100644 research/flossing/multi4_eval_compare/hrm_multi4_horizon_sweep_768.csv create mode 100644 research/flossing/multi4_eval_compare/trm_baseline_eval.csv create mode 100644 research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.meta.json create mode 100644 research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv create mode 100644 research/flossing/multi4_eval_compare/trm_matched_compare.csv create mode 100644 research/flossing/multi4_eval_compare/trm_multi4_eval.csv create mode 100644 research/flossing/multi4_eval_compare/trm_multi4_eval_full.csv create mode 100644 research/flossing/multi4_eval_compare/trm_official_gbs768_eval.csv create mode 100644 research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_eval.csv create mode 100644 research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_step16275_eval.csv create mode 100644 research/flossing/multi4_eval_compare/wallclock_eval.csv (limited to 'research/flossing/multi4_eval_compare') diff --git a/research/flossing/multi4_eval_compare/hrm_baseline_eval.csv b/research/flossing/multi4_eval_compare/hrm_baseline_eval.csv new file mode 100644 index 0000000..afc2b27 --- /dev/null +++ b/research/flossing/multi4_eval_compare/hrm_baseline_eval.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +hrm_baseline,2604,0.016369983553886414,0.6336106061935425,0.8522627949714661,0.9927079081535339,0.020887982100248337,16 +hrm_baseline,5208,0.06169315055012703,0.6744286417961121,0.7430469989776611,0.9772745370864868,0.07281211763620377,16 +hrm_baseline,7812,0.1358133852481842,0.6985693573951721,0.676047682762146,0.996868371963501,0.0211492907255888,16 +hrm_baseline,10416,0.20248068869113922,0.7259970307350159,0.6480565667152405,0.9983821511268616,0.017285015434026718,16 +hrm_baseline,13020,0.3024248778820038,0.7580602169036865,0.5614010691642761,0.9971805810928345,0.02416178770363331,16 +hrm_baseline,15624,0.36705803871154785,0.7842973470687866,0.49664124846458435,0.9964591860771179,0.02256467007100582,16 +hrm_baseline,18228,0.46287721395492554,0.8077820539474487,0.4641122817993164,0.9953309893608093,0.02509351819753647,16 +hrm_baseline,20832,0.4912721812725067,0.8198283314704895,0.4137057960033417,0.9955745935440063,0.02796635963022709,16 +hrm_baseline,23436,0.5193856954574585,0.8260135054588318,0.41173747181892395,0.9975590705871582,0.024772455915808678,16 +hrm_baseline,26040,0.5265287756919861,0.8270824551582336,0.4161679148674011,0.9960216283798218,0.03298119083046913,16 diff --git a/research/flossing/multi4_eval_compare/hrm_honly_step26040_eval.csv b/research/flossing/multi4_eval_compare/hrm_honly_step26040_eval.csv new file mode 100644 index 0000000..d3e3ab9 --- /dev/null +++ b/research/flossing/multi4_eval_compare/hrm_honly_step26040_eval.csv @@ -0,0 +1,2 @@ +step,accuracy,exact_accuracy,lm_loss,q_halt_accuracy,q_halt_loss,steps +26040,0.85720146,0.62264127,0.35935268,0.99625105,0.024241636,16.0 diff --git a/research/flossing/multi4_eval_compare/hrm_matched_compare.csv b/research/flossing/multi4_eval_compare/hrm_matched_compare.csv new file mode 100644 index 0000000..71b8e16 --- /dev/null +++ b/research/flossing/multi4_eval_compare/hrm_matched_compare.csv @@ -0,0 +1,6 @@ +family,step,base_exact,multi4_exact,delta_exact,base_acc,multi4_acc,delta_acc,base_loss,multi4_loss,delta_loss,base_steps,multi4_steps +hrm,2604,0.016369983553886414,0.0123774204403162,-0.003992563113570213,0.6336106061935425,0.6303551197052002,-0.003255486488342285,0.8522627949714661,0.8603715300559998,0.008108735084533691,16,16 +hrm,5208,0.06169315055012703,0.025012653321027756,-0.036680497229099274,0.6744286417961121,0.6685170531272888,-0.005911588668823242,0.7430469989776611,0.7412638068199158,-0.0017831921577453613,16,16 +hrm,7812,0.1358133852481842,0.04651052877306938,-0.08930285647511482,0.6985693573951721,0.687346339225769,-0.011223018169403076,0.676047682762146,0.7044885158538818,0.02844083309173584,16,16 +hrm,10416,0.20248068869113922,0.2006617933511734,-0.0018188953399658203,0.7259970307350159,0.7243355512619019,-0.0016614794731140137,0.6480565667152405,0.6340938210487366,-0.013962745666503906,16,16 +hrm,13020,0.3024248778820038,0.34697696566581726,0.04455208778381348,0.7580602169036865,0.7794705033302307,0.02141028642654419,0.5614010691642761,0.49915269017219543,-0.06224837899208069,16,16 diff --git a/research/flossing/multi4_eval_compare/hrm_multi4_complete_eval.csv b/research/flossing/multi4_eval_compare/hrm_multi4_complete_eval.csv new file mode 100644 index 0000000..9aac158 --- /dev/null +++ b/research/flossing/multi4_eval_compare/hrm_multi4_complete_eval.csv @@ -0,0 +1,11 @@ +run,step,exact,accuracy,loss +hrm_multi4,2604,0.0123774204403162,0.6303551197052002,0.8603715300559998 +hrm_multi4,5208,0.025012653321027756,0.6685170531272888,0.7412638068199158 +hrm_multi4,7812,0.04651052877306938,0.687346339225769,0.7044885158538818 +hrm_multi4,10416,0.2006617933511734,0.7243355512619019,0.6340938210487366 +hrm_multi4,13020,0.34697696566581726,0.7794705033302307,0.49915269017219543 +hrm_multi4,15624,0.4653252363204956,0.8156101703643799,0.42743897438049316 +hrm_multi4,18228,0.5790873169898987,0.8494690656661987,0.3459467887878418 +hrm_multi4,20832,0.6393187,0.8660642,0.3186217 +hrm_multi4,23436,0.6443189,0.8684137,0.30427682 +hrm_multi4,26040,0.46235448,0.80298746,0.603943 diff --git a/research/flossing/multi4_eval_compare/hrm_multi4_eval.csv b/research/flossing/multi4_eval_compare/hrm_multi4_eval.csv new file mode 100644 index 0000000..0f7dbc9 --- /dev/null +++ b/research/flossing/multi4_eval_compare/hrm_multi4_eval.csv @@ -0,0 +1,6 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +hrm_multi4,2604,0.0123774204403162,0.6303551197052002,0.8603715300559998,0.9879773855209351,0.020903315395116806,16 +hrm_multi4,5208,0.025012653321027756,0.6685170531272888,0.7412638068199158,0.9968069195747375,0.019309692084789276,16 +hrm_multi4,7812,0.04651052877306938,0.687346339225769,0.7044885158538818,0.9902976751327515,0.03646523132920265,16 +hrm_multi4,10416,0.2006617933511734,0.7243355512619019,0.6340938210487366,0.9991532564163208,0.011545917019248009,16 +hrm_multi4,13020,0.34697696566581726,0.7794705033302307,0.49915269017219543,0.9987062215805054,0.01581510715186596,16 diff --git a/research/flossing/multi4_eval_compare/hrm_multi4_horizon_sweep_768.csv b/research/flossing/multi4_eval_compare/hrm_multi4_horizon_sweep_768.csv new file mode 100644 index 0000000..3bdace3 --- /dev/null +++ b/research/flossing/multi4_eval_compare/hrm_multi4_horizon_sweep_768.csv @@ -0,0 +1,41 @@ +step,split,horizon,count,exact_accuracy,accuracy,lm_loss,q_halt_loss,steps +23436,train,2,768.0,0.2955729166666667,0.8364840348561605,0.39225157833573504,0.04459714392820994,2.0 +23436,train,3,768.0,0.5755208333333334,0.8917663097381592,0.2618181909650845,0.01976812755068143,3.0 +23436,train,4,768.0,0.6796875,0.9122781753540039,0.21020127075343642,0.006546831379334132,4.0 +23436,train,5,768.0,0.734375,0.9230967362721761,0.182652537414814,0.00717167928814888,5.0 +23436,train,6,768.0,0.7669270833333334,0.9298482735951742,0.16631383252369117,0.00374875341852506,6.0 +23436,train,8,768.0,0.8033854166666666,0.9389146169026693,0.14464881393209095,0.0037066793690125146,8.0 +23436,train,10,768.0,0.8268229166666666,0.9462127685546875,0.12909535948177878,0.00991838239133358,10.0 +23436,train,12,768.0,0.84375,0.9500546455383301,0.11931335874268405,0.004430865868926048,12.0 +23436,train,14,768.0,0.85546875,0.9517585436503092,0.11357831630187847,0.0033199011037747064,14.0 +23436,train,16,768.0,0.859375,0.9536232948303223,0.11014115689326769,0.003578242535392443,16.0 +23436,test,2,768.0,0.12760416666666666,0.7265946865081787,0.6354224683840087,0.025561923782030743,2.0 +23436,test,3,768.0,0.2526041666666667,0.7604809602101644,0.5506175992783068,0.012921225279569626,3.0 +23436,test,4,768.0,0.3138020833333333,0.7742091019948324,0.515490498860075,0.003827621228992939,4.0 +23436,test,5,768.0,0.3619791666666667,0.7867315610249838,0.48693293612013444,0.00506168728073438,5.0 +23436,test,6,768.0,0.3984375,0.7953317960103353,0.46390732880926117,0.0035611667359868684,6.0 +23436,test,8,768.0,0.4583333333333333,0.8105709552764893,0.4314775246277862,0.005083844686547915,8.0 +23436,test,10,768.0,0.48828125,0.8183674812316895,0.4161860321298086,0.003043775757153829,10.0 +23436,test,12,768.0,0.51171875,0.8260512351989746,0.4016971584101859,0.004879387095570564,12.0 +23436,test,14,768.0,0.5325520833333334,0.831050713857015,0.3878147863194716,0.00292336226751407,14.0 +23436,test,16,768.0,0.5559895833333334,0.8361946741739908,0.37473119346002354,0.0035912382105986276,16.0 +26040,train,2,768.0,0.7825520833333334,0.963814894358317,0.09390118849629236,0.04166571795940399,2.0 +26040,train,3,768.0,0.9127604166666666,0.9810153643290201,0.051538720202730794,0.008062846958637238,3.0 +26040,train,4,768.0,0.9309895833333334,0.9841659863789877,0.04344159450663771,0.012521501630544662,4.0 +26040,train,5,768.0,0.9401041666666666,0.9858539899190267,0.04013312268752094,0.015388640264670054,5.0 +26040,train,6,768.0,0.9401041666666666,0.9870595932006836,0.03853385294570503,0.023091336091359455,6.0 +26040,train,8,768.0,0.94921875,0.9884098370869955,0.03620980748266996,0.01947430024544398,8.0 +26040,train,10,768.0,0.9557291666666666,0.9893261591593424,0.03444665149376691,0.03555429975191752,10.0 +26040,train,12,768.0,0.953125,0.9895029067993164,0.034414704500878884,0.04184401035308838,12.0 +26040,train,14,768.0,0.9557291666666666,0.9893904527028402,0.03586123670240371,0.037144094705581665,14.0 +26040,train,16,768.0,0.9518229166666666,0.9894386132558187,0.03654265702438753,0.041033936043580375,16.0 +26040,test,2,768.0,0.11588541666666667,0.7054398854573568,0.8908620335632751,0.040651207168896995,2.0 +26040,test,3,768.0,0.1875,0.7230902512868246,0.7620792943957264,0.03934991856416067,3.0 +26040,test,4,768.0,0.23567708333333334,0.731706460316976,0.7335753038165317,0.11375004053115845,4.0 +26040,test,5,768.0,0.25,0.7362075646718343,0.7385935210540664,0.160938690106074,5.0 +26040,test,6,768.0,0.2669270833333333,0.7397923469543457,0.743103274276764,0.17507813374201456,6.0 +26040,test,8,768.0,0.296875,0.7452899614969889,0.7468322750276379,0.19044278065363565,8.0 +26040,test,10,768.0,0.3138020833333333,0.7472190856933594,0.7515058945457195,0.21485831340154013,10.0 +26040,test,12,768.0,0.3229166666666667,0.7502892812093099,0.7620030015396181,0.22753063837687174,12.0 +26040,test,14,768.0,0.3307291666666667,0.7518165111541748,0.7553974518406182,0.2158371408780416,14.0 +26040,test,16,768.0,0.3346354166666667,0.7514950434366862,0.7639393310889216,0.23628832896550497,16.0 diff --git a/research/flossing/multi4_eval_compare/trm_baseline_eval.csv b/research/flossing/multi4_eval_compare/trm_baseline_eval.csv new file mode 100644 index 0000000..0b6bc11 --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_baseline_eval.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_baseline,26041,0.5575894117355347,0.8469749093055725,0.35285070538520813,0.9997445344924927,0.003797376062721014,16 +trm_baseline,52082,0.6295099854469299,0.8704391121864319,0.2973524034023285,0.9998486042022705,0.0011324305087327957,16 +trm_baseline,78123,0.6993892788887024,0.8920264840126038,0.2477506548166275,0.9998935461044312,0.0006260558147914708,16 +trm_baseline,104164,0.72928386926651,0.9020143151283264,0.22608688473701477,0.9998770356178284,0.001141014276072383,16 +trm_baseline,130205,0.7653990387916565,0.914251446723938,0.19878524541854858,0.999917209148407,0.0010435190051794052,16 +trm_baseline,156246,0.7596656680107117,0.9119072556495667,0.2041437327861786,0.999862790107727,0.0011596218682825565,16 +trm_baseline,182287,0.7541900873184204,0.9094774723052979,0.2100999504327774,0.9998249411582947,0.0013093978632241488,16 +trm_baseline,208328,0.7732800841331482,0.9166732430458069,0.19410833716392517,0.9998320937156677,0.0033004307188093662,16 +trm_baseline,234369,0.7750374674797058,0.9172152280807495,0.19279460608959198,0.9998533725738525,0.0032994903158396482,16 +trm_baseline,260410,0.7742025256156921,0.9169425964355469,0.19348150491714478,0.9998462796211243,0.002894919365644455,16 diff --git a/research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.meta.json b/research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.meta.json new file mode 100644 index 0000000..f61ba39 --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.meta.json @@ -0,0 +1,15 @@ +{ + "batch_size": 256, + "ckpt_name": "step_65100", + "ckpt_root": "trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_multi4_loguniform_repro", + "fd_eps": 0.01, + "fd_lyap": false, + "fd_spectrum_k": 0, + "include_clean": false, + "n_samples": 10000, + "noise_std": 0.0, + "perturb": "both", + "rollouts": 1, + "seed": 20260602, + "steps": 16 +} \ No newline at end of file diff --git a/research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv b/research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv new file mode 100644 index 0000000..38ca8f5 --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv @@ -0,0 +1,2 @@ +correct_count/det_fail_mean,correct_count/det_success_mean,correct_count/full_frac,correct_count/ge_1_frac,correct_count/mean,correct_count/median,correct_count/q10,correct_count/q25,correct_count/q75,correct_count/q90,correct_count/std,correct_count/zero_frac,deterministic/exact,deterministic/token_acc,fd_lyap,fd_spectrum_k,include_clean,mean_rollout/exact,mean_rollout/token_acc,n_samples,noise_std,oracle_pass/det_fail_frac,oracle_pass/det_success_frac,oracle_pass/exact,oracle_pass/token_acc,perturb_both,perturb_h,perturb_l,q_max/det_fail_frac,q_max/det_success_frac,q_max/exact,q_max/token_acc,q_mean,rollout0/exact,rollout0/token_acc,rollouts,steps +0.0,1.0,0.8312000036239624,0.8312000036239624,0.8312000036239624,1.0,0.0,1.0,1.0,1.0,0.3745751678943634,0.1687999963760376,0.8312000036239624,0.9333752989768982,0.0,0.0,0.0,0.8312000036239624,0.9333752989768982,10000.0,0.0,0.0,1.0,0.8312000036239624,0.9333752989768982,1.0,0.0,0.0,0.0,1.0,0.8312000036239624,0.9333752989768982,3.762489080429077,0.8312000036239624,0.9333752989768982,1.0,16.0 diff --git a/research/flossing/multi4_eval_compare/trm_matched_compare.csv b/research/flossing/multi4_eval_compare/trm_matched_compare.csv new file mode 100644 index 0000000..32006a5 --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_matched_compare.csv @@ -0,0 +1,2 @@ +family,step,base_exact,multi4_exact,delta_exact,base_acc,multi4_acc,delta_acc,base_loss,multi4_loss,delta_loss,base_steps,multi4_steps +trm,26041,0.5575894117355347,0.7394047975540161,0.18181538581848145,0.8469749093055725,0.9067130088806152,0.059738099575042725,0.35285070538520813,0.21417337656021118,-0.13867732882499695,16,16 diff --git a/research/flossing/multi4_eval_compare/trm_multi4_eval.csv b/research/flossing/multi4_eval_compare/trm_multi4_eval.csv new file mode 100644 index 0000000..f3d16b5 --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_multi4_eval.csv @@ -0,0 +1,2 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_multi4,26041,0.7394047975540161,0.9067130088806152,0.21417337656021118,0.9997232556343079,0.0024572687689214945,16 diff --git a/research/flossing/multi4_eval_compare/trm_multi4_eval_full.csv b/research/flossing/multi4_eval_compare/trm_multi4_eval_full.csv new file mode 100644 index 0000000..3411ff5 --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_multi4_eval_full.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_multi4_loguniform_repro,26041,0.7394047975540161,0.9067130088806152,0.21417337656021118,0.9997232556343079,0.0024572687689214945,16 +trm_multi4_loguniform_repro,52082,0.8449901342391968,0.9424432516098022,0.1342233270406723,0.9998746514320374,0.0010290677892044187,16 +trm_multi4_loguniform_repro,78123,0.8417639136314392,0.9411031007766724,0.13769488036632538,0.9997493028640747,0.0022251552436500788,16 +trm_multi4_loguniform_repro,104164,0.8547161221504211,0.9456510543823242,0.12779666483402252,0.999834418296814,0.0019055778393521905,16 +trm_multi4_loguniform_repro,130205,0.8536233305931091,0.9453508853912354,0.1282763034105301,0.9998888373374939,0.0018422487191855907,16 +trm_multi4_loguniform_repro,156246,0.8489472270011902,0.9433866739273071,0.13273237645626068,0.999782383441925,0.003051365725696087,16 +trm_multi4_loguniform_repro,182287,0.8558868765830994,0.9459810256958008,0.12728126347064972,0.9997469186782837,0.0024399051908403635,16 +trm_multi4_loguniform_repro,208328,0.8404204249382019,0.9403222799301147,0.13971956074237823,0.9996499419212341,0.0029723909683525562,16 +trm_multi4_loguniform_repro,234369,0.845432460308075,0.9419097304344177,0.13668429851531982,0.9997681975364685,0.0016449446557089686,16 +trm_multi4_loguniform_repro,260410,0.826143741607666,0.9344042539596558,0.15468436479568481,0.999701976776123,0.001810370129533112,16 diff --git a/research/flossing/multi4_eval_compare/trm_official_gbs768_eval.csv b/research/flossing/multi4_eval_compare/trm_official_gbs768_eval.csv new file mode 100644 index 0000000..66dfe1b --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_official_gbs768_eval.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_official_gbs768,6510,0.20006339251995087,0.7261562347412109,0.6155200004577637,0.9993779063224792,0.0063577014952898026,16 +trm_official_gbs768,13020,0.6248409152030945,0.8679871559143066,0.30110087990760803,0.9997587203979492,0.0018232133006677032,16 +trm_official_gbs768,19530,0.7098508477210999,0.8961462378501892,0.23719573020935059,0.9997185468673706,0.002359110629186034,16 +trm_official_gbs768,26040,0.7558197379112244,0.9113300442695618,0.20295456051826477,0.9997066855430603,0.0026622479781508446,16 +trm_official_gbs768,32550,0.7946407794952393,0.9247151017189026,0.17315243184566498,0.9997208714485168,0.0024344150442630053,16 +trm_official_gbs768,39060,0.8247435092926025,0.9351920485496521,0.14995059370994568,0.9996594190597534,0.002962446305900812,16 +trm_official_gbs768,45570,0.8479396104812622,0.9433117508888245,0.1323014795780182,0.9996806979179382,0.0026382978539913893,16 +trm_official_gbs768,52080,0.8633161783218384,0.9488447904586792,0.12009253352880478,0.9997137784957886,0.002587266732007265,16 +trm_official_gbs768,58590,0.8686309456825256,0.9508475661277771,0.11555595695972443,0.9998438954353333,0.0014915302162989974,16 +trm_official_gbs768,65100,0.86624675989151,0.9500409364700317,0.11748332530260086,0.9996523261070251,0.002605273388326168,16 diff --git a/research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_eval.csv b/research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_eval.csv new file mode 100644 index 0000000..ec353ce --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_eval.csv @@ -0,0 +1,21 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps,_runtime,_timestamp +trm_official_gbs768_multi4,3255,0.0175999198108911,0.6504403352737427,0.7941210269927979,0.9824000597000122,0.108424387872219,16.0,12550.718766357,1780192940.452497 +trm_official_gbs768_multi4,6510,0.2277795374393463,0.7379271984100342,0.5934479236602783,0.999262034893036,0.0035974220372736,16.0,25064.702453699,1780205454.4563622 +trm_official_gbs768_multi4,9765,0.6357069611549377,0.8709613084793091,0.2931223213672638,0.9997137784957886,0.0014165197499096,16.0,37579.311725746,1780217969.132149 +trm_official_gbs768_multi4,13020,0.75853031873703,0.910834014415741,0.2033131867647171,0.9997611045837402,0.0021604059729725,16.0,50092.402719112,1780230482.2738986 +trm_official_gbs768_multi4,16275,0.8263778686523438,0.9346956610679626,0.1507271528244018,0.9997800588607788,0.0041020140051841,16.0,62595.579823935,1780242985.398887 +trm_official_gbs768_multi4,19530,0.8654520511627197,0.9488762617111206,0.1198361814022064,0.9997090697288512,0.0052023055031895,16.0,75096.299217356,1780255486.102359 +trm_official_gbs768_multi4,22785,0.8817486763000488,0.9549695253372192,0.1065462753176689,0.9997114539146424,0.0037784865126013,16.0,87607.26667599,1780267997.105077 +trm_official_gbs768_multi4,26040,0.8879267573356628,0.957356870174408,0.1014027148485183,0.999642848968506,0.0053329654037952,16.0,100118.838503384,1780280508.634669 +trm_official_gbs768_multi4,29295,0.8933455944061279,0.959309697151184,0.0971761047840118,0.9996144771575928,0.0041324645280838,16.0,112620.42551711,1780293010.168183 +trm_official_gbs768_multi4,32550,0.8962169885635376,0.960436463356018,0.0948082581162452,0.9996026158332824,0.0144010595977306,16.0,125134.755165262,1780305524.5418072 +trm_official_gbs768_multi4,35805,0.8964653611183167,0.9604493975639344,0.0945562794804573,0.9995790123939514,0.0068304436281323,16.0,137714.797781532,1780318104.6386163 +trm_official_gbs768_multi4,39060,0.8957250118255615,0.9601802825927734,0.0952448472380638,0.999541163444519,0.0185395441949367,16.0,150233.754675447,1780330623.5781178 +trm_official_gbs768_multi4,42315,0.888913094997406,0.9574248790740968,0.1008101180195808,0.9995600581169128,0.0061048995703458,16.0,162739.436977464,1780343129.2559526 +trm_official_gbs768_multi4,45570,0.8858169317245483,0.9561300873756408,0.1036654934287071,0.9994938373565674,0.0054858992807567,16.0,175251.082236918,1780355640.8929477 +trm_official_gbs768_multi4,48825,0.882732629776001,0.9547808170318604,0.1068678125739097,0.9994110465049744,0.0072551541961729,16.0,187778.40637965,1780368168.2221627 +trm_official_gbs768_multi4,52080,0.8782954216003418,0.9530280232429504,0.1104752272367477,0.99934720993042,0.0089566921815276,16.0,200285.373208387,1780380675.1846614 +trm_official_gbs768_multi4,55335,0.8694067597389221,0.94942307472229,0.1194151788949966,0.9987700581550598,0.0272370912134647,16.0,212785.271448664,1780393175.0770009 +trm_official_gbs768_multi4,58590,0.8620365858078003,0.9463443756103516,0.1268824934959411,0.9983466863632202,0.0163928251713514,16.0,225280.516080387,1780405670.3403552 +trm_official_gbs768_multi4,61845,0.850763738155365,0.941650390625,0.1378339380025863,0.9978050589561462,0.0215476993471384,16.0,237778.331426833,1780418168.1822684 +trm_official_gbs768_multi4,65100,0.8350536823272705,0.9350366592407228,0.1547555029392242,0.9962841868400574,0.0310162808746099,16.0,250280.395200839,1780430673.009492 diff --git a/research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_step16275_eval.csv b/research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_step16275_eval.csv new file mode 100644 index 0000000..a34ae57 --- /dev/null +++ b/research/flossing/multi4_eval_compare/trm_official_gbs768_multi4_step16275_eval.csv @@ -0,0 +1,2 @@ +step,accuracy,exact_accuracy,lm_loss,q_halt_accuracy,q_halt_loss,steps +16275,0.934574,0.82621706,0.15090619,0.99974453,0.00405054,16.0 diff --git a/research/flossing/multi4_eval_compare/wallclock_eval.csv b/research/flossing/multi4_eval_compare/wallclock_eval.csv new file mode 100644 index 0000000..268a924 --- /dev/null +++ b/research/flossing/multi4_eval_compare/wallclock_eval.csv @@ -0,0 +1,30 @@ +run,step,runtime_sec,runtime_hms,timestamp_local,ckpt_exists,ckpt_mtime_local,exact,accuracy,loss +hrm_baseline,2604,1440.547846523,00:24:00.55,2026-05-22 06:32:03,True,2026-05-22 06:32:04,0.016369983553886414,0.6336106061935425,0.8522627949714661 +hrm_baseline,5208,2915.282126881,00:48:35.28,2026-05-22 06:56:38,True,2026-05-22 06:56:39,0.06169315055012703,0.6744286417961121,0.7430469989776611 +hrm_baseline,7812,3726.920135005,01:02:06.92,2026-05-22 07:10:09,True,2026-05-22 07:10:10,0.1358133852481842,0.6985693573951721,0.676047682762146 +hrm_baseline,10416,4458.114451179,01:14:18.11,2026-05-22 07:22:21,True,2026-05-22 07:22:22,0.20248068869113922,0.7259970307350159,0.6480565667152405 +hrm_baseline,13020,5801.015399971,01:36:41.02,2026-05-22 07:44:43,True,2026-05-22 07:44:45,0.3024248778820038,0.7580602169036865,0.5614010691642761 +hrm_baseline,15624,7219.496414876,02:00:19.50,2026-05-22 08:08:22,True,2026-05-22 08:08:23,0.36705803871154785,0.7842973470687866,0.49664124846458435 +hrm_baseline,18228,8616.18880162,02:23:36.19,2026-05-22 08:31:39,True,2026-05-22 08:31:40,0.46287721395492554,0.8077820539474487,0.4641122817993164 +hrm_baseline,20832,9904.193793478,02:45:04.19,2026-05-22 08:53:07,True,2026-05-22 08:53:08,0.4912721812725067,0.8198283314704895,0.4137057960033417 +hrm_baseline,23436,10634.360398376,02:57:14.36,2026-05-22 09:05:17,True,2026-05-22 09:05:18,0.5193856954574585,0.8260135054588318,0.41173747181892395 +hrm_baseline,26040,11366.743085113,03:09:26.74,2026-05-22 09:17:29,True,2026-05-22 09:17:30,0.5265287756919861,0.8270824551582336,0.4161679148674011 +hrm_multi4,2604,5810.841404196,01:36:50.84,2026-05-27 22:44:06,True,2026-05-27 22:44:07,0.0123774204403162,0.6303551197052002,0.8603715300559998 +hrm_multi4,5208,11587.805059004,03:13:07.81,2026-05-28 00:20:23,True,2026-05-28 00:20:24,0.025012653321027756,0.6685170531272888,0.7412638068199158 +hrm_multi4,7812,17365.375767937,04:49:25.38,2026-05-28 01:56:41,True,2026-05-28 01:56:42,0.04651052877306938,0.687346339225769,0.7044885158538818 +hrm_multi4,10416,23142.813901422,06:25:42.81,2026-05-28 03:32:58,True,2026-05-28 03:32:59,0.2006617933511734,0.7243355512619019,0.6340938210487366 +hrm_multi4,13020,29347.517355686,08:09:07.52,2026-05-28 05:16:23,True,2026-05-28 05:16:24,0.34697696566581726,0.7794705033302307,0.49915269017219543 +hrm_multi4,15624,35123.513458465,09:45:23.51,2026-05-28 06:52:39,True,2026-05-28 06:52:40,0.4653252363204956,0.8156101703643799,0.42743897438049316 +hrm_multi4,18228,40898.005173388,11:21:38.01,2026-05-28 08:28:54,True,2026-05-28 08:28:55,0.5790873169898987,0.8494690656661987,0.3459467887878418 +trm_baseline,26041,8593.415472305,02:23:13.42,2026-05-23 01:55:09,True,2026-05-23 01:55:10,0.5575894117355347,0.8469749093055725,0.35285070538520813 +trm_baseline,52082,17155.876633182,04:45:55.88,2026-05-23 04:17:52,True,2026-05-23 04:17:52,0.6295099854469299,0.8704391121864319,0.2973524034023285 +trm_baseline,78123,25719.395089913,07:08:39.40,2026-05-23 06:40:35,True,2026-05-23 06:40:36,0.6993892788887024,0.8920264840126038,0.2477506548166275 +trm_baseline,104164,34282.065662564,09:31:22.07,2026-05-23 09:03:18,True,2026-05-23 09:03:18,0.72928386926651,0.9020143151283264,0.22608688473701477 +trm_baseline,130205,42852.348430037,11:54:12.35,2026-05-23 11:26:08,True,2026-05-23 11:26:09,0.7653990387916565,0.914251446723938,0.19878524541854858 +trm_baseline,156246,51422.119869545,14:17:02.12,2026-05-23 13:48:58,True,2026-05-23 13:48:58,0.7596656680107117,0.9119072556495667,0.2041437327861786 +trm_baseline,182287,59826.120123505,16:37:06.12,2026-05-23 16:09:02,True,2026-05-23 16:09:02,0.7541900873184204,0.9094774723052979,0.2100999504327774 +trm_baseline,208328,68138.229200214,18:55:38.23,2026-05-23 18:27:34,True,2026-05-23 18:27:34,0.7732800841331482,0.9166732430458069,0.19410833716392517 +trm_baseline,234369,76460.482892161,21:14:20.48,2026-05-23 20:46:17,True,2026-05-23 20:46:17,0.7750374674797058,0.9172152280807495,0.19279460608959198 +trm_baseline,260410,84783.527271934,23:33:03.53,2026-05-23 23:05:00,True,2026-05-23 23:05:00,0.7742025256156921,0.9169425964355469,0.19348150491714478 +trm_multi4,26041,22216.1844709,06:10:16.18,2026-05-28 03:18:26,True,2026-05-28 03:18:26,0.7394047975540161,0.9067130088806152,0.21417337656021118 +trm_multi4,52082,44853.735386924,12:27:33.74,2026-05-28 09:35:44,True,2026-05-28 09:35:44,0.8449901342391968,0.9424432516098022,0.1342233270406723 -- cgit v1.2.3