From 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Sat, 13 Jun 2026 12:35:36 -0500 Subject: rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 --- ...ma03_Lonly_fdlyap_n512_seed20260602.summary.csv | 2 ++ ...4_sigma03_Lonly_n10000_seed20260602.summary.csv | 2 ++ .../tables/fig5_qhead_vs_lambda1_ptrm_summary.csv | 3 ++ .../tables/headline_trm_multi4_dynamics_table.csv | 4 +++ .../tables/hrm_baseline_eval.csv | 11 ++++++ .../tables/hrm_honly_step26040_eval.csv | 2 ++ .../tables/hrm_matched_compare.csv | 6 ++++ .../tables/hrm_multi4_complete_eval.csv | 11 ++++++ .../tables/hrm_multi4_eval.csv | 6 ++++ .../tables/hrm_multi4_horizon_sweep_768.csv | 41 ++++++++++++++++++++++ .../tables/hrm_trm_redesigned_summary.csv | 7 ++++ .../tables/meeting_figures_v2_report.md | 26 ++++++++++++++ ...ma03_Lonly_fdlyap_n512_seed20260602.summary.csv | 2 ++ ...4_sigma03_Lonly_n10000_seed20260602.summary.csv | 2 ++ .../paired_ptrm_k100_n1000_seed0_summary.csv | 2 ++ .../tables/summary_n512_k8_seed20260602.csv | 4 +++ .../tables/summary_n64_k8_seed20260602.csv | 5 +++ .../tables/trm_baseline_eval.csv | 11 ++++++ ...4_step65100_det_n10000_seed20260602.summary.csv | 2 ++ .../tables/trm_matched_compare.csv | 2 ++ .../tables/trm_multi4_eval.csv | 2 ++ .../tables/trm_multi4_eval_full.csv | 11 ++++++ .../tables/trm_official_gbs768_eval.csv | 11 ++++++ .../tables/trm_official_gbs768_multi4_eval.csv | 21 +++++++++++ .../trm_official_gbs768_multi4_step16275_eval.csv | 2 ++ .../tables/wallclock_eval.csv | 30 ++++++++++++++++ 26 files changed, 228 insertions(+) create mode 100644 research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/fig5_qhead_vs_lambda1_ptrm_summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/headline_trm_multi4_dynamics_table.csv create mode 100644 research/flossing/report_bundle_20260603/tables/hrm_baseline_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/hrm_honly_step26040_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/hrm_matched_compare.csv create mode 100644 research/flossing/report_bundle_20260603/tables/hrm_multi4_complete_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/hrm_multi4_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/hrm_multi4_horizon_sweep_768.csv create mode 100644 research/flossing/report_bundle_20260603/tables/hrm_trm_redesigned_summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/meeting_figures_v2_report.md create mode 100644 research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/paired_ptrm_k100_n1000_seed0_summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/summary_n512_k8_seed20260602.csv create mode 100644 research/flossing/report_bundle_20260603/tables/summary_n64_k8_seed20260602.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_baseline_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_matched_compare.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_multi4_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_multi4_eval_full.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_official_gbs768_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_step16275_eval.csv create mode 100644 research/flossing/report_bundle_20260603/tables/wallclock_eval.csv (limited to 'research/flossing/report_bundle_20260603/tables') diff --git a/research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv b/research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv new file mode 100644 index 0000000..1078ef9 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv @@ -0,0 +1,2 @@ +correct_count/det_fail_mean,correct_count/det_success_mean,correct_count/full_frac,correct_count/ge_10_frac,correct_count/ge_1_frac,correct_count/ge_25_frac,correct_count/ge_5_frac,correct_count/mean,correct_count/median,correct_count/q10,correct_count/q25,correct_count/q75,correct_count/q90,correct_count/std,correct_count/zero_frac,deterministic/exact,deterministic/token_acc,fd_lyap,fd_spectrum_k,include_clean,lambda_fail_mean,lambda_mean,lambda_success_mean,lyap_min/exact,lyap_min/token_acc,mean_rollout/exact,mean_rollout/token_acc,n_samples,noise_std,oracle_pass/det_fail_frac,oracle_pass/det_success_frac,oracle_pass/exact,oracle_pass/token_acc,perturb_both,perturb_h,perturb_l,q_fail_mean,q_max/det_fail_frac,q_max/det_success_frac,q_max/exact,q_max/token_acc,q_mean,q_minus_0.25lambda/exact,q_minus_0.25lambda/token_acc,q_minus_0.5lambda/exact,q_minus_0.5lambda/token_acc,q_minus_1lambda/exact,q_minus_1lambda/token_acc,q_minus_2lambda/exact,q_minus_2lambda/token_acc,q_success_mean,rollout0/exact,rollout0/token_acc,rollouts,steps +15.185714721679688,24.78506851196289,0.859375,0.94921875,0.97265625,0.859375,0.96875,23.47265625,25.0,22.10000228881836,25.0,25.0,25.0,5.083601474761963,0.02734375,0.86328125,0.9498456716537476,1.0,0.0,0.0,6.921443462371826,5.711092472076416,5.632336139678955,0.97265625,0.989703893661499,0.9389062523841858,0.9769077897071838,512.0,0.3,0.800000011920929,1.0,0.97265625,0.9911988973617554,0.0,0.0,1.0,-10.65998649597168,0.800000011920929,1.0,0.97265625,0.989366352558136,6.669083118438721,0.97265625,0.9893181324005127,0.97265625,0.9891493320465088,0.97265625,0.989149272441864,0.97265625,0.9892698526382446,7.7966694831848145,0.94140625,0.9774305820465088,25.0,64.0 diff --git a/research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv b/research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv new file mode 100644 index 0000000..16c5a1b --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/base58590_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv @@ -0,0 +1,2 @@ +correct_count/det_fail_mean,correct_count/det_success_mean,correct_count/full_frac,correct_count/ge_10_frac,correct_count/ge_1_frac,correct_count/ge_25_frac,correct_count/ge_5_frac,correct_count/mean,correct_count/median,correct_count/q10,correct_count/q25,correct_count/q75,correct_count/q90,correct_count/std,correct_count/zero_frac,deterministic/exact,deterministic/token_acc,fd_lyap,fd_spectrum_k,include_clean,mean_rollout/exact,mean_rollout/token_acc,n_samples,noise_std,oracle_pass/det_fail_frac,oracle_pass/det_success_frac,oracle_pass/exact,oracle_pass/token_acc,perturb_both,perturb_h,perturb_l,q_max/det_fail_frac,q_max/det_success_frac,q_max/exact,q_max/token_acc,q_mean,rollout0/exact,rollout0/token_acc,rollouts,steps +13.544709205627441,24.74855613708496,0.847599983215332,0.944599986076355,0.9776999950408936,0.847599983215332,0.9599999785423279,23.2450008392334,25.0,20.0,25.0,25.0,25.0,5.4259724617004395,0.022299999371170998,0.8658000230789185,0.9498122930526733,0.0,0.0,0.0,0.9297999739646912,0.9728542566299438,10000.0,0.3,0.8338301181793213,1.0,0.9776999950408936,0.9922346472740173,0.0,0.0,1.0,0.8338301181793213,0.9998844861984253,0.9775999784469604,0.9908208847045898,6.504925727844238,0.9284999966621399,0.9722283482551575,25.0,64.0 diff --git a/research/flossing/report_bundle_20260603/tables/fig5_qhead_vs_lambda1_ptrm_summary.csv b/research/flossing/report_bundle_20260603/tables/fig5_qhead_vs_lambda1_ptrm_summary.csv new file mode 100644 index 0000000..bcbf401 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/fig5_qhead_vs_lambda1_ptrm_summary.csv @@ -0,0 +1,3 @@ +name,path,n_samples,rollouts,mean_rollout_exact,q_max_exact,lambda_min_exact,oracle_pass_exact,q_lambda_same_argmax_frac,global_pearson_q_vs_stability,global_spearman_q_vs_stability,within_problem_pearson_mean,within_problem_spearman_mean,q_success_mean,q_fail_mean,lambda_success_mean,lambda_fail_mean,mixed_problem_count,zero_success_problem_count,full_success_problem_count,mixed_global_pearson_q_vs_stability,mixed_q_max_exact,mixed_lambda_min_exact,mixed_oracle_exact +TRM baseline + PTRM rollouts,research/flossing/q_lambda_scatter/base58590_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.npz,512.0,25.0,0.93890625,0.97265625,0.97265625,0.97265625,0.04296875,0.7524171235289728,-0.025655130770112743,0.12497240516050691,0.09656841990607949,7.796669578964886,-10.659986413043478,5.632336168325687,6.921443493469901,58.0,14.0,440.0,0.7857503437605575,1.0,1.0,1.0 +TRM multi4 + PTRM rollouts,research/flossing/q_lambda_scatter/multi4_35805_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.npz,512.0,25.0,0.944609375,0.974609375,0.974609375,0.974609375,0.0859375,0.7289674235313497,-0.13633997124282113,0.18866459504767938,0.11859038341758327,7.7404597169382185,-10.081937588152327,5.7459801223593345,7.271883726456269,43.0,13.0,456.0,0.7911647653323942,1.0,1.0,1.0 diff --git a/research/flossing/report_bundle_20260603/tables/headline_trm_multi4_dynamics_table.csv b/research/flossing/report_bundle_20260603/tables/headline_trm_multi4_dynamics_table.csv new file mode 100644 index 0000000..742ac68 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/headline_trm_multi4_dynamics_table.csv @@ -0,0 +1,4 @@ +model,step,full_exact,full_token_acc,lm_loss,dyn_sample_exact,lambda1_all,mean8_all,tail4_all,pos_count_all +TRM baseline best,58590,0.8686309456825256,0.9508475661277772,0.1155559569597244,0.875,0.02823458132615997,0.013457294571722192,0.0075273313675370546,7.841796875 +TRM multi4 best,35805,0.8964653611183167,0.9604493975639344,0.0945562794804573,0.900390625,0.020381716455975862,0.0065844104191477015,0.001402141885882835,3.841796875 +TRM multi4 final,65100,0.8350536823272705,0.9350366592407228,0.1547555029392242,0.82421875,0.03232463403946895,0.018508151198432188,0.013372940185377047,8.0 diff --git a/research/flossing/report_bundle_20260603/tables/hrm_baseline_eval.csv b/research/flossing/report_bundle_20260603/tables/hrm_baseline_eval.csv new file mode 100644 index 0000000..afc2b27 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/hrm_baseline_eval.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +hrm_baseline,2604,0.016369983553886414,0.6336106061935425,0.8522627949714661,0.9927079081535339,0.020887982100248337,16 +hrm_baseline,5208,0.06169315055012703,0.6744286417961121,0.7430469989776611,0.9772745370864868,0.07281211763620377,16 +hrm_baseline,7812,0.1358133852481842,0.6985693573951721,0.676047682762146,0.996868371963501,0.0211492907255888,16 +hrm_baseline,10416,0.20248068869113922,0.7259970307350159,0.6480565667152405,0.9983821511268616,0.017285015434026718,16 +hrm_baseline,13020,0.3024248778820038,0.7580602169036865,0.5614010691642761,0.9971805810928345,0.02416178770363331,16 +hrm_baseline,15624,0.36705803871154785,0.7842973470687866,0.49664124846458435,0.9964591860771179,0.02256467007100582,16 +hrm_baseline,18228,0.46287721395492554,0.8077820539474487,0.4641122817993164,0.9953309893608093,0.02509351819753647,16 +hrm_baseline,20832,0.4912721812725067,0.8198283314704895,0.4137057960033417,0.9955745935440063,0.02796635963022709,16 +hrm_baseline,23436,0.5193856954574585,0.8260135054588318,0.41173747181892395,0.9975590705871582,0.024772455915808678,16 +hrm_baseline,26040,0.5265287756919861,0.8270824551582336,0.4161679148674011,0.9960216283798218,0.03298119083046913,16 diff --git a/research/flossing/report_bundle_20260603/tables/hrm_honly_step26040_eval.csv b/research/flossing/report_bundle_20260603/tables/hrm_honly_step26040_eval.csv new file mode 100644 index 0000000..d3e3ab9 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/hrm_honly_step26040_eval.csv @@ -0,0 +1,2 @@ +step,accuracy,exact_accuracy,lm_loss,q_halt_accuracy,q_halt_loss,steps +26040,0.85720146,0.62264127,0.35935268,0.99625105,0.024241636,16.0 diff --git a/research/flossing/report_bundle_20260603/tables/hrm_matched_compare.csv b/research/flossing/report_bundle_20260603/tables/hrm_matched_compare.csv new file mode 100644 index 0000000..71b8e16 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/hrm_matched_compare.csv @@ -0,0 +1,6 @@ +family,step,base_exact,multi4_exact,delta_exact,base_acc,multi4_acc,delta_acc,base_loss,multi4_loss,delta_loss,base_steps,multi4_steps +hrm,2604,0.016369983553886414,0.0123774204403162,-0.003992563113570213,0.6336106061935425,0.6303551197052002,-0.003255486488342285,0.8522627949714661,0.8603715300559998,0.008108735084533691,16,16 +hrm,5208,0.06169315055012703,0.025012653321027756,-0.036680497229099274,0.6744286417961121,0.6685170531272888,-0.005911588668823242,0.7430469989776611,0.7412638068199158,-0.0017831921577453613,16,16 +hrm,7812,0.1358133852481842,0.04651052877306938,-0.08930285647511482,0.6985693573951721,0.687346339225769,-0.011223018169403076,0.676047682762146,0.7044885158538818,0.02844083309173584,16,16 +hrm,10416,0.20248068869113922,0.2006617933511734,-0.0018188953399658203,0.7259970307350159,0.7243355512619019,-0.0016614794731140137,0.6480565667152405,0.6340938210487366,-0.013962745666503906,16,16 +hrm,13020,0.3024248778820038,0.34697696566581726,0.04455208778381348,0.7580602169036865,0.7794705033302307,0.02141028642654419,0.5614010691642761,0.49915269017219543,-0.06224837899208069,16,16 diff --git a/research/flossing/report_bundle_20260603/tables/hrm_multi4_complete_eval.csv b/research/flossing/report_bundle_20260603/tables/hrm_multi4_complete_eval.csv new file mode 100644 index 0000000..9aac158 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/hrm_multi4_complete_eval.csv @@ -0,0 +1,11 @@ +run,step,exact,accuracy,loss +hrm_multi4,2604,0.0123774204403162,0.6303551197052002,0.8603715300559998 +hrm_multi4,5208,0.025012653321027756,0.6685170531272888,0.7412638068199158 +hrm_multi4,7812,0.04651052877306938,0.687346339225769,0.7044885158538818 +hrm_multi4,10416,0.2006617933511734,0.7243355512619019,0.6340938210487366 +hrm_multi4,13020,0.34697696566581726,0.7794705033302307,0.49915269017219543 +hrm_multi4,15624,0.4653252363204956,0.8156101703643799,0.42743897438049316 +hrm_multi4,18228,0.5790873169898987,0.8494690656661987,0.3459467887878418 +hrm_multi4,20832,0.6393187,0.8660642,0.3186217 +hrm_multi4,23436,0.6443189,0.8684137,0.30427682 +hrm_multi4,26040,0.46235448,0.80298746,0.603943 diff --git a/research/flossing/report_bundle_20260603/tables/hrm_multi4_eval.csv b/research/flossing/report_bundle_20260603/tables/hrm_multi4_eval.csv new file mode 100644 index 0000000..0f7dbc9 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/hrm_multi4_eval.csv @@ -0,0 +1,6 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +hrm_multi4,2604,0.0123774204403162,0.6303551197052002,0.8603715300559998,0.9879773855209351,0.020903315395116806,16 +hrm_multi4,5208,0.025012653321027756,0.6685170531272888,0.7412638068199158,0.9968069195747375,0.019309692084789276,16 +hrm_multi4,7812,0.04651052877306938,0.687346339225769,0.7044885158538818,0.9902976751327515,0.03646523132920265,16 +hrm_multi4,10416,0.2006617933511734,0.7243355512619019,0.6340938210487366,0.9991532564163208,0.011545917019248009,16 +hrm_multi4,13020,0.34697696566581726,0.7794705033302307,0.49915269017219543,0.9987062215805054,0.01581510715186596,16 diff --git a/research/flossing/report_bundle_20260603/tables/hrm_multi4_horizon_sweep_768.csv b/research/flossing/report_bundle_20260603/tables/hrm_multi4_horizon_sweep_768.csv new file mode 100644 index 0000000..3bdace3 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/hrm_multi4_horizon_sweep_768.csv @@ -0,0 +1,41 @@ +step,split,horizon,count,exact_accuracy,accuracy,lm_loss,q_halt_loss,steps +23436,train,2,768.0,0.2955729166666667,0.8364840348561605,0.39225157833573504,0.04459714392820994,2.0 +23436,train,3,768.0,0.5755208333333334,0.8917663097381592,0.2618181909650845,0.01976812755068143,3.0 +23436,train,4,768.0,0.6796875,0.9122781753540039,0.21020127075343642,0.006546831379334132,4.0 +23436,train,5,768.0,0.734375,0.9230967362721761,0.182652537414814,0.00717167928814888,5.0 +23436,train,6,768.0,0.7669270833333334,0.9298482735951742,0.16631383252369117,0.00374875341852506,6.0 +23436,train,8,768.0,0.8033854166666666,0.9389146169026693,0.14464881393209095,0.0037066793690125146,8.0 +23436,train,10,768.0,0.8268229166666666,0.9462127685546875,0.12909535948177878,0.00991838239133358,10.0 +23436,train,12,768.0,0.84375,0.9500546455383301,0.11931335874268405,0.004430865868926048,12.0 +23436,train,14,768.0,0.85546875,0.9517585436503092,0.11357831630187847,0.0033199011037747064,14.0 +23436,train,16,768.0,0.859375,0.9536232948303223,0.11014115689326769,0.003578242535392443,16.0 +23436,test,2,768.0,0.12760416666666666,0.7265946865081787,0.6354224683840087,0.025561923782030743,2.0 +23436,test,3,768.0,0.2526041666666667,0.7604809602101644,0.5506175992783068,0.012921225279569626,3.0 +23436,test,4,768.0,0.3138020833333333,0.7742091019948324,0.515490498860075,0.003827621228992939,4.0 +23436,test,5,768.0,0.3619791666666667,0.7867315610249838,0.48693293612013444,0.00506168728073438,5.0 +23436,test,6,768.0,0.3984375,0.7953317960103353,0.46390732880926117,0.0035611667359868684,6.0 +23436,test,8,768.0,0.4583333333333333,0.8105709552764893,0.4314775246277862,0.005083844686547915,8.0 +23436,test,10,768.0,0.48828125,0.8183674812316895,0.4161860321298086,0.003043775757153829,10.0 +23436,test,12,768.0,0.51171875,0.8260512351989746,0.4016971584101859,0.004879387095570564,12.0 +23436,test,14,768.0,0.5325520833333334,0.831050713857015,0.3878147863194716,0.00292336226751407,14.0 +23436,test,16,768.0,0.5559895833333334,0.8361946741739908,0.37473119346002354,0.0035912382105986276,16.0 +26040,train,2,768.0,0.7825520833333334,0.963814894358317,0.09390118849629236,0.04166571795940399,2.0 +26040,train,3,768.0,0.9127604166666666,0.9810153643290201,0.051538720202730794,0.008062846958637238,3.0 +26040,train,4,768.0,0.9309895833333334,0.9841659863789877,0.04344159450663771,0.012521501630544662,4.0 +26040,train,5,768.0,0.9401041666666666,0.9858539899190267,0.04013312268752094,0.015388640264670054,5.0 +26040,train,6,768.0,0.9401041666666666,0.9870595932006836,0.03853385294570503,0.023091336091359455,6.0 +26040,train,8,768.0,0.94921875,0.9884098370869955,0.03620980748266996,0.01947430024544398,8.0 +26040,train,10,768.0,0.9557291666666666,0.9893261591593424,0.03444665149376691,0.03555429975191752,10.0 +26040,train,12,768.0,0.953125,0.9895029067993164,0.034414704500878884,0.04184401035308838,12.0 +26040,train,14,768.0,0.9557291666666666,0.9893904527028402,0.03586123670240371,0.037144094705581665,14.0 +26040,train,16,768.0,0.9518229166666666,0.9894386132558187,0.03654265702438753,0.041033936043580375,16.0 +26040,test,2,768.0,0.11588541666666667,0.7054398854573568,0.8908620335632751,0.040651207168896995,2.0 +26040,test,3,768.0,0.1875,0.7230902512868246,0.7620792943957264,0.03934991856416067,3.0 +26040,test,4,768.0,0.23567708333333334,0.731706460316976,0.7335753038165317,0.11375004053115845,4.0 +26040,test,5,768.0,0.25,0.7362075646718343,0.7385935210540664,0.160938690106074,5.0 +26040,test,6,768.0,0.2669270833333333,0.7397923469543457,0.743103274276764,0.17507813374201456,6.0 +26040,test,8,768.0,0.296875,0.7452899614969889,0.7468322750276379,0.19044278065363565,8.0 +26040,test,10,768.0,0.3138020833333333,0.7472190856933594,0.7515058945457195,0.21485831340154013,10.0 +26040,test,12,768.0,0.3229166666666667,0.7502892812093099,0.7620030015396181,0.22753063837687174,12.0 +26040,test,14,768.0,0.3307291666666667,0.7518165111541748,0.7553974518406182,0.2158371408780416,14.0 +26040,test,16,768.0,0.3346354166666667,0.7514950434366862,0.7639393310889216,0.23628832896550497,16.0 diff --git a/research/flossing/report_bundle_20260603/tables/hrm_trm_redesigned_summary.csv b/research/flossing/report_bundle_20260603/tables/hrm_trm_redesigned_summary.csv new file mode 100644 index 0000000..c5a4ad0 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/hrm_trm_redesigned_summary.csv @@ -0,0 +1,7 @@ +model,label,step,full_exact,sample_exact,lambda1_all,mean8_all,pos_count_all,lambda1_success,lambda1_fail,mean8_success,mean8_fail,pos_count_success,pos_count_fail +HRM,baseline best,26040,0.5265287756919861,0.5,-0.0569394779099639,-0.10732079181130239,0.98046875,-0.14642834789538028,0.03254939207545249,-0.18957092847483636,-0.025070655147768406,0.01953125,1.94140625 +HRM,multi4 best,23436,0.6443189,0.654296875,-0.04733450568929953,-0.112159715874796,1.166015625,-0.10103540500250659,0.054302789621007604,-0.16998952501653278,-0.0027078172167066595,0.05970149253731343,3.2598870056497176 +HRM,multi4 final,26040,0.46235448,0.4296875,0.02874533511322852,-0.040662085491063316,1.626953125,0.03573289099003887,0.02348073821974127,-0.053027883530268646,-0.031345388338237384,1.8181818181818181,1.4828767123287672 +TRM,baseline best,58590,0.8686309456825256,0.875,0.02823458132615997,0.013457294571722192,7.841796875,0.01761167685357837,0.10259491263423115,0.008003413600119422,0.05163446137294159,7.819196428571429,8.0 +TRM,multi4 best,35805,0.8964653611183167,0.900390625,0.020381716455975862,0.0065844104191477015,3.841796875,0.01118387160416574,0.10352301992037717,0.0019524994650864183,0.04845325257252518,3.3817787418655096,8.0 +TRM,multi4 final,65100,0.8350536823272705,0.82421875,0.03232463403946895,0.018508151198432188,8.0,0.01912124014472792,0.09423388096814354,0.012883653437293365,0.04488079625621645,8.0,8.0 diff --git a/research/flossing/report_bundle_20260603/tables/meeting_figures_v2_report.md b/research/flossing/report_bundle_20260603/tables/meeting_figures_v2_report.md new file mode 100644 index 0000000..5158c99 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/meeting_figures_v2_report.md @@ -0,0 +1,26 @@ +# Meeting Figures v2 + +## Figure Strategy + +0. `fig0_motivation_lambda1_success_failure_hrm_trm.png`: first-exponent success/failure distribution in HRM and TRM. This motivates chaos as a detector before introducing the method. +1. `fig1_hrm_trm_training_curves.png`: performance over training for HRM and TRM. This answers whether the method improves accuracy and where best/final are. +2. `fig2_accuracy_vs_chaotic_volume_phase.png`: phase view, with accuracy versus mean top-8 Lyapunov exponent. This answers whether better checkpoints are dynamically more stable. +3. `fig3_hrm_trm_success_failure_spectra.png`: full success/failure spectrum separation for HRM and TRM best checkpoints. This extends Fig0 beyond λ1. +4. `fig4_ptrm_same_subset_comparison.png`: PTRM same-subset result. This is a secondary inference-time story. +5. `fig5_qhead_vs_lambda1_ptrm.png`: PTRM Q-head halt logit versus finite-difference stability proxy `-lambda_1`. The bottom row isolates mixed problems where trajectory selection actually matters. + +## Key Numbers + +- HRM baseline best: 0.5265 exact. HRM multi4 best: 0.6443 exact. HRM multi4 final: 0.4624 exact. +- TRM baseline best: 0.8686 exact. TRM multi4 best: 0.8965 exact. TRM multi4 final: 0.8351 exact. +- HRM multi4 best dynamics sample: mean top-8 exponent -0.1122; final -0.0407. +- TRM multi4 best dynamics sample: mean top-8 exponent +0.0066; final +0.0185. +- PTRM same subset, K=100: Q-selected 0.984 -> 0.988; mean rollout 0.942 -> 0.954. +- PTRM Q-vs-stability, K=25/N=512: mixed-problem Pearson is 0.786 for baseline and 0.791 for multi4. In both runs, Q-max selection and lambda-min selection reach the same oracle exact accuracy on this subset. + +## Caveats + +- Dynamics spectra use N=512 diagnostic samples, not the full test set. +- PTRM numbers use a fixed N=1000 subset; do not mix its deterministic subset accuracy with full-test W&B exact accuracy. +- Final checkpoints are collapse diagnostics, not the method's reported performance. +- Q-head is not a pure lambda ranker: global Spearman is weak because most problems are all-success/all-failure across K rollouts. The strongest evidence is the mixed-problem class separation and selector equivalence. diff --git a/research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv b/research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv new file mode 100644 index 0000000..7d15a10 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_fdlyap_n512_seed20260602.summary.csv @@ -0,0 +1,2 @@ +correct_count/det_fail_mean,correct_count/det_success_mean,correct_count/full_frac,correct_count/ge_10_frac,correct_count/ge_1_frac,correct_count/ge_25_frac,correct_count/ge_5_frac,correct_count/mean,correct_count/median,correct_count/q10,correct_count/q25,correct_count/q75,correct_count/q90,correct_count/std,correct_count/zero_frac,deterministic/exact,deterministic/token_acc,fd_lyap,fd_spectrum_k,include_clean,lambda_fail_mean,lambda_mean,lambda_success_mean,lyap_min/exact,lyap_min/token_acc,mean_rollout/exact,mean_rollout/token_acc,n_samples,noise_std,oracle_pass/det_fail_frac,oracle_pass/det_success_frac,oracle_pass/exact,oracle_pass/token_acc,perturb_both,perturb_h,perturb_l,q_fail_mean,q_max/det_fail_frac,q_max/det_success_frac,q_max/exact,q_max/token_acc,q_mean,q_minus_0.25lambda/exact,q_minus_0.25lambda/token_acc,q_minus_0.5lambda/exact,q_minus_0.5lambda/token_acc,q_minus_1lambda/exact,q_minus_1lambda/token_acc,q_minus_2lambda/exact,q_minus_2lambda/token_acc,q_success_mean,rollout0/exact,rollout0/token_acc,rollouts,steps +13.054545402526855,24.886215209960938,0.890625,0.953125,0.974609375,0.890625,0.966796875,23.615234375,25.0,24.0,25.0,25.0,25.0,5.043018341064453,0.025390625,0.892578125,0.9596836566925049,1.0,0.0,0.0,7.271883964538574,5.830501556396484,5.7459797859191895,0.974609375,0.9897279739379883,0.9446094036102295,0.9784027934074402,512.0,0.3,0.7636363506317139,1.0,0.974609375,0.9913435578346252,0.0,0.0,1.0,-10.081937789916992,0.7636363506317139,1.0,0.974609375,0.9889563918113708,6.753265857696533,0.974609375,0.9889563918113708,0.974609375,0.9890046119689941,0.974609375,0.9889804720878601,0.974609375,0.9889563918113708,7.74045991897583,0.9375,0.9766348004341125,25.0,64.0 diff --git a/research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv b/research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv new file mode 100644 index 0000000..f991531 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/multi4_35805_k25_d64_sigma03_Lonly_n10000_seed20260602.summary.csv @@ -0,0 +1,2 @@ +correct_count/det_fail_mean,correct_count/det_success_mean,correct_count/full_frac,correct_count/ge_10_frac,correct_count/ge_1_frac,correct_count/ge_25_frac,correct_count/ge_5_frac,correct_count/mean,correct_count/median,correct_count/q10,correct_count/q25,correct_count/q75,correct_count/q90,correct_count/std,correct_count/zero_frac,deterministic/exact,deterministic/token_acc,fd_lyap,fd_spectrum_k,include_clean,mean_rollout/exact,mean_rollout/token_acc,n_samples,noise_std,oracle_pass/det_fail_frac,oracle_pass/det_success_frac,oracle_pass/exact,oracle_pass/token_acc,perturb_both,perturb_h,perturb_l,q_max/det_fail_frac,q_max/det_success_frac,q_max/exact,q_max/token_acc,q_mean,rollout0/exact,rollout0/token_acc,rollouts,steps +13.846529960632324,24.8140811920166,0.8842999935150146,0.9599000215530396,0.9828000068664551,0.8842999935150146,0.9696999788284302,23.692100524902344,25.0,24.0,25.0,25.0,25.0,4.753871917724609,0.01720000058412552,0.8977000117301941,0.9607678651809692,0.0,0.0,0.0,0.9476839900016785,0.9795129895210266,10000.0,0.3,0.8328445553779602,0.9998885989189148,0.9828000068664551,0.9940740466117859,0.0,0.0,1.0,0.829912006855011,0.9995543956756592,0.982200026512146,0.9926013946533203,6.862946510314941,0.9480999708175659,0.979781448841095,25.0,64.0 diff --git a/research/flossing/report_bundle_20260603/tables/paired_ptrm_k100_n1000_seed0_summary.csv b/research/flossing/report_bundle_20260603/tables/paired_ptrm_k100_n1000_seed0_summary.csv new file mode 100644 index 0000000..8122604 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/paired_ptrm_k100_n1000_seed0_summary.csv @@ -0,0 +1,2 @@ +n,rollouts,base_det,multi4_det,delta_det,base_mean_rollout,multi4_mean_rollout,delta_mean_rollout,base_qmax,multi4_qmax,delta_qmax,base_oracle,multi4_oracle,delta_oracle,base_correct_count_mean,multi4_correct_count_mean,delta_correct_count_mean,det_base_only_frac,det_multi4_only_frac,oracle_base_only_frac,oracle_multi4_only_frac +1000,100,0.887,0.911,0.02400000000000002,0.94188,0.95417,0.012289999999999912,0.984,0.988,0.0040000000000000036,0.985,0.988,0.0030000000000000027,94.188,95.417,1.2289999999999992,0.034,0.058,0.001,0.004 diff --git a/research/flossing/report_bundle_20260603/tables/summary_n512_k8_seed20260602.csv b/research/flossing/report_bundle_20260603/tables/summary_n512_k8_seed20260602.csv new file mode 100644 index 0000000..c255252 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/summary_n512_k8_seed20260602.csv @@ -0,0 +1,4 @@ +run,step,n,sample_exact,sample_token_acc,lambda1_all,lambda1_success,lambda1_fail,lambda1_fail_minus_success,mean8_all,mean8_success,mean8_fail,mean8_fail_minus_success,tail4_all,tail4_success,tail4_fail,pos_count_all,pos_count_success,pos_count_fail,pos_mass_all,pos_mass_success,pos_mass_fail,lambda2_all,lambda2_success,lambda2_fail,lambda3_all,lambda3_success,lambda3_fail,lambda4_all,lambda4_success,lambda4_fail,lambda5_all,lambda5_success,lambda5_fail,lambda6_all,lambda6_success,lambda6_fail,lambda7_all,lambda7_success,lambda7_fail,lambda8_all,lambda8_success,lambda8_fail +baseline_best,58590,512,0.875,0.9539689430384897,0.02823458132615997,0.01761167685357837,0.10259491263423115,0.08498323578065278,0.013457294571722192,0.008003413600119422,0.05163446137294159,0.04363104777282217,0.0075273313675370546,0.004122858266632009,0.03135864307387237,7.841796875,7.819196428571429,8.0,0.10779670139772268,0.0641854171711784,0.4130756909835327,0.0205107267238418,0.012619587999194794,0.07574869779637083,0.016147883449207256,0.0097831444752176,0.060701056267134845,0.012655839604420294,0.007521466406436568,0.04859645199030638,0.010066905798097991,0.00580994511748096,0.03986563056241721,0.00827578879948021,0.00458740731747704,0.0340944591735024,0.006620997387619565,0.003523945934349396,0.028300357560510747,0.0051456334849504515,0.00257013469722064,0.02317412499905913 +multi4_best,35805,512,0.900390625,0.9625048226444051,0.020381716455975862,0.01118387160416574,0.10352301992037717,0.09233914831621143,0.0065844104191477015,0.0019524994650864183,0.04845325257252518,0.046500753107438765,0.001402141885882835,-0.0014188478887233206,0.02690167690732279,3.841796875,3.3817787418655096,8.0,0.06377898653446445,0.027952091227886174,0.38762602058020146,0.012663036363773195,0.0057181008775463405,0.07543980615103946,0.008348809059507634,0.003078277385450445,0.055990281642652025,0.005673153930393582,0.0013151374084221035,0.04506620523684165,0.0034857525132654388,-9.921478389921372e-05,0.035891045140577296,0.002008319042374751,-0.001002505589948276,0.02922381228749074,0.0006402704918926361,-0.0019078789586364823,0.023673542976087213,-0.0005257745040014861,-0.0026657922224093103,0.018818307225135902 +multi4_final,65100,512,0.82421875,0.9289882326847874,0.03232463403946895,0.01912124014472792,0.09423388096814354,0.07511264082341562,0.018508151198432188,0.012883653437293365,0.04488079625621645,0.03199714281892309,0.013372940185377047,0.010672304166179654,0.026035922408724824,8.0,8.0,8.0,0.1480652095874575,0.10306922749834692,0.3590463700497316,0.02459628393262392,0.015458292881759563,0.0674433086377879,0.020185894951282535,0.013495850063401376,0.05155477209223641,0.017466635922573914,0.01230462774373944,0.041670718716664445,0.015463502108104876,0.011429727322452865,0.034377423880828754,0.013748909759215167,0.010808219788353272,0.027537478289256494,0.012621539073734311,0.010437874606770786,0.022860499129941068,0.011657809800453833,0.010013394947141692,0.019368288334872988 diff --git a/research/flossing/report_bundle_20260603/tables/summary_n64_k8_seed20260602.csv b/research/flossing/report_bundle_20260603/tables/summary_n64_k8_seed20260602.csv new file mode 100644 index 0000000..5a12638 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/summary_n64_k8_seed20260602.csv @@ -0,0 +1,5 @@ +name,n,acc,lambda1_all,lambda1_success,lambda1_fail,mean8_all,mean8_success,mean8_fail,tail4_all,pos_count_all,pos_count_success,pos_count_fail,pos_mass_all +baseline_best,64,0.875,0.027470633,0.015822656,0.109006464,0.013350397,0.0077010575,0.052895777,0.0074741757,7.796875,7.767857142857143,8.0,0.10691354 +multi4_bestish,64,0.953125,0.016227467,0.012409109,0.09386742,0.004405942,0.002358008,0.046047267,-0.00011889404,3.375,3.1475409836065573,8.0,0.048455432 +multi4_late,64,0.84375,0.031355858,0.01815009,0.10266701,0.018412568,0.01310199,0.047089677,0.013622271,8.0,8.0,8.0,0.14730054 +multi4_final,64,0.859375,0.028622076,0.01753769,0.09636,0.01713102,0.01240586,0.046007,0.012961711,8.0,8.0,8.0,0.13704816 diff --git a/research/flossing/report_bundle_20260603/tables/trm_baseline_eval.csv b/research/flossing/report_bundle_20260603/tables/trm_baseline_eval.csv new file mode 100644 index 0000000..0b6bc11 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_baseline_eval.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_baseline,26041,0.5575894117355347,0.8469749093055725,0.35285070538520813,0.9997445344924927,0.003797376062721014,16 +trm_baseline,52082,0.6295099854469299,0.8704391121864319,0.2973524034023285,0.9998486042022705,0.0011324305087327957,16 +trm_baseline,78123,0.6993892788887024,0.8920264840126038,0.2477506548166275,0.9998935461044312,0.0006260558147914708,16 +trm_baseline,104164,0.72928386926651,0.9020143151283264,0.22608688473701477,0.9998770356178284,0.001141014276072383,16 +trm_baseline,130205,0.7653990387916565,0.914251446723938,0.19878524541854858,0.999917209148407,0.0010435190051794052,16 +trm_baseline,156246,0.7596656680107117,0.9119072556495667,0.2041437327861786,0.999862790107727,0.0011596218682825565,16 +trm_baseline,182287,0.7541900873184204,0.9094774723052979,0.2100999504327774,0.9998249411582947,0.0013093978632241488,16 +trm_baseline,208328,0.7732800841331482,0.9166732430458069,0.19410833716392517,0.9998320937156677,0.0033004307188093662,16 +trm_baseline,234369,0.7750374674797058,0.9172152280807495,0.19279460608959198,0.9998533725738525,0.0032994903158396482,16 +trm_baseline,260410,0.7742025256156921,0.9169425964355469,0.19348150491714478,0.9998462796211243,0.002894919365644455,16 diff --git a/research/flossing/report_bundle_20260603/tables/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv b/research/flossing/report_bundle_20260603/tables/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv new file mode 100644 index 0000000..38ca8f5 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_gbs768_multi4_step65100_det_n10000_seed20260602.summary.csv @@ -0,0 +1,2 @@ +correct_count/det_fail_mean,correct_count/det_success_mean,correct_count/full_frac,correct_count/ge_1_frac,correct_count/mean,correct_count/median,correct_count/q10,correct_count/q25,correct_count/q75,correct_count/q90,correct_count/std,correct_count/zero_frac,deterministic/exact,deterministic/token_acc,fd_lyap,fd_spectrum_k,include_clean,mean_rollout/exact,mean_rollout/token_acc,n_samples,noise_std,oracle_pass/det_fail_frac,oracle_pass/det_success_frac,oracle_pass/exact,oracle_pass/token_acc,perturb_both,perturb_h,perturb_l,q_max/det_fail_frac,q_max/det_success_frac,q_max/exact,q_max/token_acc,q_mean,rollout0/exact,rollout0/token_acc,rollouts,steps +0.0,1.0,0.8312000036239624,0.8312000036239624,0.8312000036239624,1.0,0.0,1.0,1.0,1.0,0.3745751678943634,0.1687999963760376,0.8312000036239624,0.9333752989768982,0.0,0.0,0.0,0.8312000036239624,0.9333752989768982,10000.0,0.0,0.0,1.0,0.8312000036239624,0.9333752989768982,1.0,0.0,0.0,0.0,1.0,0.8312000036239624,0.9333752989768982,3.762489080429077,0.8312000036239624,0.9333752989768982,1.0,16.0 diff --git a/research/flossing/report_bundle_20260603/tables/trm_matched_compare.csv b/research/flossing/report_bundle_20260603/tables/trm_matched_compare.csv new file mode 100644 index 0000000..32006a5 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_matched_compare.csv @@ -0,0 +1,2 @@ +family,step,base_exact,multi4_exact,delta_exact,base_acc,multi4_acc,delta_acc,base_loss,multi4_loss,delta_loss,base_steps,multi4_steps +trm,26041,0.5575894117355347,0.7394047975540161,0.18181538581848145,0.8469749093055725,0.9067130088806152,0.059738099575042725,0.35285070538520813,0.21417337656021118,-0.13867732882499695,16,16 diff --git a/research/flossing/report_bundle_20260603/tables/trm_multi4_eval.csv b/research/flossing/report_bundle_20260603/tables/trm_multi4_eval.csv new file mode 100644 index 0000000..f3d16b5 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_multi4_eval.csv @@ -0,0 +1,2 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_multi4,26041,0.7394047975540161,0.9067130088806152,0.21417337656021118,0.9997232556343079,0.0024572687689214945,16 diff --git a/research/flossing/report_bundle_20260603/tables/trm_multi4_eval_full.csv b/research/flossing/report_bundle_20260603/tables/trm_multi4_eval_full.csv new file mode 100644 index 0000000..3411ff5 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_multi4_eval_full.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_multi4_loguniform_repro,26041,0.7394047975540161,0.9067130088806152,0.21417337656021118,0.9997232556343079,0.0024572687689214945,16 +trm_multi4_loguniform_repro,52082,0.8449901342391968,0.9424432516098022,0.1342233270406723,0.9998746514320374,0.0010290677892044187,16 +trm_multi4_loguniform_repro,78123,0.8417639136314392,0.9411031007766724,0.13769488036632538,0.9997493028640747,0.0022251552436500788,16 +trm_multi4_loguniform_repro,104164,0.8547161221504211,0.9456510543823242,0.12779666483402252,0.999834418296814,0.0019055778393521905,16 +trm_multi4_loguniform_repro,130205,0.8536233305931091,0.9453508853912354,0.1282763034105301,0.9998888373374939,0.0018422487191855907,16 +trm_multi4_loguniform_repro,156246,0.8489472270011902,0.9433866739273071,0.13273237645626068,0.999782383441925,0.003051365725696087,16 +trm_multi4_loguniform_repro,182287,0.8558868765830994,0.9459810256958008,0.12728126347064972,0.9997469186782837,0.0024399051908403635,16 +trm_multi4_loguniform_repro,208328,0.8404204249382019,0.9403222799301147,0.13971956074237823,0.9996499419212341,0.0029723909683525562,16 +trm_multi4_loguniform_repro,234369,0.845432460308075,0.9419097304344177,0.13668429851531982,0.9997681975364685,0.0016449446557089686,16 +trm_multi4_loguniform_repro,260410,0.826143741607666,0.9344042539596558,0.15468436479568481,0.999701976776123,0.001810370129533112,16 diff --git a/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_eval.csv b/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_eval.csv new file mode 100644 index 0000000..66dfe1b --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_eval.csv @@ -0,0 +1,11 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps +trm_official_gbs768,6510,0.20006339251995087,0.7261562347412109,0.6155200004577637,0.9993779063224792,0.0063577014952898026,16 +trm_official_gbs768,13020,0.6248409152030945,0.8679871559143066,0.30110087990760803,0.9997587203979492,0.0018232133006677032,16 +trm_official_gbs768,19530,0.7098508477210999,0.8961462378501892,0.23719573020935059,0.9997185468673706,0.002359110629186034,16 +trm_official_gbs768,26040,0.7558197379112244,0.9113300442695618,0.20295456051826477,0.9997066855430603,0.0026622479781508446,16 +trm_official_gbs768,32550,0.7946407794952393,0.9247151017189026,0.17315243184566498,0.9997208714485168,0.0024344150442630053,16 +trm_official_gbs768,39060,0.8247435092926025,0.9351920485496521,0.14995059370994568,0.9996594190597534,0.002962446305900812,16 +trm_official_gbs768,45570,0.8479396104812622,0.9433117508888245,0.1323014795780182,0.9996806979179382,0.0026382978539913893,16 +trm_official_gbs768,52080,0.8633161783218384,0.9488447904586792,0.12009253352880478,0.9997137784957886,0.002587266732007265,16 +trm_official_gbs768,58590,0.8686309456825256,0.9508475661277771,0.11555595695972443,0.9998438954353333,0.0014915302162989974,16 +trm_official_gbs768,65100,0.86624675989151,0.9500409364700317,0.11748332530260086,0.9996523261070251,0.002605273388326168,16 diff --git a/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_eval.csv b/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_eval.csv new file mode 100644 index 0000000..ec353ce --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_eval.csv @@ -0,0 +1,21 @@ +run,step,all/exact_accuracy,all/accuracy,all/lm_loss,all/q_halt_accuracy,all/q_halt_loss,all/steps,_runtime,_timestamp +trm_official_gbs768_multi4,3255,0.0175999198108911,0.6504403352737427,0.7941210269927979,0.9824000597000122,0.108424387872219,16.0,12550.718766357,1780192940.452497 +trm_official_gbs768_multi4,6510,0.2277795374393463,0.7379271984100342,0.5934479236602783,0.999262034893036,0.0035974220372736,16.0,25064.702453699,1780205454.4563622 +trm_official_gbs768_multi4,9765,0.6357069611549377,0.8709613084793091,0.2931223213672638,0.9997137784957886,0.0014165197499096,16.0,37579.311725746,1780217969.132149 +trm_official_gbs768_multi4,13020,0.75853031873703,0.910834014415741,0.2033131867647171,0.9997611045837402,0.0021604059729725,16.0,50092.402719112,1780230482.2738986 +trm_official_gbs768_multi4,16275,0.8263778686523438,0.9346956610679626,0.1507271528244018,0.9997800588607788,0.0041020140051841,16.0,62595.579823935,1780242985.398887 +trm_official_gbs768_multi4,19530,0.8654520511627197,0.9488762617111206,0.1198361814022064,0.9997090697288512,0.0052023055031895,16.0,75096.299217356,1780255486.102359 +trm_official_gbs768_multi4,22785,0.8817486763000488,0.9549695253372192,0.1065462753176689,0.9997114539146424,0.0037784865126013,16.0,87607.26667599,1780267997.105077 +trm_official_gbs768_multi4,26040,0.8879267573356628,0.957356870174408,0.1014027148485183,0.999642848968506,0.0053329654037952,16.0,100118.838503384,1780280508.634669 +trm_official_gbs768_multi4,29295,0.8933455944061279,0.959309697151184,0.0971761047840118,0.9996144771575928,0.0041324645280838,16.0,112620.42551711,1780293010.168183 +trm_official_gbs768_multi4,32550,0.8962169885635376,0.960436463356018,0.0948082581162452,0.9996026158332824,0.0144010595977306,16.0,125134.755165262,1780305524.5418072 +trm_official_gbs768_multi4,35805,0.8964653611183167,0.9604493975639344,0.0945562794804573,0.9995790123939514,0.0068304436281323,16.0,137714.797781532,1780318104.6386163 +trm_official_gbs768_multi4,39060,0.8957250118255615,0.9601802825927734,0.0952448472380638,0.999541163444519,0.0185395441949367,16.0,150233.754675447,1780330623.5781178 +trm_official_gbs768_multi4,42315,0.888913094997406,0.9574248790740968,0.1008101180195808,0.9995600581169128,0.0061048995703458,16.0,162739.436977464,1780343129.2559526 +trm_official_gbs768_multi4,45570,0.8858169317245483,0.9561300873756408,0.1036654934287071,0.9994938373565674,0.0054858992807567,16.0,175251.082236918,1780355640.8929477 +trm_official_gbs768_multi4,48825,0.882732629776001,0.9547808170318604,0.1068678125739097,0.9994110465049744,0.0072551541961729,16.0,187778.40637965,1780368168.2221627 +trm_official_gbs768_multi4,52080,0.8782954216003418,0.9530280232429504,0.1104752272367477,0.99934720993042,0.0089566921815276,16.0,200285.373208387,1780380675.1846614 +trm_official_gbs768_multi4,55335,0.8694067597389221,0.94942307472229,0.1194151788949966,0.9987700581550598,0.0272370912134647,16.0,212785.271448664,1780393175.0770009 +trm_official_gbs768_multi4,58590,0.8620365858078003,0.9463443756103516,0.1268824934959411,0.9983466863632202,0.0163928251713514,16.0,225280.516080387,1780405670.3403552 +trm_official_gbs768_multi4,61845,0.850763738155365,0.941650390625,0.1378339380025863,0.9978050589561462,0.0215476993471384,16.0,237778.331426833,1780418168.1822684 +trm_official_gbs768_multi4,65100,0.8350536823272705,0.9350366592407228,0.1547555029392242,0.9962841868400574,0.0310162808746099,16.0,250280.395200839,1780430673.009492 diff --git a/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_step16275_eval.csv b/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_step16275_eval.csv new file mode 100644 index 0000000..a34ae57 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/trm_official_gbs768_multi4_step16275_eval.csv @@ -0,0 +1,2 @@ +step,accuracy,exact_accuracy,lm_loss,q_halt_accuracy,q_halt_loss,steps +16275,0.934574,0.82621706,0.15090619,0.99974453,0.00405054,16.0 diff --git a/research/flossing/report_bundle_20260603/tables/wallclock_eval.csv b/research/flossing/report_bundle_20260603/tables/wallclock_eval.csv new file mode 100644 index 0000000..268a924 --- /dev/null +++ b/research/flossing/report_bundle_20260603/tables/wallclock_eval.csv @@ -0,0 +1,30 @@ +run,step,runtime_sec,runtime_hms,timestamp_local,ckpt_exists,ckpt_mtime_local,exact,accuracy,loss +hrm_baseline,2604,1440.547846523,00:24:00.55,2026-05-22 06:32:03,True,2026-05-22 06:32:04,0.016369983553886414,0.6336106061935425,0.8522627949714661 +hrm_baseline,5208,2915.282126881,00:48:35.28,2026-05-22 06:56:38,True,2026-05-22 06:56:39,0.06169315055012703,0.6744286417961121,0.7430469989776611 +hrm_baseline,7812,3726.920135005,01:02:06.92,2026-05-22 07:10:09,True,2026-05-22 07:10:10,0.1358133852481842,0.6985693573951721,0.676047682762146 +hrm_baseline,10416,4458.114451179,01:14:18.11,2026-05-22 07:22:21,True,2026-05-22 07:22:22,0.20248068869113922,0.7259970307350159,0.6480565667152405 +hrm_baseline,13020,5801.015399971,01:36:41.02,2026-05-22 07:44:43,True,2026-05-22 07:44:45,0.3024248778820038,0.7580602169036865,0.5614010691642761 +hrm_baseline,15624,7219.496414876,02:00:19.50,2026-05-22 08:08:22,True,2026-05-22 08:08:23,0.36705803871154785,0.7842973470687866,0.49664124846458435 +hrm_baseline,18228,8616.18880162,02:23:36.19,2026-05-22 08:31:39,True,2026-05-22 08:31:40,0.46287721395492554,0.8077820539474487,0.4641122817993164 +hrm_baseline,20832,9904.193793478,02:45:04.19,2026-05-22 08:53:07,True,2026-05-22 08:53:08,0.4912721812725067,0.8198283314704895,0.4137057960033417 +hrm_baseline,23436,10634.360398376,02:57:14.36,2026-05-22 09:05:17,True,2026-05-22 09:05:18,0.5193856954574585,0.8260135054588318,0.41173747181892395 +hrm_baseline,26040,11366.743085113,03:09:26.74,2026-05-22 09:17:29,True,2026-05-22 09:17:30,0.5265287756919861,0.8270824551582336,0.4161679148674011 +hrm_multi4,2604,5810.841404196,01:36:50.84,2026-05-27 22:44:06,True,2026-05-27 22:44:07,0.0123774204403162,0.6303551197052002,0.8603715300559998 +hrm_multi4,5208,11587.805059004,03:13:07.81,2026-05-28 00:20:23,True,2026-05-28 00:20:24,0.025012653321027756,0.6685170531272888,0.7412638068199158 +hrm_multi4,7812,17365.375767937,04:49:25.38,2026-05-28 01:56:41,True,2026-05-28 01:56:42,0.04651052877306938,0.687346339225769,0.7044885158538818 +hrm_multi4,10416,23142.813901422,06:25:42.81,2026-05-28 03:32:58,True,2026-05-28 03:32:59,0.2006617933511734,0.7243355512619019,0.6340938210487366 +hrm_multi4,13020,29347.517355686,08:09:07.52,2026-05-28 05:16:23,True,2026-05-28 05:16:24,0.34697696566581726,0.7794705033302307,0.49915269017219543 +hrm_multi4,15624,35123.513458465,09:45:23.51,2026-05-28 06:52:39,True,2026-05-28 06:52:40,0.4653252363204956,0.8156101703643799,0.42743897438049316 +hrm_multi4,18228,40898.005173388,11:21:38.01,2026-05-28 08:28:54,True,2026-05-28 08:28:55,0.5790873169898987,0.8494690656661987,0.3459467887878418 +trm_baseline,26041,8593.415472305,02:23:13.42,2026-05-23 01:55:09,True,2026-05-23 01:55:10,0.5575894117355347,0.8469749093055725,0.35285070538520813 +trm_baseline,52082,17155.876633182,04:45:55.88,2026-05-23 04:17:52,True,2026-05-23 04:17:52,0.6295099854469299,0.8704391121864319,0.2973524034023285 +trm_baseline,78123,25719.395089913,07:08:39.40,2026-05-23 06:40:35,True,2026-05-23 06:40:36,0.6993892788887024,0.8920264840126038,0.2477506548166275 +trm_baseline,104164,34282.065662564,09:31:22.07,2026-05-23 09:03:18,True,2026-05-23 09:03:18,0.72928386926651,0.9020143151283264,0.22608688473701477 +trm_baseline,130205,42852.348430037,11:54:12.35,2026-05-23 11:26:08,True,2026-05-23 11:26:09,0.7653990387916565,0.914251446723938,0.19878524541854858 +trm_baseline,156246,51422.119869545,14:17:02.12,2026-05-23 13:48:58,True,2026-05-23 13:48:58,0.7596656680107117,0.9119072556495667,0.2041437327861786 +trm_baseline,182287,59826.120123505,16:37:06.12,2026-05-23 16:09:02,True,2026-05-23 16:09:02,0.7541900873184204,0.9094774723052979,0.2100999504327774 +trm_baseline,208328,68138.229200214,18:55:38.23,2026-05-23 18:27:34,True,2026-05-23 18:27:34,0.7732800841331482,0.9166732430458069,0.19410833716392517 +trm_baseline,234369,76460.482892161,21:14:20.48,2026-05-23 20:46:17,True,2026-05-23 20:46:17,0.7750374674797058,0.9172152280807495,0.19279460608959198 +trm_baseline,260410,84783.527271934,23:33:03.53,2026-05-23 23:05:00,True,2026-05-23 23:05:00,0.7742025256156921,0.9169425964355469,0.19348150491714478 +trm_multi4,26041,22216.1844709,06:10:16.18,2026-05-28 03:18:26,True,2026-05-28 03:18:26,0.7394047975540161,0.9067130088806152,0.21417337656021118 +trm_multi4,52082,44853.735386924,12:27:33.74,2026-05-28 09:35:44,True,2026-05-28 09:35:44,0.8449901342391968,0.9424432516098022,0.1342233270406723 -- cgit v1.2.3