[ { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_2604_512.npz", "step": 2604, "n": 512, "k": 8, "acc": 0.015625, "n_success": 8, "n_failure": 504, "raw_monotone_adjacent_fraction": 0.6551339285714286, "raw_col0_is_sample_max_fraction": 0.62890625, "raw_col0_success_mean": -0.12265677284449339, "raw_col0_failure_mean": -0.0977591768882814, "raw_col0_delta_failure_minus_success": 0.024897595956211993, "raw_col0_auc_failure": 0.6889880952380952, "lambda_max_success_mean": -0.1213220115751028, "lambda_max_failure_mean": -0.0949796658704087, "lambda_max_delta_failure_minus_success": 0.026342345704694112, "lambda_max_auc_failure": 0.6932043650793651, "mean8_success_mean": -0.19951309508178383, "mean8_failure_mean": -0.11536999984130648, "mean8_delta_failure_minus_success": 0.08414309524047735, "mean8_auc_failure": 0.9992559523809523, "tail_mean_5_8_success_mean": -0.23467964679002762, "tail_mean_5_8_failure_mean": -0.12418151276171326, "tail_mean_5_8_delta_failure_minus_success": 0.11049813402831436, "tail_mean_5_8_auc_failure": 1.0, "positive_sum_success_mean": 0.0, "positive_sum_failure_mean": 7.118396313181e-05, "positive_sum_delta_failure_minus_success": 7.118396313181e-05, "positive_sum_auc_failure": 0.5009920634920635, "positive_count_success_mean": 0.0, "positive_count_failure_mean": 0.001984126984126984, "positive_count_delta_failure_minus_success": 0.001984126984126984, "positive_count_auc_failure": 0.5009920634920635, "spread_success_mean": 0.13017353788018227, "spread_failure_mean": 0.034761710632001126, "spread_delta_failure_minus_success": -0.09541182724818115, "spread_auc_failure": 0.019345238095238096, "gap12_success_mean": 0.03395136073231697, "gap12_failure_mean": 0.01004286463712416, "gap12_delta_failure_minus_success": -0.023908496095192813, "gap12_auc_failure": 0.12177579365079365, "lambda_max_corr_token_acc": 0.11864046173536506, "mean8_corr_token_acc": -0.24317640561416, "tail_mean_5_8_corr_token_acc": -0.364745516214904, "positive_sum_corr_token_acc": 0.01654115981917942, "positive_count_corr_token_acc": 0.01654115981917943 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_5208_512.npz", "step": 5208, "n": 512, "k": 8, "acc": 0.048828125, "n_success": 25, "n_failure": 487, "raw_monotone_adjacent_fraction": 0.7368861607142857, "raw_col0_is_sample_max_fraction": 0.798828125, "raw_col0_success_mean": -0.18248083293437958, "raw_col0_failure_mean": -0.1252742203170514, "raw_col0_delta_failure_minus_success": 0.05720661261732818, "raw_col0_auc_failure": 0.8788501026694046, "lambda_max_success_mean": -0.18110741019248963, "lambda_max_failure_mean": -0.12377287614975867, "lambda_max_delta_failure_minus_success": 0.057334534042730964, "lambda_max_auc_failure": 0.8758110882956879, "mean8_success_mean": -0.22203324407339095, "mean8_failure_mean": -0.16840960289919615, "mean8_delta_failure_minus_success": 0.0536236411741948, "mean8_auc_failure": 0.9536755646817249, "tail_mean_5_8_success_mean": -0.24040259554982185, "tail_mean_5_8_failure_mean": -0.1877109366994986, "tail_mean_5_8_delta_failure_minus_success": 0.05269165885032326, "tail_mean_5_8_auc_failure": 0.9394661190965092, "positive_sum_success_mean": 0.0, "positive_sum_failure_mean": 0.0, "positive_sum_delta_failure_minus_success": 0.0, "positive_sum_auc_failure": 0.5, "positive_count_success_mean": 0.0, "positive_count_failure_mean": 0.0, "positive_count_delta_failure_minus_success": 0.0, "positive_count_auc_failure": 0.5, "spread_success_mean": 0.06972571730613708, "spread_failure_mean": 0.07497160217019078, "spread_delta_failure_minus_success": 0.005245884864053696, "spread_auc_failure": 0.5605749486652978, "gap12_success_mean": 0.01896007537841797, "gap12_failure_mean": 0.0215270076581952, "gap12_delta_failure_minus_success": 0.00256693227977723, "gap12_auc_failure": 0.5172073921971253, "lambda_max_corr_token_acc": -0.19170281345239037, "mean8_corr_token_acc": -0.3935742986571875, "tail_mean_5_8_corr_token_acc": -0.44039154437692224, "positive_sum_corr_token_acc": NaN, "positive_count_corr_token_acc": NaN }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_7812_512.npz", "step": 7812, "n": 512, "k": 8, "acc": 0.15234375, "n_success": 78, "n_failure": 434, "raw_monotone_adjacent_fraction": 0.7536272321428571, "raw_col0_is_sample_max_fraction": 0.826171875, "raw_col0_success_mean": -0.19744304562799442, "raw_col0_failure_mean": -0.09758298332957993, "raw_col0_delta_failure_minus_success": 0.09986006229841449, "raw_col0_auc_failure": 0.9227519792035921, "lambda_max_success_mean": -0.19638217866229704, "lambda_max_failure_mean": -0.09583942211473957, "lambda_max_delta_failure_minus_success": 0.10054275654755747, "lambda_max_auc_failure": 0.9273307337823467, "mean8_success_mean": -0.2626351442558166, "mean8_failure_mean": -0.15164301766584318, "mean8_delta_failure_minus_success": 0.1109921265899734, "mean8_auc_failure": 0.9941214699279215, "tail_mean_5_8_success_mean": -0.2882901179866913, "tail_mean_5_8_failure_mean": -0.1752684197264127, "tail_mean_5_8_delta_failure_minus_success": 0.11302169826027861, "tail_mean_5_8_auc_failure": 0.9917582417582418, "positive_sum_success_mean": 0.0, "positive_sum_failure_mean": 0.0003272232148648253, "positive_sum_delta_failure_minus_success": 0.0003272232148648253, "positive_sum_auc_failure": 0.5138248847926268, "positive_count_success_mean": 0.0, "positive_count_failure_mean": 0.027649769585253458, "positive_count_delta_failure_minus_success": 0.027649769585253458, "positive_count_auc_failure": 0.5138248847926268, "spread_success_mean": 0.10413952138370429, "spread_failure_mean": 0.09246484074762525, "spread_delta_failure_minus_success": -0.011674680636079043, "spread_auc_failure": 0.4651719248493442, "gap12_success_mean": 0.03829955409925718, "gap12_failure_mean": 0.028275593728860324, "gap12_delta_failure_minus_success": -0.010023960370396854, "gap12_auc_failure": 0.4361928394186459, "lambda_max_corr_token_acc": -0.538963951651198, "mean8_corr_token_acc": -0.6951799773061379, "tail_mean_5_8_corr_token_acc": -0.6975383575513661, "positive_sum_corr_token_acc": -0.06688266082458418, "positive_count_corr_token_acc": -0.07401140024719328 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_10416_512.npz", "step": 10416, "n": 512, "k": 8, "acc": 0.1796875, "n_success": 92, "n_failure": 420, "raw_monotone_adjacent_fraction": 0.7572544642857143, "raw_col0_is_sample_max_fraction": 0.841796875, "raw_col0_success_mean": -0.1734262867020848, "raw_col0_failure_mean": -0.0592695716998562, "raw_col0_delta_failure_minus_success": 0.1141567150022286, "raw_col0_auc_failure": 0.9278985507246377, "lambda_max_success_mean": -0.1725775988408081, "lambda_max_failure_mean": -0.05791120557031328, "lambda_max_delta_failure_minus_success": 0.1146663932704948, "lambda_max_auc_failure": 0.9305900621118013, "mean8_success_mean": -0.22779642522547636, "mean8_failure_mean": -0.11871804687665038, "mean8_delta_failure_minus_success": 0.10907837834882597, "mean8_auc_failure": 0.9611024844720497, "tail_mean_5_8_success_mean": -0.25038242412974004, "tail_mean_5_8_failure_mean": -0.1428806266010118, "tail_mean_5_8_delta_failure_minus_success": 0.10750179752872824, "tail_mean_5_8_auc_failure": 0.9642857142857143, "positive_sum_success_mean": 0.0, "positive_sum_failure_mean": 0.005729037300833235, "positive_sum_delta_failure_minus_success": 0.005729037300833235, "positive_sum_auc_failure": 0.580952380952381, "positive_count_success_mean": 0.0, "positive_count_failure_mean": 0.20476190476190476, "positive_count_delta_failure_minus_success": 0.20476190476190476, "positive_count_auc_failure": 0.580952380952381, "spread_success_mean": 0.09030715536083216, "spread_failure_mean": 0.09668695457983717, "spread_delta_failure_minus_success": 0.006379799219005014, "spread_auc_failure": 0.555667701863354, "gap12_success_mean": 0.028749073491148327, "gap12_failure_mean": 0.03182062698594693, "gap12_delta_failure_minus_success": 0.003071553494798606, "gap12_auc_failure": 0.5320393374741201, "lambda_max_corr_token_acc": -0.4191290312385547, "mean8_corr_token_acc": -0.47253422425178293, "tail_mean_5_8_corr_token_acc": -0.4888621923983516, "positive_sum_corr_token_acc": -0.0452883997405569, "positive_count_corr_token_acc": -0.059274922302819555 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_13020_512.npz", "step": 13020, "n": 512, "k": 8, "acc": 0.30078125, "n_success": 154, "n_failure": 358, "raw_monotone_adjacent_fraction": 0.7329799107142857, "raw_col0_is_sample_max_fraction": 0.779296875, "raw_col0_success_mean": -0.16317386656331334, "raw_col0_failure_mean": -0.012345420176165918, "raw_col0_delta_failure_minus_success": 0.15082844638714743, "raw_col0_auc_failure": 0.9784879924544729, "lambda_max_success_mean": -0.16130138939328179, "lambda_max_failure_mean": -0.010260718098612039, "lambda_max_delta_failure_minus_success": 0.15104067129466975, "lambda_max_auc_failure": 0.9784698541681782, "mean8_success_mean": -0.20199005053763794, "mean8_failure_mean": -0.06562561695864545, "mean8_delta_failure_minus_success": 0.13636443357899247, "mean8_auc_failure": 0.9897337299571937, "tail_mean_5_8_success_mean": -0.21928635291610057, "tail_mean_5_8_failure_mean": -0.08840406447549293, "tail_mean_5_8_delta_failure_minus_success": 0.13088228844060765, "tail_mean_5_8_auc_failure": 0.9910578248567076, "positive_sum_success_mean": 0.0, "positive_sum_failure_mean": 0.03084314924092586, "positive_sum_delta_failure_minus_success": 0.03084314924092586, "positive_sum_auc_failure": 0.7067039106145251, "positive_count_success_mean": 0.0, "positive_count_failure_mean": 0.9134078212290503, "positive_count_delta_failure_minus_success": 0.9134078212290503, "positive_count_auc_failure": 0.7067039106145251, "spread_success_mean": 0.06808332314294485, "spread_failure_mean": 0.08949387786655578, "spread_delta_failure_minus_success": 0.021410554723610933, "spread_auc_failure": 0.6710440397591235, "gap12_success_mean": 0.020334010348842756, "gap12_failure_mean": 0.02894391759471739, "gap12_delta_failure_minus_success": 0.008609907245874633, "gap12_auc_failure": 0.5919248349415948, "lambda_max_corr_token_acc": -0.6439558752057271, "mean8_corr_token_acc": -0.6897349633120721, "tail_mean_5_8_corr_token_acc": -0.6989600499840098, "positive_sum_corr_token_acc": -0.14942778693878858, "positive_count_corr_token_acc": -0.20197511884575872 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_15624_512.npz", "step": 15624, "n": 512, "k": 8, "acc": 0.333984375, "n_success": 171, "n_failure": 341, "raw_monotone_adjacent_fraction": 0.7368861607142857, "raw_col0_is_sample_max_fraction": 0.78515625, "raw_col0_success_mean": -0.17926316590140962, "raw_col0_failure_mean": 0.013238977896144526, "raw_col0_delta_failure_minus_success": 0.19250214379755415, "raw_col0_auc_failure": 0.9800209222959647, "lambda_max_success_mean": -0.1768394878504482, "lambda_max_failure_mean": 0.015099927341163328, "lambda_max_delta_failure_minus_success": 0.19193941519161153, "lambda_max_auc_failure": 0.9809298417108264, "mean8_success_mean": -0.2279172917539989, "mean8_failure_mean": -0.04059797345319265, "mean8_delta_failure_minus_success": 0.18731931830080625, "mean8_auc_failure": 0.9898475416302241, "tail_mean_5_8_success_mean": -0.24842964750796295, "tail_mean_5_8_failure_mean": -0.06421948799691395, "tail_mean_5_8_delta_failure_minus_success": 0.18421015951104902, "tail_mean_5_8_auc_failure": 0.9895731508634734, "positive_sum_success_mean": 0.0007992622970837598, "positive_sum_failure_mean": 0.054435018036423026, "positive_sum_delta_failure_minus_success": 0.053635755739339264, "positive_sum_auc_failure": 0.7755826516437722, "positive_count_success_mean": 0.023391812865497075, "positive_count_failure_mean": 1.5073313782991202, "positive_count_delta_failure_minus_success": 1.4839395654336232, "positive_count_auc_failure": 0.7767059388451578, "spread_success_mean": 0.08114021638005275, "spread_failure_mean": 0.0924473905291844, "spread_delta_failure_minus_success": 0.01130717414913164, "spread_auc_failure": 0.5924782631064465, "gap12_success_mean": 0.026074302491693818, "gap12_failure_mean": 0.027355855661720895, "gap12_delta_failure_minus_success": 0.001281553170027077, "gap12_auc_failure": 0.5126305499819931, "lambda_max_corr_token_acc": -0.7050637096452648, "mean8_corr_token_acc": -0.7549696612898014, "tail_mean_5_8_corr_token_acc": -0.758163657007401, "positive_sum_corr_token_acc": -0.24676472967716626, "positive_count_corr_token_acc": -0.2994831724010142 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_18228_512.npz", "step": 18228, "n": 512, "k": 8, "acc": 0.474609375, "n_success": 243, "n_failure": 269, "raw_monotone_adjacent_fraction": 0.7474888392857143, "raw_col0_is_sample_max_fraction": 0.818359375, "raw_col0_success_mean": -0.07771555354618448, "raw_col0_failure_mean": 0.016638744400673956, "raw_col0_delta_failure_minus_success": 0.09435429794685843, "raw_col0_auc_failure": 0.8841617329845335, "lambda_max_success_mean": -0.07614383449056474, "lambda_max_failure_mean": 0.018023035920062363, "lambda_max_delta_failure_minus_success": 0.0941668704106271, "lambda_max_auc_failure": 0.8879250998210106, "mean8_success_mean": -0.14575748546156192, "mean8_failure_mean": -0.03353973620840138, "mean8_delta_failure_minus_success": 0.11221774925316054, "mean8_auc_failure": 0.9716523628130402, "tail_mean_5_8_success_mean": -0.1721063231313486, "tail_mean_5_8_failure_mean": -0.05434106856267585, "tail_mean_5_8_delta_failure_minus_success": 0.11776525456867276, "tail_mean_5_8_auc_failure": 0.9799899031621461, "positive_sum_success_mean": 0.00525456639471437, "positive_sum_failure_mean": 0.050987723279165634, "positive_sum_delta_failure_minus_success": 0.045733156884451266, "positive_sum_auc_failure": 0.749567824743372, "positive_count_success_mean": 0.13580246913580246, "positive_count_failure_mean": 1.4981412639405205, "positive_count_delta_failure_minus_success": 1.362338794804718, "positive_count_auc_failure": 0.7569645233833586, "spread_success_mean": 0.10785400053824455, "spread_failure_mean": 0.0830327885870487, "spread_delta_failure_minus_success": -0.024821211951195854, "spread_auc_failure": 0.35938623464439245, "gap12_success_mean": 0.03951045055418571, "gap12_failure_mean": 0.027345439433070017, "gap12_delta_failure_minus_success": -0.01216501112111569, "gap12_auc_failure": 0.4258111891321309, "lambda_max_corr_token_acc": -0.6086543875688578, "mean8_corr_token_acc": -0.7665403099919734, "tail_mean_5_8_corr_token_acc": -0.786002777751629, "positive_sum_corr_token_acc": -0.3211841461864595, "positive_count_corr_token_acc": -0.4146342203458471 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_20832_512.npz", "step": 20832, "n": 512, "k": 8, "acc": 0.45703125, "n_success": 234, "n_failure": 278, "raw_monotone_adjacent_fraction": 0.7497209821428571, "raw_col0_is_sample_max_fraction": 0.80078125, "raw_col0_success_mean": -0.0847591630423951, "raw_col0_failure_mean": 0.011107698112080065, "raw_col0_delta_failure_minus_success": 0.09586686115447517, "raw_col0_auc_failure": 0.9046762589928058, "lambda_max_success_mean": -0.08308592252134799, "lambda_max_failure_mean": 0.012636878445990327, "lambda_max_delta_failure_minus_success": 0.09572280096733832, "lambda_max_auc_failure": 0.9064287031912931, "mean8_success_mean": -0.14093293187680603, "mean8_failure_mean": -0.044163071447564366, "mean8_delta_failure_minus_success": 0.09676986042924166, "mean8_auc_failure": 0.9825831642378405, "tail_mean_5_8_success_mean": -0.16335714651812983, "tail_mean_5_8_failure_mean": -0.06769531056555371, "tail_mean_5_8_delta_failure_minus_success": 0.09566183595257612, "tail_mean_5_8_auc_failure": 0.9836438541474513, "positive_sum_success_mean": 0.002644224575935648, "positive_sum_failure_mean": 0.04743396344555085, "positive_sum_delta_failure_minus_success": 0.0447897388696152, "positive_sum_auc_failure": 0.7440432269568961, "positive_count_success_mean": 0.0811965811965812, "positive_count_failure_mean": 1.2589928057553956, "positive_count_delta_failure_minus_success": 1.1777962245588145, "positive_count_auc_failure": 0.747501998401279, "spread_success_mean": 0.0904661258792059, "spread_failure_mean": 0.09196829576401849, "spread_delta_failure_minus_success": 0.0015021698848125958, "spread_auc_failure": 0.5366168603578676, "gap12_success_mean": 0.03158770251569426, "gap12_failure_mean": 0.029151334776909496, "gap12_delta_failure_minus_success": -0.0024363677387847643, "gap12_auc_failure": 0.49451208264157903, "lambda_max_corr_token_acc": -0.5961668167415688, "mean8_corr_token_acc": -0.7490505096842796, "tail_mean_5_8_corr_token_acc": -0.7670744302809093, "positive_sum_corr_token_acc": -0.25827536960316083, "positive_count_corr_token_acc": -0.3352751212871484 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_23436_512.npz", "step": 23436, "n": 512, "k": 8, "acc": 0.505859375, "n_success": 259, "n_failure": 253, "raw_monotone_adjacent_fraction": 0.7444196428571429, "raw_col0_is_sample_max_fraction": 0.8125, "raw_col0_success_mean": -0.1396155291305258, "raw_col0_failure_mean": -0.002783365376173506, "raw_col0_delta_failure_minus_success": 0.13683216375435228, "raw_col0_auc_failure": 0.9710501014848841, "lambda_max_success_mean": -0.1384992553999928, "lambda_max_failure_mean": -0.0009893162803058476, "lambda_max_delta_failure_minus_success": 0.13750993911968695, "lambda_max_auc_failure": 0.9724541028888855, "mean8_success_mean": -0.2008695753030298, "mean8_failure_mean": -0.05231515268090541, "mean8_delta_failure_minus_success": 0.14855442262212437, "mean8_auc_failure": 0.9922932531628184, "tail_mean_5_8_success_mean": -0.22361706499433173, "tail_mean_5_8_failure_mean": -0.07373835104911058, "tail_mean_5_8_delta_failure_minus_success": 0.14987871394522115, "tail_mean_5_8_auc_failure": 0.9920032963511224, "positive_sum_success_mean": 0.0005239612893938558, "positive_sum_failure_mean": 0.02692777939243448, "positive_sum_delta_failure_minus_success": 0.026403818103040624, "positive_sum_auc_failure": 0.7254032688815297, "positive_count_success_mean": 0.019305019305019305, "positive_count_failure_mean": 0.8932806324110671, "positive_count_delta_failure_minus_success": 0.8739756131060479, "positive_count_auc_failure": 0.7251133120698338, "spread_success_mean": 0.09569484527981534, "spread_failure_mean": 0.08370462378894082, "spread_delta_failure_minus_success": -0.011990221490874517, "spread_auc_failure": 0.4474338822164909, "gap12_success_mean": 0.0377779275387468, "gap12_failure_mean": 0.025563243349588635, "gap12_delta_failure_minus_success": -0.012214684189158165, "gap12_auc_failure": 0.42294016207059687, "lambda_max_corr_token_acc": -0.735475615818423, "mean8_corr_token_acc": -0.8331365987869843, "tail_mean_5_8_corr_token_acc": -0.8388368318549599, "positive_sum_corr_token_acc": -0.2844985120608491, "positive_count_corr_token_acc": -0.35566859763323194 }, { "kind": "HRM", "file": "/home/yurenh2/rrm/research/flossing/diag_hrm_step_26040_512.npz", "step": 26040, "n": 512, "k": 8, "acc": 0.5, "n_success": 256, "n_failure": 256, "raw_monotone_adjacent_fraction": 0.748046875, "raw_col0_is_sample_max_fraction": 0.787109375, "raw_col0_success_mean": -0.14642834789538028, "raw_col0_failure_mean": 0.03254939207545249, "raw_col0_delta_failure_minus_success": 0.17897773997083277, "raw_col0_auc_failure": 0.9890289306640625, "lambda_max_success_mean": -0.14456235250236205, "lambda_max_failure_mean": 0.034372387226426326, "lambda_max_delta_failure_minus_success": 0.17893473972878837, "lambda_max_auc_failure": 0.9896240234375, "mean8_success_mean": -0.18957092847483636, "mean8_failure_mean": -0.025070655147768406, "mean8_delta_failure_minus_success": 0.16450027332706796, "mean8_auc_failure": 0.997589111328125, "tail_mean_5_8_success_mean": -0.21117772247089306, "tail_mean_5_8_failure_mean": -0.04990981457450516, "tail_mean_5_8_delta_failure_minus_success": 0.1612679078963879, "tail_mean_5_8_auc_failure": 0.9984893798828125, "positive_sum_success_mean": 0.00045363006938714534, "positive_sum_failure_mean": 0.074183922140719, "positive_sum_delta_failure_minus_success": 0.07373029207133186, "positive_sum_auc_failure": 0.865997314453125, "positive_count_success_mean": 0.01953125, "positive_count_failure_mean": 1.94140625, "positive_count_delta_failure_minus_success": 1.921875, "positive_count_auc_failure": 0.8662796020507812, "spread_success_mean": 0.07883575186588132, "spread_failure_mean": 0.09741729416225553, "spread_delta_failure_minus_success": 0.01858154229637421, "spread_auc_failure": 0.62774658203125, "gap12_success_mean": 0.01608559737815085, "gap12_failure_mean": 0.02983991140987996, "gap12_delta_failure_minus_success": 0.013754314031729109, "gap12_auc_failure": 0.631378173828125, "lambda_max_corr_token_acc": -0.8022434699246608, "mean8_corr_token_acc": -0.8493972056528871, "tail_mean_5_8_corr_token_acc": -0.8567110926525265, "positive_sum_corr_token_acc": -0.37718525164073874, "positive_count_corr_token_acc": -0.4879767533856408 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step26041_512.npz", "step": 26041, "n": 512, "k": 8, "acc": 0.576171875, "n_success": 295, "n_failure": 217, "raw_monotone_adjacent_fraction": 0.7438616071428571, "raw_col0_is_sample_max_fraction": 0.791015625, "raw_col0_success_mean": -0.03554810737259686, "raw_col0_failure_mean": 0.034024751214321254, "raw_col0_delta_failure_minus_success": 0.06957285858691811, "raw_col0_auc_failure": 0.9836132156525814, "lambda_max_success_mean": -0.03484145596473462, "lambda_max_failure_mean": 0.034498983389517714, "lambda_max_delta_failure_minus_success": 0.06934043935425233, "lambda_max_auc_failure": 0.9838475357338123, "mean8_success_mean": -0.044594623737682894, "mean8_failure_mean": 0.011831644308724274, "mean8_delta_failure_minus_success": 0.05642626804640717, "mean8_auc_failure": 0.9900960712333047, "tail_mean_5_8_success_mean": -0.04878217898325509, "tail_mean_5_8_failure_mean": 0.002079399892583064, "tail_mean_5_8_delta_failure_minus_success": 0.050861578875838157, "tail_mean_5_8_auc_failure": 0.9916582051081778, "positive_sum_success_mean": 0.002200093488031367, "positive_sum_failure_mean": 0.11402314361281402, "positive_sum_delta_failure_minus_success": 0.11182305012478265, "positive_sum_auc_failure": 0.9693353120362415, "positive_count_success_mean": 0.1694915254237288, "positive_count_failure_mean": 5.539170506912442, "positive_count_delta_failure_minus_success": 5.369678981488713, "positive_count_auc_failure": 0.9706006404748887, "spread_success_mean": 0.01622369096148759, "spread_failure_mean": 0.03719716329264318, "spread_delta_failure_minus_success": 0.02097347233115559, "spread_auc_failure": 0.8946653128173084, "gap12_success_mean": 0.004580460641320037, "gap12_failure_mean": 0.010477446547843793, "gap12_delta_failure_minus_success": 0.005896985906523756, "gap12_auc_failure": 0.7136608607357651, "lambda_max_corr_token_acc": -0.8221535026414836, "mean8_corr_token_acc": -0.8532299277645251, "tail_mean_5_8_corr_token_acc": -0.8618572814599218, "positive_sum_corr_token_acc": -0.6425677667973041, "positive_count_corr_token_acc": -0.7985750358664757 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step52082_512.npz", "step": 52082, "n": 512, "k": 8, "acc": 0.6484375, "n_success": 332, "n_failure": 180, "raw_monotone_adjacent_fraction": 0.7435825892857143, "raw_col0_is_sample_max_fraction": 0.78125, "raw_col0_success_mean": -0.0007075830156292839, "raw_col0_failure_mean": 0.05957649097674423, "raw_col0_delta_failure_minus_success": 0.06028407399237352, "raw_col0_auc_failure": 0.981425702811245, "lambda_max_success_mean": -0.00011202936298378704, "lambda_max_failure_mean": 0.059987526981987886, "lambda_max_delta_failure_minus_success": 0.060099556344971675, "lambda_max_auc_failure": 0.9824129852744311, "mean8_success_mean": -0.00932737177132882, "mean8_failure_mean": 0.03324007580221304, "mean8_delta_failure_minus_success": 0.04256744757354186, "mean8_auc_failure": 0.993591030789826, "tail_mean_5_8_success_mean": -0.01301439359083114, "tail_mean_5_8_failure_mean": 0.02160164021022663, "tail_mean_5_8_delta_failure_minus_success": 0.034616033801057766, "tail_mean_5_8_auc_failure": 0.9948627844712182, "positive_sum_success_mean": 0.011575991871503936, "positive_sum_failure_mean": 0.26611879217404444, "positive_sum_delta_failure_minus_success": 0.2545428003025405, "positive_sum_auc_failure": 0.9932730923694779, "positive_count_success_mean": 0.9879518072289156, "positive_count_failure_mean": 7.916666666666667, "positive_count_delta_failure_minus_success": 6.9287148594377514, "positive_count_auc_failure": 0.9850485274431058, "spread_success_mean": 0.014868822472832896, "spread_failure_mean": 0.04419851013889355, "spread_delta_failure_minus_success": 0.029329687666060658, "spread_auc_failure": 0.9461680053547523, "gap12_success_mean": 0.0048919110970939116, "gap12_failure_mean": 0.01239234626862324, "gap12_delta_failure_minus_success": 0.007500435171529329, "gap12_auc_failure": 0.7928547523427042, "lambda_max_corr_token_acc": -0.822862756235417, "mean8_corr_token_acc": -0.8543835399263785, "tail_mean_5_8_corr_token_acc": -0.8502975145711263, "positive_sum_corr_token_acc": -0.83615142839377, "positive_count_corr_token_acc": -0.8756439805499547 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step78123_512.npz", "step": 78123, "n": 512, "k": 8, "acc": 0.6875, "n_success": 352, "n_failure": 160, "raw_monotone_adjacent_fraction": 0.7522321428571429, "raw_col0_is_sample_max_fraction": 0.94921875, "raw_col0_success_mean": -0.0022565727597132286, "raw_col0_failure_mean": 0.08176288979593664, "raw_col0_delta_failure_minus_success": 0.08401946255564988, "raw_col0_auc_failure": 0.9937144886363637, "lambda_max_success_mean": -0.002104297534184628, "lambda_max_failure_mean": 0.08205510303378105, "lambda_max_delta_failure_minus_success": 0.08415940056796568, "lambda_max_auc_failure": 0.99375, "mean8_success_mean": -0.03085258081190225, "mean8_failure_mean": 0.04898583621179568, "mean8_delta_failure_minus_success": 0.07983841702369793, "mean8_auc_failure": 0.9991299715909091, "tail_mean_5_8_success_mean": -0.03740866408387236, "tail_mean_5_8_failure_mean": 0.03495748526038369, "tail_mean_5_8_delta_failure_minus_success": 0.07236614934425606, "tail_mean_5_8_auc_failure": 0.9993607954545455, "positive_sum_success_mean": 0.011536654059641065, "positive_sum_failure_mean": 0.39188668969436546, "positive_sum_delta_failure_minus_success": 0.3803500356347244, "positive_sum_auc_failure": 0.9991299715909091, "positive_count_success_mean": 0.6619318181818182, "positive_count_failure_mean": 8.0, "positive_count_delta_failure_minus_success": 7.338068181818182, "positive_count_auc_failure": 0.9900568181818182, "spread_success_mean": 0.037154979946411586, "spread_failure_mean": 0.05358391968184151, "spread_delta_failure_minus_success": 0.016428939735429922, "spread_auc_failure": 0.8060191761363636, "gap12_success_mean": 0.026502143868418152, "gap12_failure_mean": 0.015829400252550842, "gap12_delta_failure_minus_success": -0.01067274361586731, "gap12_auc_failure": 0.2559303977272727, "lambda_max_corr_token_acc": -0.8695160653352085, "mean8_corr_token_acc": -0.9167732333138554, "tail_mean_5_8_corr_token_acc": -0.9212713093750954, "positive_sum_corr_token_acc": -0.9333667142066921, "positive_count_corr_token_acc": -0.916788261375625 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step104164_512.npz", "step": 104164, "n": 512, "k": 8, "acc": 0.7109375, "n_success": 364, "n_failure": 148, "raw_monotone_adjacent_fraction": 0.7659040178571429, "raw_col0_is_sample_max_fraction": 0.873046875, "raw_col0_success_mean": 0.0007209903823832564, "raw_col0_failure_mean": 0.09478743761979244, "raw_col0_delta_failure_minus_success": 0.09406644723740919, "raw_col0_auc_failure": 0.9850942975942976, "lambda_max_success_mean": 0.0011589818968986748, "lambda_max_failure_mean": 0.0948834198753576, "lambda_max_delta_failure_minus_success": 0.09372443797845893, "lambda_max_auc_failure": 0.9852242352242352, "mean8_success_mean": -0.013257209616115657, "mean8_failure_mean": 0.05609436892523632, "mean8_delta_failure_minus_success": 0.06935157854135197, "mean8_auc_failure": 0.9912013662013662, "tail_mean_5_8_success_mean": -0.018433471481510245, "tail_mean_5_8_failure_mean": 0.04002377992288235, "tail_mean_5_8_delta_failure_minus_success": 0.05845725140439259, "tail_mean_5_8_auc_failure": 0.9914241164241164, "positive_sum_success_mean": 0.024264690904521637, "positive_sum_failure_mean": 0.44875495140189053, "positive_sum_delta_failure_minus_success": 0.4244902604973689, "positive_sum_auc_failure": 0.9912013662013662, "positive_count_success_mean": 1.0631868131868132, "positive_count_failure_mean": 8.0, "positive_count_delta_failure_minus_success": 6.936813186813187, "positive_count_auc_failure": 0.9766483516483516, "spread_success_mean": 0.02201517710918679, "spread_failure_mean": 0.06216927877048383, "spread_delta_failure_minus_success": 0.04015410166129704, "spread_auc_failure": 0.9606660231660231, "gap12_success_mean": 0.008753806905887968, "gap12_failure_mean": 0.020252479617861478, "gap12_delta_failure_minus_success": 0.01149867271197351, "gap12_auc_failure": 0.8213171963171964, "lambda_max_corr_token_acc": -0.8714207860201784, "mean8_corr_token_acc": -0.8978503025935065, "tail_mean_5_8_corr_token_acc": -0.9018662195261188, "positive_sum_corr_token_acc": -0.9164290683745636, "positive_count_corr_token_acc": -0.8552081309221397 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step130205_512.npz", "step": 130205, "n": 512, "k": 8, "acc": 0.755859375, "n_success": 387, "n_failure": 125, "raw_monotone_adjacent_fraction": 0.7661830357142857, "raw_col0_is_sample_max_fraction": 0.83203125, "raw_col0_success_mean": 0.013444696512909148, "raw_col0_failure_mean": 0.10679112273454666, "raw_col0_delta_failure_minus_success": 0.09334642622163751, "raw_col0_auc_failure": 0.988857881136951, "lambda_max_success_mean": 0.014031398803655426, "lambda_max_failure_mean": 0.10683381146192551, "lambda_max_delta_failure_minus_success": 0.09280241265827009, "lambda_max_auc_failure": 0.9889198966408269, "mean8_success_mean": 0.0006696076976612102, "mean8_failure_mean": 0.05997022696118802, "mean8_delta_failure_minus_success": 0.05930061926352681, "mean8_auc_failure": 0.9935503875968992, "tail_mean_5_8_success_mean": -0.004693869635777962, "tail_mean_5_8_failure_mean": 0.04096000522933901, "tail_mean_5_8_delta_failure_minus_success": 0.04565387486511697, "tail_mean_5_8_auc_failure": 0.9943152454780362, "positive_sum_success_mean": 0.04017261892318711, "positive_sum_failure_mean": 0.47976181568950416, "positive_sum_delta_failure_minus_success": 0.439589196766317, "positive_sum_auc_failure": 0.9935503875968992, "positive_count_success_mean": 2.4108527131782944, "positive_count_failure_mean": 8.0, "positive_count_delta_failure_minus_success": 5.589147286821706, "positive_count_auc_failure": 0.9521963824289406, "spread_success_mean": 0.021535266655820777, "spread_failure_mean": 0.07427249775826932, "spread_delta_failure_minus_success": 0.05273723110244854, "spread_auc_failure": 0.9726511627906976, "gap12_success_mean": 0.007057571094765433, "gap12_failure_mean": 0.02309795269370079, "gap12_delta_failure_minus_success": 0.016040381598935356, "gap12_auc_failure": 0.8932093023255814, "lambda_max_corr_token_acc": -0.868494410453853, "mean8_corr_token_acc": -0.8995673015890842, "tail_mean_5_8_corr_token_acc": -0.9101679267135595, "positive_sum_corr_token_acc": -0.9091943845695494, "positive_count_corr_token_acc": -0.7232326485538275 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step156246_512.npz", "step": 156246, "n": 512, "k": 8, "acc": 0.7578125, "n_success": 388, "n_failure": 124, "raw_monotone_adjacent_fraction": 0.8465401785714286, "raw_col0_is_sample_max_fraction": 0.984375, "raw_col0_success_mean": 0.033355438992845106, "raw_col0_failure_mean": 0.1080104663006721, "raw_col0_delta_failure_minus_success": 0.074655027307827, "raw_col0_auc_failure": 0.9934527768540073, "lambda_max_success_mean": 0.03340256920781724, "lambda_max_failure_mean": 0.10808430672172577, "lambda_max_delta_failure_minus_success": 0.07468173751390852, "lambda_max_auc_failure": 0.9935359161955437, "mean8_success_mean": -0.01719525050584332, "mean8_failure_mean": 0.059510113856841566, "mean8_delta_failure_minus_success": 0.07670536436268488, "mean8_auc_failure": 0.9990646824077153, "tail_mean_5_8_success_mean": -0.03258687625807819, "tail_mean_5_8_failure_mean": 0.03992759636434306, "tail_mean_5_8_delta_failure_minus_success": 0.07251447262242125, "tail_mean_5_8_auc_failure": 0.9987944795477219, "positive_sum_success_mean": 0.0501136147264783, "positive_sum_failure_mean": 0.47608091085473253, "positive_sum_delta_failure_minus_success": 0.42596729612825424, "positive_sum_auc_failure": 0.9990646824077153, "positive_count_success_mean": 1.7268041237113403, "positive_count_failure_mean": 8.0, "positive_count_delta_failure_minus_success": 6.27319587628866, "positive_count_auc_failure": 0.9806701030927835, "spread_success_mean": 0.07044461364565689, "spread_failure_mean": 0.07696695457167563, "spread_delta_failure_minus_success": 0.006522340926018735, "spread_auc_failure": 0.5740563684735617, "gap12_success_mean": 0.03767048126359648, "gap12_failure_mean": 0.024864713691415324, "gap12_delta_failure_minus_success": -0.012805767572181152, "gap12_auc_failure": 0.30377036913867644, "lambda_max_corr_token_acc": -0.8015386582153512, "mean8_corr_token_acc": -0.8887892447157182, "tail_mean_5_8_corr_token_acc": -0.8908349686685592, "positive_sum_corr_token_acc": -0.9311284896491179, "positive_count_corr_token_acc": -0.8411922875634292 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step182287_512.npz", "step": 182287, "n": 512, "k": 8, "acc": 0.7421875, "n_success": 380, "n_failure": 132, "raw_monotone_adjacent_fraction": 0.7837611607142857, "raw_col0_is_sample_max_fraction": 0.9296875, "raw_col0_success_mean": 0.00821161610291902, "raw_col0_failure_mean": 0.10131858266664273, "raw_col0_delta_failure_minus_success": 0.0931069665637237, "raw_col0_auc_failure": 0.9476076555023923, "lambda_max_success_mean": 0.008431412303494461, "lambda_max_failure_mean": 0.10132696061874881, "lambda_max_delta_failure_minus_success": 0.09289554831525434, "lambda_max_auc_failure": 0.9476275917065391, "mean8_success_mean": -0.018691345271909503, "mean8_failure_mean": 0.049928791549256966, "mean8_delta_failure_minus_success": 0.06862013682116647, "mean8_auc_failure": 0.993122009569378, "tail_mean_5_8_success_mean": -0.026891200006043378, "tail_mean_5_8_failure_mean": 0.029759585490286223, "tail_mean_5_8_delta_failure_minus_success": 0.0566507854963296, "tail_mean_5_8_auc_failure": 0.993421052631579, "positive_sum_success_mean": 0.027419095844602212, "positive_sum_failure_mean": 0.3994350857048465, "positive_sum_delta_failure_minus_success": 0.37201598986024426, "positive_sum_auc_failure": 0.993122009569378, "positive_count_success_mean": 0.8973684210526316, "positive_count_failure_mean": 7.992424242424242, "positive_count_delta_failure_minus_success": 7.095055821371611, "positive_count_auc_failure": 0.9867125199362041, "spread_success_mean": 0.03768573965873513, "spread_failure_mean": 0.07997999362255954, "spread_delta_failure_minus_success": 0.04229425396382441, "spread_auc_failure": 0.8919258373205742, "gap12_success_mean": 0.019669575117656057, "gap12_failure_mean": 0.027337144778081864, "gap12_delta_failure_minus_success": 0.007667569660425807, "gap12_auc_failure": 0.7587121212121212, "lambda_max_corr_token_acc": -0.7443356021278564, "mean8_corr_token_acc": -0.8907098916862968, "tail_mean_5_8_corr_token_acc": -0.9075392398447194, "positive_sum_corr_token_acc": -0.9063416990478439, "positive_count_corr_token_acc": -0.8813795117907826 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step208328_512.npz", "step": 208328, "n": 512, "k": 8, "acc": 0.75, "n_success": 384, "n_failure": 128, "raw_monotone_adjacent_fraction": 0.8138950892857143, "raw_col0_is_sample_max_fraction": 0.970703125, "raw_col0_success_mean": 0.03753090985757505, "raw_col0_failure_mean": 0.10676543856970966, "raw_col0_delta_failure_minus_success": 0.06923452871213462, "raw_col0_auc_failure": 0.9824422200520834, "lambda_max_success_mean": 0.037651085844269495, "lambda_max_failure_mean": 0.10677664534887299, "lambda_max_delta_failure_minus_success": 0.0691255595046035, "lambda_max_auc_failure": 0.9825439453125, "mean8_success_mean": -0.003334924222756793, "mean8_failure_mean": 0.0556086836418217, "mean8_delta_failure_minus_success": 0.058943607864578494, "mean8_auc_failure": 0.997314453125, "tail_mean_5_8_success_mean": -0.015420041531266785, "tail_mean_5_8_failure_mean": 0.03501016539212287, "tail_mean_5_8_delta_failure_minus_success": 0.050430206923389655, "tail_mean_5_8_auc_failure": 0.9977620442708334, "positive_sum_success_mean": 0.0587247395355727, "positive_sum_failure_mean": 0.4448694691345736, "positive_sum_delta_failure_minus_success": 0.38614472959900087, "positive_sum_auc_failure": 0.997314453125, "positive_count_success_mean": 2.3385416666666665, "positive_count_failure_mean": 8.0, "positive_count_delta_failure_minus_success": 5.661458333333334, "positive_count_auc_failure": 0.97265625, "spread_success_mean": 0.05609253943835787, "spread_failure_mean": 0.08039409609409631, "spread_delta_failure_minus_success": 0.024301556655738445, "spread_auc_failure": 0.8315836588541666, "gap12_success_mean": 0.029487703329541166, "gap12_failure_mean": 0.026957579655572772, "gap12_delta_failure_minus_success": -0.0025301236739683937, "gap12_auc_failure": 0.4844767252604167, "lambda_max_corr_token_acc": -0.7847679828439686, "mean8_corr_token_acc": -0.9025430240366568, "tail_mean_5_8_corr_token_acc": -0.9116696180830379, "positive_sum_corr_token_acc": -0.918178990916473, "positive_count_corr_token_acc": -0.8241555473543875 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step234369_512.npz", "step": 234369, "n": 512, "k": 8, "acc": 0.7734375, "n_success": 396, "n_failure": 116, "raw_monotone_adjacent_fraction": 0.7837611607142857, "raw_col0_is_sample_max_fraction": 0.943359375, "raw_col0_success_mean": 0.017852102687761037, "raw_col0_failure_mean": 0.10551282866247769, "raw_col0_delta_failure_minus_success": 0.08766072597471665, "raw_col0_auc_failure": 0.9871125043538836, "lambda_max_success_mean": 0.018100257435352516, "lambda_max_failure_mean": 0.10551282866247769, "lambda_max_delta_failure_minus_success": 0.08741257122712517, "lambda_max_auc_failure": 0.9871125043538836, "mean8_success_mean": -0.011827684605397524, "mean8_failure_mean": 0.05244761205414824, "mean8_delta_failure_minus_success": 0.06427529665954576, "mean8_auc_failure": 0.9948624172762104, "tail_mean_5_8_success_mean": -0.020207434739923615, "tail_mean_5_8_failure_mean": 0.03131334408957126, "tail_mean_5_8_delta_failure_minus_success": 0.05152077882949488, "tail_mean_5_8_auc_failure": 0.995276036224312, "positive_sum_success_mean": 0.03539385316928405, "positive_sum_failure_mean": 0.4195808964331859, "positive_sum_delta_failure_minus_success": 0.38418704326390185, "positive_sum_auc_failure": 0.9948624172762104, "positive_count_success_mean": 1.4595959595959596, "positive_count_failure_mean": 8.0, "positive_count_delta_failure_minus_success": 6.540404040404041, "positive_count_auc_failure": 0.976010101010101, "spread_success_mean": 0.04073868896878162, "spread_failure_mean": 0.08256442746115399, "spread_delta_failure_minus_success": 0.041825738492372366, "spread_auc_failure": 0.9344740508533612, "gap12_success_mean": 0.022704324246853612, "gap12_failure_mean": 0.026842519994182832, "gap12_delta_failure_minus_success": 0.00413819574732922, "gap12_auc_failure": 0.6134839777081157, "lambda_max_corr_token_acc": -0.8095679410757006, "mean8_corr_token_acc": -0.8841140003061905, "tail_mean_5_8_corr_token_acc": -0.8897475061573502, "positive_sum_corr_token_acc": -0.9111627943782784, "positive_count_corr_token_acc": -0.8264057387186906 }, { "kind": "TRM", "file": "/home/yurenh2/rrm/research/flossing/diag_trm_singleGPU_step260410_512.npz", "step": 260410, "n": 512, "k": 8, "acc": 0.76953125, "n_success": 394, "n_failure": 118, "raw_monotone_adjacent_fraction": 0.7299107142857143, "raw_col0_is_sample_max_fraction": 0.767578125, "raw_col0_success_mean": 0.012635910749513134, "raw_col0_failure_mean": 0.10288177159125522, "raw_col0_delta_failure_minus_success": 0.09024586084174209, "raw_col0_auc_failure": 0.9893530069689409, "lambda_max_success_mean": 0.013151870022697326, "lambda_max_failure_mean": 0.10296485964524543, "lambda_max_delta_failure_minus_success": 0.0898129896225481, "lambda_max_auc_failure": 0.9894175341994321, "mean8_success_mean": 0.0016685908707722002, "mean8_failure_mean": 0.05065993657996641, "mean8_delta_failure_minus_success": 0.04899134570919421, "mean8_auc_failure": 0.9931170954142648, "tail_mean_5_8_success_mean": -0.002553942017630222, "tail_mean_5_8_failure_mean": 0.030341437701096412, "tail_mean_5_8_delta_failure_minus_success": 0.032895379718726636, "tail_mean_5_8_auc_failure": 0.9938484040264992, "positive_sum_success_mean": 0.030020373667639433, "positive_sum_failure_mean": 0.40529236062114143, "positive_sum_delta_failure_minus_success": 0.375271986953502, "positive_sum_auc_failure": 0.9931170954142648, "positive_count_success_mean": 3.182741116751269, "positive_count_failure_mean": 7.991525423728813, "positive_count_delta_failure_minus_success": 4.808784306977544, "positive_count_auc_failure": 0.9728770541168373, "spread_success_mean": 0.01786690075157378, "spread_failure_mean": 0.08103766449001003, "spread_delta_failure_minus_success": 0.06317076373843625, "spread_auc_failure": 0.9816742665404801, "gap12_success_mean": 0.006782306742966769, "gap12_failure_mean": 0.028344342895483567, "gap12_delta_failure_minus_success": 0.021562036152516798, "gap12_auc_failure": 0.9238363589434742, "lambda_max_corr_token_acc": -0.8797240829367476, "mean8_corr_token_acc": -0.9146835924338959, "tail_mean_5_8_corr_token_acc": -0.9119410149316228, "positive_sum_corr_token_acc": -0.9168566073790785, "positive_count_corr_token_acc": -0.6899483781164985 } ]