***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** logs/seq2048_sxm28_seed1338_20260319a.txt val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/root/parameter-golf-sxm28/data/tokenizers/fineweb_1024_bpe.model train_loader:dataset:fineweb10B_sp1024 train_shards:80 val_loader:shards pattern=/root/parameter-golf-sxm28/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 model_params:17059912 world_size:8 grad_accum_steps:1 sdp_backends:cudnn=False flash=True mem_efficient=False math=False attention_mode:gqa num_heads:8 num_kv_heads:4 tie_embeddings:True embed_lr:0.04 head_lr:0.0 matrix_lr:0.032 scalar_lr:0.032 train_batch_tokens:524288 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 seed:1338 warmup_step:1/20 warmup_step:2/20 warmup_step:3/20 warmup_step:4/20 warmup_step:5/20 warmup_step:6/20 warmup_step:7/20 warmup_step:8/20 warmup_step:9/20 warmup_step:10/20 warmup_step:11/20 warmup_step:12/20 warmup_step:13/20 warmup_step:14/20 warmup_step:15/20 warmup_step:16/20 warmup_step:17/20 warmup_step:18/20 warmup_step:19/20 warmup_step:20/20 step:0/20000 val_loss:6.9373 val_bpb:4.1086 train_time:0ms step_avg:0.02ms step:1/20000 train_loss:6.9363 train_time:27ms step_avg:27.47ms step:2/20000 train_loss:14.7813 train_time:73ms step_avg:36.26ms step:3/20000 train_loss:8.1303 train_time:124ms step_avg:41.23ms step:4/20000 train_loss:6.6459 train_time:175ms step_avg:43.65ms step:5/20000 train_loss:6.9413 train_time:227ms step_avg:45.34ms step:6/20000 train_loss:7.8068 train_time:278ms step_avg:46.34ms step:7/20000 train_loss:6.8490 train_time:330ms step_avg:47.09ms step:8/20000 train_loss:6.4809 train_time:381ms step_avg:47.61ms step:9/20000 train_loss:6.2006 train_time:434ms step_avg:48.24ms step:10/20000 train_loss:6.0502 train_time:483ms step_avg:48.35ms step:200/20000 train_loss:2.7815 train_time:10297ms step_avg:51.48ms step:400/20000 train_loss:2.3049 train_time:20646ms step_avg:51.62ms step:600/20000 train_loss:2.4980 train_time:30990ms step_avg:51.65ms step:800/20000 train_loss:2.2491 train_time:41345ms step_avg:51.68ms step:1000/20000 train_loss:2.3361 train_time:51721ms step_avg:51.72ms step:1000/20000 val_loss:2.2898 val_bpb:1.3562 train_time:51754ms step_avg:51.75ms step:1200/20000 train_loss:2.3575 train_time:62108ms step_avg:51.76ms step:1400/20000 train_loss:2.3813 train_time:72504ms step_avg:51.79ms step:1600/20000 train_loss:2.0496 train_time:82894ms step_avg:51.81ms step:1800/20000 train_loss:2.1676 train_time:93289ms step_avg:51.83ms step:2000/20000 train_loss:2.2119 train_time:103684ms step_avg:51.84ms step:2000/20000 val_loss:2.1937 val_bpb:1.2993 train_time:103717ms step_avg:51.86ms step:2200/20000 train_loss:2.0320 train_time:114081ms step_avg:51.86ms step:2400/20000 train_loss:2.1652 train_time:124471ms step_avg:51.86ms step:2600/20000 train_loss:2.3821 train_time:134872ms step_avg:51.87ms step:2800/20000 train_loss:2.1987 train_time:145264ms step_avg:51.88ms step:3000/20000 train_loss:2.1912 train_time:155647ms step_avg:51.88ms step:3000/20000 val_loss:2.1534 val_bpb:1.2754 train_time:155680ms step_avg:51.89ms step:3200/20000 train_loss:2.1516 train_time:166037ms step_avg:51.89ms step:3400/20000 train_loss:2.1199 train_time:176417ms step_avg:51.89ms step:3600/20000 train_loss:2.0651 train_time:186795ms step_avg:51.89ms step:3800/20000 train_loss:2.1694 train_time:197173ms step_avg:51.89ms step:4000/20000 train_loss:2.1330 train_time:207554ms step_avg:51.89ms step:4000/20000 val_loss:2.1292 val_bpb:1.2610 train_time:207587ms step_avg:51.90ms step:4200/20000 train_loss:2.1284 train_time:217976ms step_avg:51.90ms step:4400/20000 train_loss:2.0686 train_time:228351ms step_avg:51.90ms step:4600/20000 train_loss:1.9371 train_time:238738ms step_avg:51.90ms step:4800/20000 train_loss:2.2171 train_time:249109ms step_avg:51.90ms step:5000/20000 train_loss:1.9744 train_time:259476ms step_avg:51.90ms step:5000/20000 val_loss:2.1127 val_bpb:1.2512 train_time:259509ms step_avg:51.90ms step:5200/20000 train_loss:2.1356 train_time:269848ms step_avg:51.89ms step:5400/20000 train_loss:2.1527 train_time:280217ms step_avg:51.89ms step:5600/20000 train_loss:2.1390 train_time:290578ms step_avg:51.89ms step:5800/20000 train_loss:2.0944 train_time:300933ms step_avg:51.89ms step:6000/20000 train_loss:2.1752 train_time:311294ms step_avg:51.88ms step:6000/20000 val_loss:2.1026 val_bpb:1.2453 train_time:311327ms step_avg:51.89ms step:6200/20000 train_loss:2.0458 train_time:321653ms step_avg:51.88ms step:6400/20000 train_loss:2.1240 train_time:332019ms step_avg:51.88ms step:6600/20000 train_loss:2.0830 train_time:342381ms step_avg:51.88ms step:6800/20000 train_loss:2.1434 train_time:352738ms step_avg:51.87ms step:7000/20000 train_loss:2.1907 train_time:363096ms step_avg:51.87ms step:7000/20000 val_loss:2.0916 val_bpb:1.2388 train_time:363129ms step_avg:51.88ms step:7200/20000 train_loss:2.1672 train_time:373450ms step_avg:51.87ms step:7400/20000 train_loss:2.0842 train_time:383806ms step_avg:51.87ms step:7600/20000 train_loss:1.9615 train_time:394163ms step_avg:51.86ms step:7800/20000 train_loss:2.1113 train_time:404518ms step_avg:51.86ms step:8000/20000 train_loss:2.0788 train_time:414870ms step_avg:51.86ms step:8000/20000 val_loss:2.0826 val_bpb:1.2334 train_time:414903ms step_avg:51.86ms step:8200/20000 train_loss:2.1505 train_time:425230ms step_avg:51.86ms step:8400/20000 train_loss:2.0933 train_time:435626ms step_avg:51.86ms step:8600/20000 train_loss:2.1070 train_time:445977ms step_avg:51.86ms step:8800/20000 train_loss:2.0708 train_time:456329ms step_avg:51.86ms step:9000/20000 train_loss:1.9882 train_time:466685ms step_avg:51.85ms step:9000/20000 val_loss:2.0772 val_bpb:1.2302 train_time:466718ms step_avg:51.86ms step:9200/20000 train_loss:2.0470 train_time:477091ms step_avg:51.86ms step:9400/20000 train_loss:2.0941 train_time:487469ms step_avg:51.86ms step:9600/20000 train_loss:2.1116 train_time:497817ms step_avg:51.86ms step:9800/20000 train_loss:2.0202 train_time:508164ms step_avg:51.85ms step:10000/20000 train_loss:2.0783 train_time:518510ms step_avg:51.85ms step:10000/20000 val_loss:2.0723 val_bpb:1.2274 train_time:518543ms step_avg:51.85ms step:10200/20000 train_loss:2.0342 train_time:528861ms step_avg:51.85ms step:10400/20000 train_loss:2.0587 train_time:539208ms step_avg:51.85ms step:10600/20000 train_loss:1.9323 train_time:549552ms step_avg:51.84ms step:10800/20000 train_loss:2.1371 train_time:559906ms step_avg:51.84ms step:11000/20000 train_loss:2.0567 train_time:570254ms step_avg:51.84ms step:11000/20000 val_loss:2.0458 val_bpb:1.2116 train_time:570286ms step_avg:51.84ms step:11200/20000 train_loss:2.0119 train_time:580608ms step_avg:51.84ms step:11400/20000 train_loss:1.9920 train_time:590954ms step_avg:51.84ms step:11575/20000 val_loss:2.0274 val_bpb:1.2007 train_time:600043ms step_avg:51.84ms stopping_early: wallclock_cap train_time:600043ms step:11575/20000 peak memory allocated: 10247 MiB reserved: 10312 MiB Serialized model: 67224983 bytes Code size: 47716 bytes Total submission size: 67272699 bytes Serialized model int8+zlib: 15813523 bytes (payload:17178912 raw_torch:17224025 payload_ratio:3.91x) Total submission size int8+zlib: 15861239 bytes final_int8_zlib_roundtrip val_loss:2.0366 val_bpb:1.2062 eval_time:1638ms final_int8_zlib_roundtrip_exact val_loss:2.03657529 val_bpb:1.20617460