summaryrefslogtreecommitdiff
path: root/records/track_10min_16mb/2026-03-18_LongContextSeq2048/train_seed1338.log
blob: 5851fe0a3c199e5f1a60527b3a98ab64edecf910 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
logs/seq2048_sxm28_seed1338_20260319a.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/root/parameter-golf-sxm28/data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:80
val_loader:shards pattern=/root/parameter-golf-sxm28/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632
model_params:17059912
world_size:8 grad_accum_steps:1
sdp_backends:cudnn=False flash=True mem_efficient=False math=False
attention_mode:gqa num_heads:8 num_kv_heads:4
tie_embeddings:True embed_lr:0.04 head_lr:0.0 matrix_lr:0.032 scalar_lr:0.032
train_batch_tokens:524288 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000
seed:1338
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:0/20000 val_loss:6.9373 val_bpb:4.1086 train_time:0ms step_avg:0.02ms
step:1/20000 train_loss:6.9363 train_time:27ms step_avg:27.47ms
step:2/20000 train_loss:14.7813 train_time:73ms step_avg:36.26ms
step:3/20000 train_loss:8.1303 train_time:124ms step_avg:41.23ms
step:4/20000 train_loss:6.6459 train_time:175ms step_avg:43.65ms
step:5/20000 train_loss:6.9413 train_time:227ms step_avg:45.34ms
step:6/20000 train_loss:7.8068 train_time:278ms step_avg:46.34ms
step:7/20000 train_loss:6.8490 train_time:330ms step_avg:47.09ms
step:8/20000 train_loss:6.4809 train_time:381ms step_avg:47.61ms
step:9/20000 train_loss:6.2006 train_time:434ms step_avg:48.24ms
step:10/20000 train_loss:6.0502 train_time:483ms step_avg:48.35ms
step:200/20000 train_loss:2.7815 train_time:10297ms step_avg:51.48ms
step:400/20000 train_loss:2.3049 train_time:20646ms step_avg:51.62ms
step:600/20000 train_loss:2.4980 train_time:30990ms step_avg:51.65ms
step:800/20000 train_loss:2.2491 train_time:41345ms step_avg:51.68ms
step:1000/20000 train_loss:2.3361 train_time:51721ms step_avg:51.72ms
step:1000/20000 val_loss:2.2898 val_bpb:1.3562 train_time:51754ms step_avg:51.75ms
step:1200/20000 train_loss:2.3575 train_time:62108ms step_avg:51.76ms
step:1400/20000 train_loss:2.3813 train_time:72504ms step_avg:51.79ms
step:1600/20000 train_loss:2.0496 train_time:82894ms step_avg:51.81ms
step:1800/20000 train_loss:2.1676 train_time:93289ms step_avg:51.83ms
step:2000/20000 train_loss:2.2119 train_time:103684ms step_avg:51.84ms
step:2000/20000 val_loss:2.1937 val_bpb:1.2993 train_time:103717ms step_avg:51.86ms
step:2200/20000 train_loss:2.0320 train_time:114081ms step_avg:51.86ms
step:2400/20000 train_loss:2.1652 train_time:124471ms step_avg:51.86ms
step:2600/20000 train_loss:2.3821 train_time:134872ms step_avg:51.87ms
step:2800/20000 train_loss:2.1987 train_time:145264ms step_avg:51.88ms
step:3000/20000 train_loss:2.1912 train_time:155647ms step_avg:51.88ms
step:3000/20000 val_loss:2.1534 val_bpb:1.2754 train_time:155680ms step_avg:51.89ms
step:3200/20000 train_loss:2.1516 train_time:166037ms step_avg:51.89ms
step:3400/20000 train_loss:2.1199 train_time:176417ms step_avg:51.89ms
step:3600/20000 train_loss:2.0651 train_time:186795ms step_avg:51.89ms
step:3800/20000 train_loss:2.1694 train_time:197173ms step_avg:51.89ms
step:4000/20000 train_loss:2.1330 train_time:207554ms step_avg:51.89ms
step:4000/20000 val_loss:2.1292 val_bpb:1.2610 train_time:207587ms step_avg:51.90ms
step:4200/20000 train_loss:2.1284 train_time:217976ms step_avg:51.90ms
step:4400/20000 train_loss:2.0686 train_time:228351ms step_avg:51.90ms
step:4600/20000 train_loss:1.9371 train_time:238738ms step_avg:51.90ms
step:4800/20000 train_loss:2.2171 train_time:249109ms step_avg:51.90ms
step:5000/20000 train_loss:1.9744 train_time:259476ms step_avg:51.90ms
step:5000/20000 val_loss:2.1127 val_bpb:1.2512 train_time:259509ms step_avg:51.90ms
step:5200/20000 train_loss:2.1356 train_time:269848ms step_avg:51.89ms
step:5400/20000 train_loss:2.1527 train_time:280217ms step_avg:51.89ms
step:5600/20000 train_loss:2.1390 train_time:290578ms step_avg:51.89ms
step:5800/20000 train_loss:2.0944 train_time:300933ms step_avg:51.89ms
step:6000/20000 train_loss:2.1752 train_time:311294ms step_avg:51.88ms
step:6000/20000 val_loss:2.1026 val_bpb:1.2453 train_time:311327ms step_avg:51.89ms
step:6200/20000 train_loss:2.0458 train_time:321653ms step_avg:51.88ms
step:6400/20000 train_loss:2.1240 train_time:332019ms step_avg:51.88ms
step:6600/20000 train_loss:2.0830 train_time:342381ms step_avg:51.88ms
step:6800/20000 train_loss:2.1434 train_time:352738ms step_avg:51.87ms
step:7000/20000 train_loss:2.1907 train_time:363096ms step_avg:51.87ms
step:7000/20000 val_loss:2.0916 val_bpb:1.2388 train_time:363129ms step_avg:51.88ms
step:7200/20000 train_loss:2.1672 train_time:373450ms step_avg:51.87ms
step:7400/20000 train_loss:2.0842 train_time:383806ms step_avg:51.87ms
step:7600/20000 train_loss:1.9615 train_time:394163ms step_avg:51.86ms
step:7800/20000 train_loss:2.1113 train_time:404518ms step_avg:51.86ms
step:8000/20000 train_loss:2.0788 train_time:414870ms step_avg:51.86ms
step:8000/20000 val_loss:2.0826 val_bpb:1.2334 train_time:414903ms step_avg:51.86ms
step:8200/20000 train_loss:2.1505 train_time:425230ms step_avg:51.86ms
step:8400/20000 train_loss:2.0933 train_time:435626ms step_avg:51.86ms
step:8600/20000 train_loss:2.1070 train_time:445977ms step_avg:51.86ms
step:8800/20000 train_loss:2.0708 train_time:456329ms step_avg:51.86ms
step:9000/20000 train_loss:1.9882 train_time:466685ms step_avg:51.85ms
step:9000/20000 val_loss:2.0772 val_bpb:1.2302 train_time:466718ms step_avg:51.86ms
step:9200/20000 train_loss:2.0470 train_time:477091ms step_avg:51.86ms
step:9400/20000 train_loss:2.0941 train_time:487469ms step_avg:51.86ms
step:9600/20000 train_loss:2.1116 train_time:497817ms step_avg:51.86ms
step:9800/20000 train_loss:2.0202 train_time:508164ms step_avg:51.85ms
step:10000/20000 train_loss:2.0783 train_time:518510ms step_avg:51.85ms
step:10000/20000 val_loss:2.0723 val_bpb:1.2274 train_time:518543ms step_avg:51.85ms
step:10200/20000 train_loss:2.0342 train_time:528861ms step_avg:51.85ms
step:10400/20000 train_loss:2.0587 train_time:539208ms step_avg:51.85ms
step:10600/20000 train_loss:1.9323 train_time:549552ms step_avg:51.84ms
step:10800/20000 train_loss:2.1371 train_time:559906ms step_avg:51.84ms
step:11000/20000 train_loss:2.0567 train_time:570254ms step_avg:51.84ms
step:11000/20000 val_loss:2.0458 val_bpb:1.2116 train_time:570286ms step_avg:51.84ms
step:11200/20000 train_loss:2.0119 train_time:580608ms step_avg:51.84ms
step:11400/20000 train_loss:1.9920 train_time:590954ms step_avg:51.84ms
step:11575/20000 val_loss:2.0274 val_bpb:1.2007 train_time:600043ms step_avg:51.84ms
stopping_early: wallclock_cap train_time:600043ms step:11575/20000
peak memory allocated: 10247 MiB reserved: 10312 MiB
Serialized model: 67224983 bytes
Code size: 47716 bytes
Total submission size: 67272699 bytes
Serialized model int8+zlib: 15813523 bytes (payload:17178912 raw_torch:17224025 payload_ratio:3.91x)
Total submission size int8+zlib: 15861239 bytes
final_int8_zlib_roundtrip val_loss:2.0366 val_bpb:1.2062 eval_time:1638ms
final_int8_zlib_roundtrip_exact val_loss:2.03657529 val_bpb:1.20617460