From 93d77b197d457b1fdfa7341ecd59fc460b20d6b1 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Mon, 9 Feb 2026 11:23:15 -0600
Subject: =?UTF-8?q?Fix=20init=20state:=20add=20logit=5Fbias=20so=20A?=
 =?UTF-8?q?=E2=89=881=20at=20init=20(dense=20connectivity)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add learnable logit_bias=15.0 to PredictorMLP, so σ(15/τ_init) ≈ 0.95 at init, reproducing dense connectivity instead of random A≈0.25
- Fix dtype mismatch: cast A to model dtype (bfloat16) in DAGFormerOLMo.forward
- Fix YAML lr parsing: add type coercion in TrainConfig.from_yaml
- Fix device mismatch: call self.to(device) in StructurePredictor.__init__
- Add python -u for unbuffered SLURM output, TOKENIZERS_PARALLELISM=false
- Delete stale eval_cache.pt (built with buggy MLP input code)

Co-Authored-By: Claude Opus 4.6
---
 scripts/slurm_train.sh | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'scripts')

diff --git a/scripts/slurm_train.sh b/scripts/slurm_train.sh
index 6b283ea..e1df687 100644
--- a/scripts/slurm_train.sh
+++ b/scripts/slurm_train.sh
@@ -1,18 +1,9 @@
 #!/bin/bash
-#SBATCH --partition=gpuA40x4
-#SBATCH --account=bfqt-delta-gpu
-#SBATCH --nodes=1
-#SBATCH --gpus-per-node=1
-#SBATCH --time=02:00:00
-#SBATCH --mem=64g
-#SBATCH --job-name=dagformer-sanity
-#SBATCH --output=logs/sanity_%j.out
-#SBATCH --error=logs/sanity_%j.err
-
 export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache
 export TRANSFORMERS_CACHE=/projects/bfqt/users/yurenh2/hf_cache/transformers
 export HF_HUB_CACHE=/projects/bfqt/users/yurenh2/hf_cache/hub
 export HF_DATASETS_CACHE=/projects/bfqt/users/yurenh2/hf_cache/datasets
+export TOKENIZERS_PARALLELISM=false
 export PYTHONPATH=/projects/bfqt/users/yurenh2/ml-projects/DAGFormer:$PYTHONPATH
 export PATH=$HOME/.local/bin:$PATH
 
@@ -27,4 +18,4 @@ echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader)"
 echo ""
 
 echo "=== Starting training ==="
-python3 scripts/train.py --config configs/sanity_check.yaml
+python3 -u scripts/train.py --config configs/sanity_check.yaml
-- 
cgit v1.2.3