-rw-r--r--  Group-Entropy-Equalization/README.md | 44
-rw-r--r--  Group-Entropy-Equalization/train.py  | 24
2 files changed, 59 insertions, 9 deletions
diff --git a/Group-Entropy-Equalization/README.md b/Group-Entropy-Equalization/README.md
index 804af95..33bd020 100644
--- a/Group-Entropy-Equalization/README.md
+++ b/Group-Entropy-Equalization/README.md
@@ -13,6 +13,50 @@ pip install -r requirements.txt
 
 ---
 
+### Colab Quickstart (single-GPU, no DeepSpeed)
+
+In Colab, use a smaller model first to verify end-to-end. Then scale up if VRAM allows.
+
+```bash
+!git clone https://github.com/YurenHao0426/gee.git
+%cd /content/gee/Group-Entropy-Equalization
+!pip -q install transformers==4.44.2 accelerate==0.33.0 peft==0.12.0 bitsandbytes==0.43.3 datasets==2.21.0 wandb==0.17.7 pyarrow==17.0.0
+```
+
+Create a small parquet if you don’t have one:
+
+```python
+import os, pandas as pd
+os.makedirs("dataset/1shot_rlvr", exist_ok=True)
+df = pd.DataFrame({"problem": [
+    "What is 2 + 2?",
+    "If x=3, compute x^2 + 2x + 1.",
+    "The doctor is a ____.",
+    "Factor 12.",
+    "What is 7*8?",
+]})
+df_big = pd.concat([df]*256, ignore_index=True).iloc[:1280]
+df_big.to_parquet("dataset/1shot_rlvr/pi1_r1280.parquet", index=False)
+```
+
+Run training (no DeepSpeed, no AMP to avoid Colab GradScaler quirks):
+
+```bash
+!python train.py \
+  --model_name Qwen2.5-1.5B \
+  --model_path Qwen/Qwen2.5-1.5B \
+  --train_data dataset/1shot_rlvr/pi1_r1280.parquet \
+  --effective_batch 4 --micro_batch_size 1 \
+  --temperature 0.5 --learning_rate 2e-5 --sample_temp 0.5 \
+  --max_steps 10 --log_steps 1 --save_steps 10 \
+  --run_name colab_em10 --wandb_project one-shot-em \
+  --no_deepspeed --mixed_precision no
+```
+
+Checkpoints are saved under `checkpoints/<model>/<run_name>/`.
+
+---
+
 ### Reproducing One-shot EM Training (SOTA)
 
 ```bash
diff --git a/Group-Entropy-Equalization/train.py b/Group-Entropy-Equalization/train.py
index 11f658a..d1ba4f0 100644
--- a/Group-Entropy-Equalization/train.py
+++ b/Group-Entropy-Equalization/train.py
@@ -124,6 +124,9 @@ def main():
     if accelerator.is_main_process:
         wandb.init(project=args.wandb_project, name=args.run_name or args.wandb_name or args.model_name, config=vars(args))
 
+    # Friendly error if the parquet path is missing
+    if not os.path.exists(args.train_data):
+        raise FileNotFoundError(f"Training data not found: {args.train_data}. Create/upload the parquet under the project folder or pass --train_data to an existing path.")
     df = pd.read_parquet(args.train_data)
     train_data = [{"input": apply_chat_template(tokenizer, p)} for p in df["problem"].dropna().tolist()]
     train_loader = DataLoader(FTDataset(train_data), batch_size=micro_bs, shuffle=True, collate_fn=custom_collate)
@@ -146,15 +149,18 @@ def main():
                         max_length=2048).to(accelerator.device)
 
         with torch.no_grad():
-            gen_ids = accelerator.unwrap_model(model).generate(**enc,
-                                                               max_new_tokens=512,
-                                                               do_sample=True,
-                                                               top_p=0.95,
-                                                               temperature=args.sample_temp,
-                                                               synced_gpus=True,
-                                                               repetition_penalty=1.15,
-                                                               pad_token_id=tokenizer.pad_token_id,
-                                                               use_cache=False)
+            use_synced = getattr(accelerator, "num_processes", 1) and accelerator.num_processes > 1
+            gen_ids = accelerator.unwrap_model(model).generate(
+                **enc,
+                max_new_tokens=512,
+                do_sample=True,
+                top_p=0.95,
+                temperature=args.sample_temp,
+                synced_gpus=use_synced,
+                repetition_penalty=1.15,
+                pad_token_id=tokenizer.pad_token_id,
+                use_cache=False,
+            )
 
         seq = torch.cat([enc.input_ids, gen_ids[:, enc.input_ids.shape[1]:]], dim=1)[:, :4096]
         pad_mask = seq.ne(tokenizer.pad_token_id)
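A quick way to confirm the Colab quickstart actually produced something usable is to reload the saved weights. The sketch below is a minimal smoke test, assuming `train.py` writes a standard Hugging Face `save_pretrained` checkpoint under the `checkpoints/<model>/<run_name>/` layout mentioned in the README; the concrete path is built from the quickstart flags (`--model_name Qwen2.5-1.5B`, `--run_name colab_em10`), and any per-step subdirectory is an assumption.

```python
# Minimal sketch: reload the checkpoint written by the Colab quickstart run.
# Assumes a standard Hugging Face save_pretrained layout under
# checkpoints/<model>/<run_name>/ (exact subfolder naming is an assumption).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt_dir = "checkpoints/Qwen2.5-1.5B/colab_em10"  # hypothetical path from the quickstart flags

# If the tokenizer was not saved alongside the model, fall back to the base model id.
try:
    tok = AutoTokenizer.from_pretrained(ckpt_dir)
except OSError:
    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")

model = AutoModelForCausalLM.from_pretrained(ckpt_dir, torch_dtype=torch.float16, device_map="auto")

inputs = tok("What is 7*8?", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tok.decode(out[0], skip_special_tokens=True))
```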
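On the train.py side, the change gates `synced_gpus` on the number of launched processes: `synced_gpus=True` keeps every rank looping inside `generate` until all ranks finish, which only makes sense in a multi-process run with a torch.distributed process group, whereas a single-GPU Colab run has nothing to synchronize against. A slightly more defensive variant of the same guard, assuming a Hugging Face Accelerate `Accelerator`, might look like the sketch below (`should_sync_gpus` is a hypothetical helper, not part of the repo).

```python
# Sketch of the guard behind the synced_gpus change (hypothetical helper, not in train.py).
# Sync only when Accelerate launched multiple processes and a torch.distributed
# process group is actually available to synchronize against.
import torch.distributed as dist
from accelerate import Accelerator

def should_sync_gpus(accelerator: Accelerator) -> bool:
    return (
        accelerator.num_processes > 1
        and dist.is_available()
        and dist.is_initialized()
    )

# Usage, mirroring the call in the diff:
# gen_ids = model.generate(**enc, synced_gpus=should_sync_gpus(accelerator), ...)
```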
