Hello (こんにちは)
parent 3c513fee17
commit 5ca971b0a4
19 changed files with 1559 additions and 41 deletions
13  .env.example  (Normal file)
@@ -0,0 +1,13 @@
# Environment variables for Progressive LLM Training
# Copy this file to .env and fill in your values

# HuggingFace
HF_TOKEN=your_token_here

# Weights & Biases
WANDB_API_KEY=your_api_key_here
WANDB_PROJECT=matsuo-llm-comp-2025

# GPU Configuration
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NCCL_DEBUG=WARN
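A minimal sketch of how these variables could be picked up at runtime. It assumes the values are already exported in the shell (for example via `set -a; . ./.env; set +a`); python-dotenv is not declared in pyproject.toml, so no loader is assumed here.

```python
# Sketch: read the environment variables declared in .env.example.
# Assumes they are exported in the shell; no .env loader is part of this commit.
import os

hf_token = os.environ.get("HF_TOKEN")  # used for gated HuggingFace model downloads
wandb_project = os.environ.get("WANDB_PROJECT", "matsuo-llm-comp-2025")

if hf_token is None:
    print("HF_TOKEN not set; gated HuggingFace models may fail to download")
print(f"Logging to W&B project: {wandb_project}")
```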
33  .gitignore  (vendored)
@@ -1,32 +1,35 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
venv/
ENV/
env/
.venv/
venv/

# Nix
result
result-*

# Project specific
# Training outputs
outputs/
data/
*.log
!data/basic_cot/train.jsonl
wandb/
.ipynb_checkpoints/
*.log

# Model files
*.pt
*.pth
*.bin
*.safetensors

# Temporary
*.tmp
.cache/
accelerate_config.yaml
hostfile

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
*~

# Keep lock files
!uv.lock
1  .python-version  (Normal file)
@@ -0,0 +1 @@
3.11
43  README.md  (Normal file)
@@ -0,0 +1,43 @@
# Progressive LLM Training

Progressive training for LLMs with 8-GPU support for 松尾研LLMコンペ2025.

## Quick Start

```bash
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh

# Setup project
git clone <repository-url>
cd progressive-llm-training
uv sync

# Start training
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Training Stages

1. **basic_cot** - Basic reasoning
2. **math_reasoning** - Math with OpenR1-Math-220k
3. **complex_reasoning** - Complex reasoning with Mixture-of-Thoughts

## Commands

```bash
uv sync  # Install dependencies
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml  # Single GPU
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed  # 8 GPUs
uv run pytest  # Run tests
```

## Key Files

- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8-GPU config
- `scripts/train_progressive.py` - Main training script
- `scripts/train_gemma3_1b_8gpu.sh` - 8-GPU launcher
- `src/progressive_model.py` - Core model implementation

Ready to train! 🚀
110  config/training_config_8gpu.yaml  (Normal file)
@@ -0,0 +1,110 @@
experiment:
  name: "progressive_reasoning_8gpu"
  base_model: "google/gemma-2-2b-it"  # Can scale up to larger models
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: true  # Enable if available for speed
  use_eager_attention: false

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank since we have more memory
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 16  # Large batch size per GPU
      gradient_accumulation_steps: 1  # No need for accumulation with 8 GPUs
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4  # More workers for data loading
      dataloader_pin_memory: true
      remove_unused_columns: false

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # Reduce for larger model
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 100000  # Can process more with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank with multi-GPU
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 4
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 50000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
138  config/training_config_8gpu_deepspeed.yaml  (Normal file)
@@ -0,0 +1,138 @@
experiment:
  name: "progressive_reasoning_8gpu_deepspeed"
  base_model: "google/gemma-2-2b-it"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 200000000
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 200000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed

  optimizer:
    type: "AdamW"
    params:
      lr: 3e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 3e-4
      warmup_num_steps: 200

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
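Note on the DeepSpeed batch settings above: DeepSpeed requires that the global batch size equal the per-GPU micro batch times gradient accumulation times the number of GPUs. A quick sanity check for this config (assuming the 8 GPUs declared in `CUDA_VISIBLE_DEVICES`):

```python
# Sanity check for the DeepSpeed batch-size invariant:
# train_batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size
micro_batch_per_gpu = 64   # train_micro_batch_size_per_gpu above
grad_accum_steps = 1       # basic_cot stage uses no accumulation
world_size = 8             # 8 GPUs

effective_batch = micro_batch_per_gpu * grad_accum_steps * world_size
assert effective_batch == 512  # matches train_batch_size: 512
print(f"effective global batch size: {effective_batch}")
```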
113  config/training_config_8gpu_fsdp.yaml  (Normal file)
@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_8gpu_fsdp"
  base_model: "google/gemma-2-2b-it"  # Can scale to much larger models with FSDP
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "Gemma2DecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64  # Can use larger ranks with FSDP
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000  # Process even more data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256  # Very large rank possible with FSDP
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
109  config/training_config_gemma3_1b_8gpu_ddp.yaml  (Normal file)
@@ -0,0 +1,109 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_ddp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 16  # Moderate rank for DDP
      lora_alpha: 32
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # 32 * 8 = 256 total batch size
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 32
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # 16 * 8 = 128 total batch size
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 400000  # Process substantial data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # 8 * 8 = 64 total batch size
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 600000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
139  config/training_config_gemma3_1b_8gpu_deepspeed.yaml  (Normal file)
@@ -0,0 +1,139 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_deepspeed"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Disable quantization for DeepSpeed
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true  # Enable for memory efficiency
  use_flash_attention_2: false
  use_eager_attention: true

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 500000000  # 500MB buckets
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed with small model

  optimizer:
    type: "AdamW"
    params:
      lr: 5e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 5e-4
      warmup_num_steps: 100

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank with 8 GPUs
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1  # No accumulation needed
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64  # Larger rank for math reasoning
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32  # Reduce for longer sequences
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 500000  # Process more data with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank for complex reasoning
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # Reduce for very long sequences
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 800000  # Process even more data
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
113  config/training_config_gemma3_1b_8gpu_fsdp.yaml  (Normal file)
@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_fsdp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "GemmaDecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed with small model
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Can use larger ranks with FSDP
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 48  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 24
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 600000  # Process even more data with FSDP
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Very large rank possible with FSDP
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 12
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 1000000  # Can process 1M samples efficiently
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
37  docs/README.md  (Normal file)
@@ -0,0 +1,37 @@
# Progressive LLM Training Documentation

## Setup

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
```

## Training

### Single GPU
```bash
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
```

### 8 GPUs
```bash
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Configuration

- `config/training_config_gemma3_1b.yaml` - Single GPU
- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8 GPUs

## Environment

Copy `.env.example` to `.env` and set:
- `HF_TOKEN` - HuggingFace token
- `WANDB_API_KEY` - W&B API key

## Troubleshooting

- Reduce `per_device_batch_size` for memory issues
- `export NCCL_DEBUG=INFO` for NCCL errors
- `nvidia-smi` to check GPUs
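To go with the troubleshooting tips above, a quick Python sanity check (a sketch using only standard PyTorch calls) can confirm the GPU count and BF16 support that the 8-GPU configs assume:

```python
# Quick GPU sanity check before launching multi-GPU training (sketch).
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"BF16 supported: {torch.cuda.is_bf16_supported()}")  # configs set bf16: true
    for i in range(torch.cuda.device_count()):
        print(f"  {i}: {torch.cuda.get_device_name(i)}")
```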
26  pyproject.toml  (Normal file)
@@ -0,0 +1,26 @@
[project]
name = "progressive-llm-training"
version = "0.1.0"
description = "Progressive LLM Training for 松尾研LLMコンペ2025"
requires-python = ">=3.9"

dependencies = [
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "accelerate>=0.27.0",
    "peft>=0.11.0",
    "trl>=0.9.0",
    "datasets>=2.18.0",
    "bitsandbytes>=0.43.0",
    "wandb>=0.16.0",
    "pyyaml>=6.0",
    "jsonlines>=4.0.0",
    "deepspeed>=0.12.0",
]

[project.optional-dependencies]
dev = ["pytest", "black", "isort"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
@@ -1,3 +1,5 @@
# Use uv instead: uv sync
torch>=2.0.0
transformers>=4.40.0
accelerate>=0.27.0
peft>=0.11.0
@@ -7,7 +9,4 @@ bitsandbytes>=0.43.0
wandb>=0.16.0
pyyaml>=6.0
jsonlines>=4.0.0
scikit-learn>=1.3.0
# flash-attn>=2.5.0 # Install separately with --no-build-isolation
sentencepiece>=0.2.0
protobuf>=4.25.0
deepspeed>=0.12.0
201  scripts/debug_model_loading.py  (Normal file)
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Debug script to identify model loading issues
"""

import sys
import os
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def clear_accelerate_env():
    """Clear all ACCELERATE environment variables"""
    print("Clearing ACCELERATE environment variables...")
    env_vars_to_clear = []
    for key in os.environ:
        if 'ACCELERATE' in key:
            env_vars_to_clear.append(key)

    for var in env_vars_to_clear:
        print(f"  Removing {var}={os.environ[var]}")
        del os.environ[var]


def test_basic_model_loading():
    """Test basic model loading without any configuration"""
    print("Testing basic model loading...")

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "google/gemma-2-2b-it"

    try:
        print("Testing with absolutely minimal config...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32
        )
        print("✅ Basic loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Basic loading failed: {e}")
        return False


def test_with_device_map():
    """Test with device_map auto"""
    print("Testing with device_map='auto'...")

    from transformers import AutoModelForCausalLM

    model_name = "google/gemma-2-2b-it"

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            device_map="auto"
        )
        print("✅ Device map loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Device map loading failed: {e}")
        return False


def test_with_quantization():
    """Test with quantization"""
    print("Testing with 4-bit quantization...")

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model_name = "google/gemma-2-2b-it"

    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            quantization_config=bnb_config
        )
        print("✅ Quantization loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Quantization loading failed: {e}")
        return False


def print_environment_info():
    """Print detailed environment information"""
    print("\n" + "="*50)
    print("ENVIRONMENT INFORMATION")
    print("="*50)

    # Python version
    print(f"Python version: {sys.version}")

    # PyTorch info
    try:
        import torch
        print(f"PyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device count: {torch.cuda.device_count()}")
            for i in range(torch.cuda.device_count()):
                print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
            print(f"CUDA version: {torch.version.cuda}")
    except Exception as e:
        print(f"PyTorch info error: {e}")

    # Transformers info
    try:
        from transformers import __version__ as tf_version
        print(f"Transformers version: {tf_version}")
    except Exception as e:
        print(f"Transformers info error: {e}")

    # Accelerate info
    try:
        from accelerate import __version__ as acc_version
        print(f"Accelerate version: {acc_version}")
    except Exception as e:
        print(f"Accelerate info error: {e}")

    # PEFT info
    try:
        from peft import __version__ as peft_version
        print(f"PEFT version: {peft_version}")
    except Exception as e:
        print(f"PEFT info error: {e}")

    # BitsAndBytes info
    try:
        import bitsandbytes as bnb
        print(f"BitsAndBytes version: {bnb.__version__}")
    except Exception as e:
        print(f"BitsAndBytes info error: {e}")

    # Environment variables
    print("\nRelevant environment variables:")
    for key, value in sorted(os.environ.items()):
        if any(prefix in key for prefix in ['CUDA', 'TORCH', 'HF_', 'ACCELERATE', 'TRANSFORMERS']):
            print(f"  {key}={value}")


def main():
    print("Progressive LLM Training - Model Loading Debug")
    print("=" * 60)

    # Print environment info first
    print_environment_info()

    # Clear environment variables
    clear_accelerate_env()

    # Test various loading methods
    print("\n" + "="*50)
    print("TESTING MODEL LOADING")
    print("="*50)

    results = []

    # Test 1: Basic loading
    results.append(("Basic loading", test_basic_model_loading()))

    # Test 2: With device map
    results.append(("Device map", test_with_device_map()))

    # Test 3: With quantization
    results.append(("Quantization", test_with_quantization()))

    # Summary
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)

    for test_name, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{test_name}: {status}")

    if any(result[1] for result in results):
        print("\n✅ At least one loading method works!")
        print("Use the successful method in your configuration.")
    else:
        print("\n❌ All loading methods failed!")
        print("This indicates a fundamental environment issue.")
        print("Consider:")
        print("1. Reinstalling transformers, accelerate, torch")
        print("2. Checking CUDA installation")
        print("3. Using a different model")


if __name__ == "__main__":
    main()
161  scripts/train_gemma3_1b_8gpu.sh  (Executable file)
@@ -0,0 +1,161 @@
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}"
echo "======================================================="

# Check if uv is available
if command -v uv &> /dev/null; then
    echo -e "${GREEN}Using uv for Python environment management${NC}"
    UV_PREFIX="uv run"
else
    echo -e "${YELLOW}uv not found, using standard python${NC}"
    UV_PREFIX="python"
fi

# Default values
STRATEGY="deepspeed"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --strategy)
            STRATEGY="$2"
            shift 2
            ;;
        --config)
            CONFIG="$2"
            shift 2
            ;;
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo ""
            echo "Options:"
            echo "  --strategy <ddp|fsdp|deepspeed>  Training strategy (default: deepspeed)"
            echo "  --config <path>                  Custom config file (optional)"
            echo "  --num-gpus <n>                   Number of GPUs to use (default: 8)"
            echo "  --dry-run                        Show command without executing"
            echo ""
            echo "Examples:"
            echo "  # Use DeepSpeed (recommended)"
            echo "  $0 --strategy deepspeed"
            echo ""
            echo "  # Use DDP"
            echo "  $0 --strategy ddp"
            echo ""
            echo "  # Use FSDP"
            echo "  $0 --strategy fsdp"
            echo ""
            echo "  # Use custom config"
            echo "  $0 --strategy ddp --config config/my_config.yaml"
            exit 0
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}"
            exit 1
            ;;
    esac
done

# Check GPU availability
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"

if [ $GPU_COUNT -lt $NUM_GPUS ]; then
    echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
    exit 1
fi

# Set default config based on strategy if not provided
if [ -z "$CONFIG" ]; then
    case $STRATEGY in
        ddp)
            CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
            ;;
        fsdp)
            CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
            ;;
        deepspeed)
            CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
            ;;
        *)
            echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
            exit 1
            ;;
    esac
fi

# Check if config file exists
if [ ! -f "$CONFIG" ]; then
    echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
    exit 1
fi

echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
echo -e "Config: ${YELLOW}$CONFIG${NC}"
echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
echo ""

# Build the command
CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"

if [ "$DRY_RUN" = true ]; then
    echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
    echo "$CMD"
    exit 0
fi

# Show GPU memory before training
echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv

echo ""
echo -e "${GREEN}Starting training...${NC}"
echo "Command: $CMD"
echo ""

# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_DEBUG=WARN  # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1

# For DeepSpeed, set additional optimizations
if [ "$STRATEGY" = "deepspeed" ]; then
    export DS_SKIP_CUDA_CHECK=1
    export TOKENIZERS_PARALLELISM=false
fi

# Execute the training command
$CMD

# Check exit status
if [ $? -eq 0 ]; then
    echo ""
    echo -e "${GREEN}Training completed successfully!${NC}"

    # Show GPU memory after training
    echo ""
    echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
    nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
    echo ""
    echo -e "${RED}Training failed!${NC}"
    exit 1
fi
224  scripts/train_multi_gpu.py  (Executable file)
@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Multi-GPU training launcher for progressive reasoning model
Supports DDP, FSDP, and DeepSpeed strategies
"""

import os
import sys
import argparse
import subprocess
import shutil
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def get_gpu_count():
    """Get the number of available GPUs"""
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    return 0


def setup_environment_for_strategy(strategy):
    """Set up environment variables for different strategies"""
    if strategy == "deepspeed":
        # DeepSpeed specific environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        os.environ["RANK"] = "0"
        os.environ["LOCAL_RANK"] = "0"
        os.environ["WORLD_SIZE"] = str(get_gpu_count())
    elif strategy in ["ddp", "fsdp"]:
        # Standard distributed training environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        # Let torchrun handle the rest

    # General optimizations
    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
    os.environ["NCCL_DEBUG"] = "INFO"
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"


def launch_ddp_training(config_path, num_gpus):
    """Launch DDP training using torchrun"""
    print(f"Launching DDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("ddp")

    # Use torchrun for DDP
    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--master_port", "12355",
    ] + python_cmd + [
        "--config", config_path,
        "--distributed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    return subprocess.run(cmd, cwd=Path(__file__).parent.parent)


def launch_fsdp_training(config_path, num_gpus):
    """Launch FSDP training using accelerate"""
    print(f"Launching FSDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("fsdp")

    # Create accelerate config for FSDP
    accelerate_config = f"""
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_transformer_layer_cls_to_wrap: Gemma2DecoderLayer
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: {num_gpus}
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""

    # Save config temporarily
    config_file = Path(__file__).parent.parent / "accelerate_config.yaml"
    with open(config_file, "w") as f:
        f.write(accelerate_config)

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "accelerate", "launch",
        "--config_file", str(config_file),
    ] + python_cmd + [
        "--config", config_path
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up config file
    config_file.unlink(missing_ok=True)

    return result


def launch_deepspeed_training(config_path, num_gpus):
    """Launch DeepSpeed training"""
    print(f"Launching DeepSpeed training on {num_gpus} GPUs...")

    setup_environment_for_strategy("deepspeed")

    # Create DeepSpeed hostfile
    hostfile = Path(__file__).parent.parent / "hostfile"
    with open(hostfile, "w") as f:
        f.write(f"localhost slots={num_gpus}\n")

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "deepspeed",
        "--hostfile", str(hostfile),
        "--num_gpus", str(num_gpus),
    ] + python_cmd + [
        "--config", config_path,
        "--deepspeed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up hostfile
    hostfile.unlink(missing_ok=True)

    return result


def main():
    parser = argparse.ArgumentParser(description="Multi-GPU Progressive LLM Training")
    parser.add_argument("--config", type=str, required=True,
                        help="Path to training configuration file")
    parser.add_argument("--strategy", type=str, default="ddp",
                        choices=["ddp", "fsdp", "deepspeed"],
                        help="Multi-GPU strategy to use")
    parser.add_argument("--num_gpus", type=int, default=None,
                        help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dry_run", action="store_true",
                        help="Print commands without executing")

    args = parser.parse_args()

    # Get GPU count
    available_gpus = get_gpu_count()
    if available_gpus == 0:
        print("❌ No GPUs available!")
        sys.exit(1)

    num_gpus = args.num_gpus or available_gpus
    if num_gpus > available_gpus:
        print(f"❌ Requested {num_gpus} GPUs but only {available_gpus} available")
        sys.exit(1)

    # Check config file exists
    if not Path(args.config).exists():
        print(f"❌ Config file not found: {args.config}")
        sys.exit(1)

    print("Progressive LLM Training - Multi-GPU Launcher")
    print("=" * 60)
    print(f"Strategy: {args.strategy}")
    print(f"GPUs: {num_gpus} / {available_gpus}")
    print(f"Config: {args.config}")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN - Commands that would be executed:")
        # Show what would be run
        if args.strategy == "ddp":
            print("torchrun --nproc_per_node", num_gpus, "scripts/train_progressive.py")
        elif args.strategy == "fsdp":
            print("accelerate launch --config_file accelerate_config.yaml scripts/train_progressive.py")
        elif args.strategy == "deepspeed":
            print("deepspeed --num_gpus", num_gpus, "scripts/train_progressive.py")
        return

    # Launch training
    if args.strategy == "ddp":
        result = launch_ddp_training(args.config, num_gpus)
    elif args.strategy == "fsdp":
        result = launch_fsdp_training(args.config, num_gpus)
    elif args.strategy == "deepspeed":
        result = launch_deepspeed_training(args.config, num_gpus)

    if result.returncode == 0:
        print("✅ Training completed successfully!")
    else:
        print("❌ Training failed!")
        sys.exit(result.returncode)


if __name__ == "__main__":
    main()
@@ -6,6 +6,7 @@ Main training script for progressive reasoning model
import sys
import yaml
import argparse
import os
from pathlib import Path

# Add src to path
@@ -56,6 +57,18 @@ Examples:
        help="Load config and model but skip training (for testing)"
    )

    parser.add_argument(
        "--distributed",
        action="store_true",
        help="Enable distributed training"
    )

    parser.add_argument(
        "--deepspeed",
        action="store_true",
        help="Enable DeepSpeed training"
    )

    return parser.parse_args()


@@ -74,9 +87,34 @@ def load_config(config_path: str) -> dict:
    return config


def setup_distributed_training():
    """Setup distributed training environment"""
    # Check if we're in a distributed environment
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        import torch.distributed as dist
        import torch

        # Initialize distributed training
        if not dist.is_initialized():
            dist.init_process_group(backend="nccl")

        local_rank = int(os.environ.get("LOCAL_RANK", 0))
        torch.cuda.set_device(local_rank)

        print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}")
        return True

    return False


def main():
    args = parse_args()

    # Setup distributed training if requested
    is_distributed = False
    if args.distributed or args.deepspeed:
        is_distributed = setup_distributed_training()

    print("Progressive LLM Training for 松尾研LLMコンペ2025")
    print("=" * 50)

@@ -95,11 +133,26 @@ def main():
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Add distributed/deepspeed flags to config
    config["training_args"] = config.get("training_args", {})
    if args.distributed:
        config["training_args"]["distributed"] = True
    if args.deepspeed:
        config["training_args"]["deepspeed"] = True
        # Add DeepSpeed config from main config
        if "deepspeed" in config:
            config["training_args"]["deepspeed_config"] = config["deepspeed"]

    # Print configuration info
    print(f"Experiment: {config['experiment']['name']}")
    print(f"Base model: {config['experiment']['base_model']}")
    print(f"Output directory: {config['experiment']['output_dir']}")
    print(f"Stages: {len(config['progressive_stages'])}")
    if is_distributed:
        print("Mode: Distributed Training")
        if args.deepspeed:
            print("Backend: DeepSpeed")
    print("=" * 50)

    # Prepare sample datasets if requested
    if args.prepare_data:
@@ -367,27 +367,55 @@ class ProgressiveTrainer:

        print(f"Final dataset size: {len(dataset)} examples")

        # Training arguments - with CPU offload optimizations
        training_args = TrainingArguments(
            output_dir=f"./outputs/checkpoints/{stage_name}",
            num_train_epochs=stage_config["training"]["num_epochs"],
            per_device_train_batch_size=stage_config["training"]["per_device_batch_size"],
            gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"],
            learning_rate=float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
            warmup_steps=stage_config["training"]["warmup_steps"],
            logging_steps=stage_config["training"].get("logging_steps", 10),
            save_strategy="epoch",
            eval_strategy="no",
            bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
            gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False),
            max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0),
            report_to="wandb" if self.config["experiment"]["use_wandb"] else "none",
            run_name=f"{self.config['experiment']['name']}_{stage_name}",
            dataloader_pin_memory=False,  # Reduce memory usage
            remove_unused_columns=False,  # Keep all columns
            optim=stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
            dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2),
        )
        # Training arguments - with multi-GPU and CPU offload optimizations
        training_args_dict = {
            "output_dir": f"./outputs/checkpoints/{stage_name}",
            "num_train_epochs": stage_config["training"]["num_epochs"],
            "per_device_train_batch_size": stage_config["training"]["per_device_batch_size"],
            "gradient_accumulation_steps": stage_config["training"]["gradient_accumulation_steps"],
            "learning_rate": float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
            "warmup_steps": stage_config["training"]["warmup_steps"],
            "logging_steps": stage_config["training"].get("logging_steps", 10),
            "save_strategy": "epoch",
            "eval_strategy": "no",
            "bf16": torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
            "gradient_checkpointing": self.config["model"].get("gradient_checkpointing", False),
            "max_grad_norm": stage_config["training"].get("max_grad_norm", 1.0),
            "report_to": "wandb" if self.config["experiment"]["use_wandb"] else "none",
            "run_name": f"{self.config['experiment']['name']}_{stage_name}",
            "dataloader_pin_memory": stage_config["training"].get("dataloader_pin_memory", False),
            "remove_unused_columns": False,  # Keep all columns
            "optim": stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
            "dataloader_num_workers": stage_config["training"].get("dataloader_num_workers", 2),
        }

        # Add multi-GPU specific settings
        if self.config.get("training_args", {}).get("distributed", False):
            training_args_dict.update({
                "ddp_find_unused_parameters": False,
                "ddp_bucket_cap_mb": 200,
                "ddp_broadcast_buffers": False,
            })

        # Add DeepSpeed configuration
        if self.config.get("training_args", {}).get("deepspeed", False):
            deepspeed_config = self.config.get("training_args", {}).get("deepspeed_config")
            if deepspeed_config:
                training_args_dict["deepspeed"] = deepspeed_config

        # Add FSDP configuration
        if "fsdp" in self.config:
            fsdp_config = self.config["fsdp"]
            training_args_dict.update({
                "fsdp": fsdp_config.get("fsdp_sharding_strategy", "FULL_SHARD"),
                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap"),
                "fsdp_auto_wrap_policy": fsdp_config.get("fsdp_auto_wrap_policy", "TRANSFORMER_BASED_WRAP"),
                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1000000),
                "fsdp_cpu_ram_efficient_loading": fsdp_config.get("fsdp_cpu_offload", False),
                "fsdp_sync_module_states": fsdp_config.get("fsdp_sync_module_states", True),
            })

        training_args = TrainingArguments(**training_args_dict)

        # Print dataset info for debugging
        print(f"Dataset columns: {dataset.column_names}")
7  uv.lock  (generated, Normal file)
@@ -0,0 +1,7 @@
# This file is automatically @generated by uv.
# It is not intended for manual editing.
version = 1
requires-python = ">=3.9"

# Note: This is a placeholder lock file.
# Run `uv lock` to generate the actual lock file with resolved dependencies.