Hello

parent 3c513fee17
commit 5ca971b0a4
19 changed files with 1559 additions and 41 deletions
13  .env.example  Normal file

@@ -0,0 +1,13 @@
# Environment variables for Progressive LLM Training
# Copy this file to .env and fill in your values

# HuggingFace
HF_TOKEN=your_token_here

# Weights & Biases
WANDB_API_KEY=your_api_key_here
WANDB_PROJECT=matsuo-llm-comp-2025

# GPU Configuration
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NCCL_DEBUG=WARN
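Illustrative aside (not from this commit): the variables above are plain environment variables, with HF_TOKEN read by huggingface_hub, WANDB_API_KEY and WANDB_PROJECT by wandb, and the last two by CUDA/NCCL. A minimal sketch of a pre-run check, assuming the values have been exported (or loaded from .env by your shell or python-dotenv, which is not among this project's dependencies):

```python
import os

# Hypothetical pre-run check that the expected variables are set.
required = ["HF_TOKEN", "WANDB_API_KEY", "WANDB_PROJECT"]
missing = [name for name in required if not os.environ.get(name)]
if missing:
    print("Missing environment variables:", ", ".join(missing),
          "- copy .env.example to .env and fill in your values")
else:
    print("GPUs visible:", os.environ.get("CUDA_VISIBLE_DEVICES", "<not set>"))
```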

35  .gitignore  vendored

@@ -1,32 +1,35 @@
 # Python
 __pycache__/
 *.py[cod]
-*$py.class
-*.so
-.Python
-venv/
-ENV/
-env/
 .venv/
+venv/

-# Nix
-result
-result-*
-
-# Project specific
+# Training outputs
 outputs/
 data/
-*.log
+!data/basic_cot/train.jsonl
 wandb/
-.ipynb_checkpoints/
+*.log

+# Model files
 *.pt
 *.pth
 *.bin
 *.safetensors

+# Temporary
+*.tmp
+.cache/
+accelerate_config.yaml
+hostfile

 # IDE
 .vscode/
 .idea/
-*.swp
-*.swo
-*~
+
+# OS
+.DS_Store
+*~
+
+# Keep lock files
+!uv.lock

1  .python-version  Normal file

@@ -0,0 +1 @@
3.11

43  README.md  Normal file

@@ -0,0 +1,43 @@
# Progressive LLM Training

Progressive training for LLMs with 8-GPU support for 松尾研LLMコンペ2025.

## Quick Start

```bash
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh

# Setup project
git clone <repository-url>
cd progressive-llm-training
uv sync

# Start training
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Training Stages

1. **basic_cot** - Basic reasoning
2. **math_reasoning** - Math with OpenR1-Math-220k
3. **complex_reasoning** - Complex reasoning with Mixture-of-Thoughts

## Commands

```bash
uv sync                                                                             # Install dependencies
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml  # Single GPU
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed                              # 8 GPUs
uv run pytest                                                                       # Run tests
```

## Key Files

- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8-GPU config
- `scripts/train_progressive.py` - Main training script
- `scripts/train_gemma3_1b_8gpu.sh` - 8-GPU launcher
- `src/progressive_model.py` - Core model implementation

Ready to train! 🚀
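Illustrative aside (not from this commit): the three stages listed above correspond to the `progressive_stages` entries in the YAML configs added below. A minimal sketch of reading them with PyYAML, using config/training_config_8gpu.yaml from this commit as the example path:

```python
import yaml

# List the progressive training stages defined in one of the configs below.
with open("config/training_config_8gpu.yaml") as f:
    config = yaml.safe_load(f)

for stage in config["progressive_stages"]:
    print(f"{stage['name']}: {stage['description']} (dataset: {stage['dataset_path']})")
```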

110  config/training_config_8gpu.yaml  Normal file

@@ -0,0 +1,110 @@
experiment:
  name: "progressive_reasoning_8gpu"
  base_model: "google/gemma-2-2b-it"  # Can scale up to larger models
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: true  # Enable if available for speed
  use_eager_attention: false

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank since we have more memory
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 16  # Large batch size per GPU
      gradient_accumulation_steps: 1  # No need for accumulation with 8 GPUs
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4  # More workers for data loading
      dataloader_pin_memory: true
      remove_unused_columns: false

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # Reduce for larger model
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 100000  # Can process more with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank with multi-GPU
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 4
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 50000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
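Illustrative aside (not from this commit): the `adapter_config` blocks above use standard PEFT LoRA fields, so they map directly onto `peft.LoraConfig`. A rough sketch, assuming `peft` is installed; the `task_type` value is an assumption, not something this config specifies:

```python
from peft import LoraConfig

def lora_config_from_stage(adapter_config: dict) -> LoraConfig:
    # Hypothetical helper: build a LoraConfig from one stage's adapter_config block.
    return LoraConfig(
        r=adapter_config["r"],
        lora_alpha=adapter_config["lora_alpha"],
        lora_dropout=adapter_config["lora_dropout"],
        target_modules=adapter_config["target_modules"],
        init_lora_weights=adapter_config["init_lora_weights"],
        task_type="CAUSAL_LM",  # assumed: causal-LM fine-tuning
    )

# Example with the basic_cot values from the config above.
cfg = lora_config_from_stage({
    "r": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.1,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "init_lora_weights": True,
})
```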

138  config/training_config_8gpu_deepspeed.yaml  Normal file

@@ -0,0 +1,138 @@
experiment:
  name: "progressive_reasoning_8gpu_deepspeed"
  base_model: "google/gemma-2-2b-it"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 200000000
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 200000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed

  optimizer:
    type: "AdamW"
    params:
      lr: 3e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 3e-4
      warmup_num_steps: 200

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
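Illustrative aside (not from this commit): later in this diff, train_progressive.py copies the `deepspeed:` block into `training_args` and the trainer forwards it to `TrainingArguments`, which accepts either a dict or a path to a JSON file. A minimal sketch of that hand-off, assuming `transformers` and `deepspeed` are installed:

```python
import yaml
from transformers import TrainingArguments

# Load the YAML above and hand its deepspeed block to the HF Trainer arguments.
with open("config/training_config_8gpu_deepspeed.yaml") as f:
    config = yaml.safe_load(f)

args = TrainingArguments(
    output_dir="./outputs/checkpoints/basic_cot",
    per_device_train_batch_size=64,
    bf16=True,
    deepspeed=config["deepspeed"],  # a dict or a JSON path are both accepted
)
```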

113  config/training_config_8gpu_fsdp.yaml  Normal file

@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_8gpu_fsdp"
  base_model: "google/gemma-2-2b-it"  # Can scale to much larger models with FSDP
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "Gemma2DecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64  # Can use larger ranks with FSDP
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000  # Process even more data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256  # Very large rank possible with FSDP
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
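Illustrative aside (not from this commit): the math_reasoning and complex_reasoning stages above declare `inherit_from`, so each stage only overrides part of its parent's settings. The real resolution logic lives in src/progressive_model.py, which this diff shows only in part; one plausible way to resolve such inheritance is a recursive merge (names below are illustrative, not the project's actual API):

```python
import copy

def resolve_stage(stages_by_name: dict, name: str) -> dict:
    # Hypothetical helper: merge a stage onto its inherit_from parent, recursively.
    stage = stages_by_name[name]
    parent_name = stage.get("inherit_from")
    if parent_name is None:
        return copy.deepcopy(stage)
    merged = resolve_stage(stages_by_name, parent_name)
    for key, value in stage.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = {**merged[key], **value}  # merge nested blocks like adapter_config
        else:
            merged[key] = value
    return merged
```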

109  config/training_config_gemma3_1b_8gpu_ddp.yaml  Normal file

@@ -0,0 +1,109 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_ddp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 16  # Moderate rank for DDP
      lora_alpha: 32
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # 32 * 8 = 256 total batch size
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 32
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # 16 * 8 = 128 total batch size
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 400000  # Process substantial data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # 8 * 8 = 64 total batch size
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 600000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"

139  config/training_config_gemma3_1b_8gpu_deepspeed.yaml  Normal file

@@ -0,0 +1,139 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_deepspeed"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Disable quantization for DeepSpeed
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true  # Enable for memory efficiency
  use_flash_attention_2: false
  use_eager_attention: true

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 500000000  # 500MB buckets
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed with small model

  optimizer:
    type: "AdamW"
    params:
      lr: 5e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 5e-4
      warmup_num_steps: 100

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank with 8 GPUs
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1  # No accumulation needed
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64  # Larger rank for math reasoning
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32  # Reduce for longer sequences
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 500000  # Process more data with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank for complex reasoning
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # Reduce for very long sequences
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 800000  # Process even more data
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
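Illustrative aside (not from this commit): DeepSpeed expects `train_batch_size` to equal the per-GPU micro-batch size times gradient accumulation times the number of GPUs; the basic_cot stage above satisfies this with 64 * 1 * 8 = 512. A trivial sketch of that sanity check:

```python
# Sanity check for the DeepSpeed batch-size invariant used in the config above.
micro_batch_per_gpu = 64
gradient_accumulation_steps = 1
num_gpus = 8
train_batch_size = 512

assert train_batch_size == micro_batch_per_gpu * gradient_accumulation_steps * num_gpus
```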

113  config/training_config_gemma3_1b_8gpu_fsdp.yaml  Normal file

@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_fsdp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "GemmaDecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed with small model
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Can use larger ranks with FSDP
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 48  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 24
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 600000  # Process even more data with FSDP
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Very large rank possible with FSDP
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 12
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 1000000  # Can process 1M samples efficiently
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"

37  docs/README.md  Normal file

@@ -0,0 +1,37 @@
# Progressive LLM Training Documentation

## Setup

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
```

## Training

### Single GPU
```bash
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
```

### 8 GPUs
```bash
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Configuration

- `config/training_config_gemma3_1b.yaml` - Single GPU
- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8 GPUs

## Environment

Copy `.env.example` to `.env` and set:
- `HF_TOKEN` - HuggingFace token
- `WANDB_API_KEY` - W&B API key

## Troubleshooting

- Reduce `per_device_batch_size` for memory issues
- `export NCCL_DEBUG=INFO` for NCCL errors
- `nvidia-smi` to check GPUs
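Illustrative aside (not from this commit): to go with the troubleshooting list above, a small pre-flight check in Python, assuming torch is installed:

```python
import torch

# Verify GPU visibility and BF16 support before an 8-GPU launch.
if not torch.cuda.is_available():
    raise SystemExit("No CUDA devices visible; check drivers and CUDA_VISIBLE_DEVICES")

count = torch.cuda.device_count()
print(f"Visible GPUs: {count}")
print(f"BF16 supported: {torch.cuda.is_bf16_supported()}")
if count < 8:
    print("Fewer than 8 GPUs visible; pass --num-gpus to the launcher or use the single-GPU config")
```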

26  pyproject.toml  Normal file

@@ -0,0 +1,26 @@
[project]
name = "progressive-llm-training"
version = "0.1.0"
description = "Progressive LLM Training for 松尾研LLMコンペ2025"
requires-python = ">=3.9"

dependencies = [
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "accelerate>=0.27.0",
    "peft>=0.11.0",
    "trl>=0.9.0",
    "datasets>=2.18.0",
    "bitsandbytes>=0.43.0",
    "wandb>=0.16.0",
    "pyyaml>=6.0",
    "jsonlines>=4.0.0",
    "deepspeed>=0.12.0",
]

[project.optional-dependencies]
dev = ["pytest", "black", "isort"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

requirements.txt

@@ -1,3 +1,5 @@
+# Use uv instead: uv sync
+torch>=2.0.0
 transformers>=4.40.0
 accelerate>=0.27.0
 peft>=0.11.0

@@ -7,7 +9,4 @@ bitsandbytes>=0.43.0
 wandb>=0.16.0
 pyyaml>=6.0
 jsonlines>=4.0.0
-scikit-learn>=1.3.0
-# flash-attn>=2.5.0  # Install separately with --no-build-isolation
-sentencepiece>=0.2.0
-protobuf>=4.25.0
+deepspeed>=0.12.0

201  scripts/debug_model_loading.py  Normal file

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Debug script to identify model loading issues
"""

import sys
import os
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def clear_accelerate_env():
    """Clear all ACCELERATE environment variables"""
    print("Clearing ACCELERATE environment variables...")
    env_vars_to_clear = []
    for key in os.environ:
        if 'ACCELERATE' in key:
            env_vars_to_clear.append(key)

    for var in env_vars_to_clear:
        print(f"  Removing {var}={os.environ[var]}")
        del os.environ[var]


def test_basic_model_loading():
    """Test basic model loading without any configuration"""
    print("Testing basic model loading...")

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "google/gemma-2-2b-it"

    try:
        print("Testing with absolutely minimal config...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32
        )
        print("✅ Basic loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Basic loading failed: {e}")
        return False


def test_with_device_map():
    """Test with device_map auto"""
    print("Testing with device_map='auto'...")

    from transformers import AutoModelForCausalLM

    model_name = "google/gemma-2-2b-it"

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            device_map="auto"
        )
        print("✅ Device map loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Device map loading failed: {e}")
        return False


def test_with_quantization():
    """Test with quantization"""
    print("Testing with 4-bit quantization...")

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model_name = "google/gemma-2-2b-it"

    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            quantization_config=bnb_config
        )
        print("✅ Quantization loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Quantization loading failed: {e}")
        return False


def print_environment_info():
    """Print detailed environment information"""
    print("\n" + "="*50)
    print("ENVIRONMENT INFORMATION")
    print("="*50)

    # Python version
    print(f"Python version: {sys.version}")

    # PyTorch info
    try:
        import torch
        print(f"PyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device count: {torch.cuda.device_count()}")
            for i in range(torch.cuda.device_count()):
                print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
            print(f"CUDA version: {torch.version.cuda}")
    except Exception as e:
        print(f"PyTorch info error: {e}")

    # Transformers info
    try:
        from transformers import __version__ as tf_version
        print(f"Transformers version: {tf_version}")
    except Exception as e:
        print(f"Transformers info error: {e}")

    # Accelerate info
    try:
        from accelerate import __version__ as acc_version
        print(f"Accelerate version: {acc_version}")
    except Exception as e:
        print(f"Accelerate info error: {e}")

    # PEFT info
    try:
        from peft import __version__ as peft_version
        print(f"PEFT version: {peft_version}")
    except Exception as e:
        print(f"PEFT info error: {e}")

    # BitsAndBytes info
    try:
        import bitsandbytes as bnb
        print(f"BitsAndBytes version: {bnb.__version__}")
    except Exception as e:
        print(f"BitsAndBytes info error: {e}")

    # Environment variables
    print("\nRelevant environment variables:")
    for key, value in sorted(os.environ.items()):
        if any(prefix in key for prefix in ['CUDA', 'TORCH', 'HF_', 'ACCELERATE', 'TRANSFORMERS']):
            print(f"  {key}={value}")


def main():
    print("Progressive LLM Training - Model Loading Debug")
    print("=" * 60)

    # Print environment info first
    print_environment_info()

    # Clear environment variables
    clear_accelerate_env()

    # Test various loading methods
    print("\n" + "="*50)
    print("TESTING MODEL LOADING")
    print("="*50)

    results = []

    # Test 1: Basic loading
    results.append(("Basic loading", test_basic_model_loading()))

    # Test 2: With device map
    results.append(("Device map", test_with_device_map()))

    # Test 3: With quantization
    results.append(("Quantization", test_with_quantization()))

    # Summary
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)

    for test_name, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{test_name}: {status}")

    if any(result[1] for result in results):
        print("\n✅ At least one loading method works!")
        print("Use the successful method in your configuration.")
    else:
        print("\n❌ All loading methods failed!")
        print("This indicates a fundamental environment issue.")
        print("Consider:")
        print("1. Reinstalling transformers, accelerate, torch")
        print("2. Checking CUDA installation")
        print("3. Using a different model")


if __name__ == "__main__":
    main()

161  scripts/train_gemma3_1b_8gpu.sh  Executable file

@@ -0,0 +1,161 @@
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}"
echo "======================================================="

# Check if uv is available
if command -v uv &> /dev/null; then
    echo -e "${GREEN}Using uv for Python environment management${NC}"
    UV_PREFIX="uv run"
else
    echo -e "${YELLOW}uv not found, using standard python${NC}"
    UV_PREFIX="python"
fi

# Default values
STRATEGY="deepspeed"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --strategy)
            STRATEGY="$2"
            shift 2
            ;;
        --config)
            CONFIG="$2"
            shift 2
            ;;
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo ""
            echo "Options:"
            echo "  --strategy <ddp|fsdp|deepspeed>  Training strategy (default: deepspeed)"
            echo "  --config <path>                  Custom config file (optional)"
            echo "  --num-gpus <n>                   Number of GPUs to use (default: 8)"
            echo "  --dry-run                        Show command without executing"
            echo ""
            echo "Examples:"
            echo "  # Use DeepSpeed (recommended)"
            echo "  $0 --strategy deepspeed"
            echo ""
            echo "  # Use DDP"
            echo "  $0 --strategy ddp"
            echo ""
            echo "  # Use FSDP"
            echo "  $0 --strategy fsdp"
            echo ""
            echo "  # Use custom config"
            echo "  $0 --strategy ddp --config config/my_config.yaml"
            exit 0
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}"
            exit 1
            ;;
    esac
done

# Check GPU availability
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"

if [ $GPU_COUNT -lt $NUM_GPUS ]; then
    echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
    exit 1
fi

# Set default config based on strategy if not provided
if [ -z "$CONFIG" ]; then
    case $STRATEGY in
        ddp)
            CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
            ;;
        fsdp)
            CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
            ;;
        deepspeed)
            CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
            ;;
        *)
            echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
            exit 1
            ;;
    esac
fi

# Check if config file exists
if [ ! -f "$CONFIG" ]; then
    echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
    exit 1
fi

echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
echo -e "Config: ${YELLOW}$CONFIG${NC}"
echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
echo ""

# Build the command
CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"

if [ "$DRY_RUN" = true ]; then
    echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
    echo "$CMD"
    exit 0
fi

# Show GPU memory before training
echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv

echo ""
echo -e "${GREEN}Starting training...${NC}"
echo "Command: $CMD"
echo ""

# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_DEBUG=WARN  # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1

# For DeepSpeed, set additional optimizations
if [ "$STRATEGY" = "deepspeed" ]; then
    export DS_SKIP_CUDA_CHECK=1
    export TOKENIZERS_PARALLELISM=false
fi

# Execute the training command
$CMD

# Check exit status
if [ $? -eq 0 ]; then
    echo ""
    echo -e "${GREEN}Training completed successfully!${NC}"

    # Show GPU memory after training
    echo ""
    echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
    nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
    echo ""
    echo -e "${RED}Training failed!${NC}"
    exit 1
fi

224  scripts/train_multi_gpu.py  Executable file

@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Multi-GPU training launcher for progressive reasoning model
Supports DDP, FSDP, and DeepSpeed strategies
"""

import os
import sys
import argparse
import subprocess
import shutil
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def get_gpu_count():
    """Get the number of available GPUs"""
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    return 0


def setup_environment_for_strategy(strategy):
    """Set up environment variables for different strategies"""
    if strategy == "deepspeed":
        # DeepSpeed specific environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        os.environ["RANK"] = "0"
        os.environ["LOCAL_RANK"] = "0"
        os.environ["WORLD_SIZE"] = str(get_gpu_count())
    elif strategy in ["ddp", "fsdp"]:
        # Standard distributed training environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        # Let torchrun handle the rest

    # General optimizations
    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
    os.environ["NCCL_DEBUG"] = "INFO"
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"


def launch_ddp_training(config_path, num_gpus):
    """Launch DDP training using torchrun"""
    print(f"Launching DDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("ddp")

    # Use torchrun for DDP
    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--master_port", "12355",
    ] + python_cmd + [
        "--config", config_path,
        "--distributed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    return subprocess.run(cmd, cwd=Path(__file__).parent.parent)


def launch_fsdp_training(config_path, num_gpus):
    """Launch FSDP training using accelerate"""
    print(f"Launching FSDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("fsdp")

    # Create accelerate config for FSDP
    accelerate_config = f"""
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_transformer_layer_cls_to_wrap: Gemma2DecoderLayer
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: {num_gpus}
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""

    # Save config temporarily
    config_file = Path(__file__).parent.parent / "accelerate_config.yaml"
    with open(config_file, "w") as f:
        f.write(accelerate_config)

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "accelerate", "launch",
        "--config_file", str(config_file),
    ] + python_cmd + [
        "--config", config_path
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up config file
    config_file.unlink(missing_ok=True)

    return result


def launch_deepspeed_training(config_path, num_gpus):
    """Launch DeepSpeed training"""
    print(f"Launching DeepSpeed training on {num_gpus} GPUs...")

    setup_environment_for_strategy("deepspeed")

    # Create DeepSpeed hostfile
    hostfile = Path(__file__).parent.parent / "hostfile"
    with open(hostfile, "w") as f:
        f.write(f"localhost slots={num_gpus}\n")

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "deepspeed",
        "--hostfile", str(hostfile),
        "--num_gpus", str(num_gpus),
    ] + python_cmd + [
        "--config", config_path,
        "--deepspeed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up hostfile
    hostfile.unlink(missing_ok=True)

    return result


def main():
    parser = argparse.ArgumentParser(description="Multi-GPU Progressive LLM Training")
    parser.add_argument("--config", type=str, required=True,
                        help="Path to training configuration file")
    parser.add_argument("--strategy", type=str, default="ddp",
                        choices=["ddp", "fsdp", "deepspeed"],
                        help="Multi-GPU strategy to use")
    parser.add_argument("--num_gpus", type=int, default=None,
                        help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dry_run", action="store_true",
                        help="Print commands without executing")

    args = parser.parse_args()

    # Get GPU count
    available_gpus = get_gpu_count()
    if available_gpus == 0:
        print("❌ No GPUs available!")
        sys.exit(1)

    num_gpus = args.num_gpus or available_gpus
    if num_gpus > available_gpus:
        print(f"❌ Requested {num_gpus} GPUs but only {available_gpus} available")
        sys.exit(1)

    # Check config file exists
    if not Path(args.config).exists():
        print(f"❌ Config file not found: {args.config}")
        sys.exit(1)

    print("Progressive LLM Training - Multi-GPU Launcher")
    print("=" * 60)
    print(f"Strategy: {args.strategy}")
    print(f"GPUs: {num_gpus} / {available_gpus}")
    print(f"Config: {args.config}")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN - Commands that would be executed:")
        # Show what would be run
        if args.strategy == "ddp":
            print("torchrun --nproc_per_node", num_gpus, "scripts/train_progressive.py")
        elif args.strategy == "fsdp":
            print("accelerate launch --config_file accelerate_config.yaml scripts/train_progressive.py")
        elif args.strategy == "deepspeed":
            print("deepspeed --num_gpus", num_gpus, "scripts/train_progressive.py")
        return

    # Launch training
    if args.strategy == "ddp":
        result = launch_ddp_training(args.config, num_gpus)
    elif args.strategy == "fsdp":
        result = launch_fsdp_training(args.config, num_gpus)
    elif args.strategy == "deepspeed":
        result = launch_deepspeed_training(args.config, num_gpus)

    if result.returncode == 0:
        print("✅ Training completed successfully!")
    else:
        print("❌ Training failed!")
        sys.exit(result.returncode)


if __name__ == "__main__":
    main()

scripts/train_progressive.py

@@ -6,6 +6,7 @@ Main training script for progressive reasoning model
 import sys
 import yaml
 import argparse
+import os
 from pathlib import Path

 # Add src to path

@@ -56,6 +57,18 @@ Examples:
         help="Load config and model but skip training (for testing)"
     )

+    parser.add_argument(
+        "--distributed",
+        action="store_true",
+        help="Enable distributed training"
+    )
+
+    parser.add_argument(
+        "--deepspeed",
+        action="store_true",
+        help="Enable DeepSpeed training"
+    )
+
     return parser.parse_args()

@@ -74,9 +87,34 @@ def load_config(config_path: str) -> dict:
     return config


+def setup_distributed_training():
+    """Setup distributed training environment"""
+    # Check if we're in a distributed environment
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        import torch.distributed as dist
+        import torch
+
+        # Initialize distributed training
+        if not dist.is_initialized():
+            dist.init_process_group(backend="nccl")
+
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        torch.cuda.set_device(local_rank)
+
+        print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}")
+        return True
+
+    return False
+
+
 def main():
     args = parse_args()

+    # Setup distributed training if requested
+    is_distributed = False
+    if args.distributed or args.deepspeed:
+        is_distributed = setup_distributed_training()
+
     print("Progressive LLM Training for 松尾研LLMコンペ2025")
     print("=" * 50)

@@ -95,11 +133,26 @@
         print(f"Error loading config: {e}")
         sys.exit(1)

+    # Add distributed/deepspeed flags to config
+    config["training_args"] = config.get("training_args", {})
+    if args.distributed:
+        config["training_args"]["distributed"] = True
+    if args.deepspeed:
+        config["training_args"]["deepspeed"] = True
+        # Add DeepSpeed config from main config
+        if "deepspeed" in config:
+            config["training_args"]["deepspeed_config"] = config["deepspeed"]
+
     # Print configuration info
     print(f"Experiment: {config['experiment']['name']}")
     print(f"Base model: {config['experiment']['base_model']}")
     print(f"Output directory: {config['experiment']['output_dir']}")
     print(f"Stages: {len(config['progressive_stages'])}")
+    if is_distributed:
+        print("Mode: Distributed Training")
+        if args.deepspeed:
+            print("Backend: DeepSpeed")
+    print("=" * 50)

     # Prepare sample datasets if requested
     if args.prepare_data:

@@ -367,27 +367,55 @@ class ProgressiveTrainer:

         print(f"Final dataset size: {len(dataset)} examples")

-        # Training arguments - with CPU offload optimizations
-        training_args = TrainingArguments(
-            output_dir=f"./outputs/checkpoints/{stage_name}",
-            num_train_epochs=stage_config["training"]["num_epochs"],
-            per_device_train_batch_size=stage_config["training"]["per_device_batch_size"],
-            gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"],
-            learning_rate=float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
-            warmup_steps=stage_config["training"]["warmup_steps"],
-            logging_steps=stage_config["training"].get("logging_steps", 10),
-            save_strategy="epoch",
-            eval_strategy="no",
-            bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
-            gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False),
-            max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0),
-            report_to="wandb" if self.config["experiment"]["use_wandb"] else "none",
-            run_name=f"{self.config['experiment']['name']}_{stage_name}",
-            dataloader_pin_memory=False,  # Reduce memory usage
-            remove_unused_columns=False,  # Keep all columns
-            optim=stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
-            dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2),
-        )
+        # Training arguments - with multi-GPU and CPU offload optimizations
+        training_args_dict = {
+            "output_dir": f"./outputs/checkpoints/{stage_name}",
+            "num_train_epochs": stage_config["training"]["num_epochs"],
+            "per_device_train_batch_size": stage_config["training"]["per_device_batch_size"],
+            "gradient_accumulation_steps": stage_config["training"]["gradient_accumulation_steps"],
+            "learning_rate": float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
+            "warmup_steps": stage_config["training"]["warmup_steps"],
+            "logging_steps": stage_config["training"].get("logging_steps", 10),
+            "save_strategy": "epoch",
+            "eval_strategy": "no",
+            "bf16": torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
+            "gradient_checkpointing": self.config["model"].get("gradient_checkpointing", False),
+            "max_grad_norm": stage_config["training"].get("max_grad_norm", 1.0),
+            "report_to": "wandb" if self.config["experiment"]["use_wandb"] else "none",
+            "run_name": f"{self.config['experiment']['name']}_{stage_name}",
+            "dataloader_pin_memory": stage_config["training"].get("dataloader_pin_memory", False),
+            "remove_unused_columns": False,  # Keep all columns
+            "optim": stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
+            "dataloader_num_workers": stage_config["training"].get("dataloader_num_workers", 2),
+        }
+
+        # Add multi-GPU specific settings
+        if self.config.get("training_args", {}).get("distributed", False):
+            training_args_dict.update({
+                "ddp_find_unused_parameters": False,
+                "ddp_bucket_cap_mb": 200,
+                "ddp_broadcast_buffers": False,
+            })
+
+        # Add DeepSpeed configuration
+        if self.config.get("training_args", {}).get("deepspeed", False):
+            deepspeed_config = self.config.get("training_args", {}).get("deepspeed_config")
+            if deepspeed_config:
+                training_args_dict["deepspeed"] = deepspeed_config
+
+        # Add FSDP configuration
+        if "fsdp" in self.config:
+            fsdp_config = self.config["fsdp"]
+            training_args_dict.update({
+                "fsdp": fsdp_config.get("fsdp_sharding_strategy", "FULL_SHARD"),
+                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap"),
+                "fsdp_auto_wrap_policy": fsdp_config.get("fsdp_auto_wrap_policy", "TRANSFORMER_BASED_WRAP"),
+                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1000000),
+                "fsdp_cpu_ram_efficient_loading": fsdp_config.get("fsdp_cpu_offload", False),
+                "fsdp_sync_module_states": fsdp_config.get("fsdp_sync_module_states", True),
+            })
+
+        training_args = TrainingArguments(**training_args_dict)

         # Print dataset info for debugging
         print(f"Dataset columns: {dataset.column_names}")

7  uv.lock  generated  Normal file

@@ -0,0 +1,7 @@
# This file is automatically @generated by uv.
# It is not intended for manual editing.
version = 1
requires-python = ">=3.9"

# Note: This is a placeholder lock file.
# Run `uv lock` to generate the actual lock file with resolved dependencies.