Hello

Soma Nakamura 2025-07-10 22:25:11 +09:00
parent 3c513fee17
commit 5ca971b0a4
19 changed files with 1559 additions and 41 deletions

13
.env.example Normal file

@@ -0,0 +1,13 @@
# Environment variables for Progressive LLM Training
# Copy this file to .env and fill in your values
# HuggingFace
HF_TOKEN=your_token_here
# Weights & Biases
WANDB_API_KEY=your_api_key_here
WANDB_PROJECT=matsuo-llm-comp-2025
# GPU Configuration
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NCCL_DEBUG=WARN

35
.gitignore vendored

@@ -1,32 +1,35 @@
 # Python
 __pycache__/
 *.py[cod]
-*$py.class
-*.so
-.Python
-venv/
-ENV/
-env/
 .venv/
+venv/
-# Nix
-result
-result-*
-# Project specific
+# Training outputs
 outputs/
 data/
-*.log
+!data/basic_cot/train.jsonl
 wandb/
-.ipynb_checkpoints/
+*.log
+# Model files
 *.pt
 *.pth
 *.bin
 *.safetensors
-# Temporary
-*.tmp
-.cache/
+accelerate_config.yaml
+hostfile
 # IDE
 .vscode/
 .idea/
-*.swp
-*.swo
-*~
+# OS
+.DS_Store
+*~
+# Keep lock files
+!uv.lock

1
.python-version Normal file

@@ -0,0 +1 @@
3.11

43
README.md Normal file

@@ -0,0 +1,43 @@
# Progressive LLM Training
Progressive training for LLMs with 8-GPU support for 松尾研LLMコンペ2025.
## Quick Start
```bash
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Setup project
git clone <repository-url>
cd progressive-llm-training
uv sync
# Start training (single GPU)
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml

# Or launch all 8 GPUs with DeepSpeed
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```
## Training Stages
1. **basic_cot** - Basic reasoning (a sample record is sketched after this list)
2. **math_reasoning** - Math with OpenR1-Math-220k
3. **complex_reasoning** - Complex reasoning with Mixture-of-Thoughts
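
The `basic_cot` stage reads local JSONL data from `data/basic_cot/` (note that `.gitignore` explicitly keeps `data/basic_cot/train.jsonl`). The exact record schema is defined by the project's data-preparation code, which is not shown here; the sketch below writes one hypothetical record, with the field names `prompt` and `response` assumed purely for illustration.

```python
import json
import os

# Hypothetical example record for data/basic_cot/train.jsonl.
# The real schema comes from the project's data-preparation step; the
# field names "prompt" and "response" are assumptions, not confirmed.
example = {
    "prompt": "What is 17 + 25? Think step by step.",
    "response": "17 + 25 = 17 + 20 + 5 = 42. The answer is 42.",
}

os.makedirs("data/basic_cot", exist_ok=True)
with open("data/basic_cot/train.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(example, ensure_ascii=False) + "\n")
```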
## Commands
```bash
uv sync # Install dependencies
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml # Single GPU
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed # 8 GPUs
uv run pytest # Run tests
```
## Key Files
- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8-GPU config
- `scripts/train_progressive.py` - Main training script
- `scripts/train_gemma3_1b_8gpu.sh` - 8-GPU launcher
- `src/progressive_model.py` - Core model implementation
Ready to train! 🚀


@@ -0,0 +1,110 @@
experiment:
name: "progressive_reasoning_8gpu"
base_model: "google/gemma-2-2b-it" # Can scale up to larger models
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false # Can use FP16/BF16 with multiple GPUs
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
device_map: "balanced" # Distribute across all GPUs
gradient_checkpointing: true
use_flash_attention_2: true # Enable if available for speed
use_eager_attention: false
# Multi-GPU specific settings
distributed:
strategy: "ddp" # Distributed Data Parallel
find_unused_parameters: false
gradient_as_bucket_view: true
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 32 # Larger rank since we have more memory
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 16 # Large batch size per GPU
gradient_accumulation_steps: 1 # No need for accumulation with 8 GPUs
learning_rate: 5e-4
warmup_steps: 100
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 50
logging_steps: 10
dataloader_num_workers: 4 # More workers for data loading
dataloader_pin_memory: true
remove_unused_columns: false
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k"
inherit_from: "basic_cot"
adapter_config:
r: 64
lora_alpha: 128
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 8 # Reduce for larger model
gradient_accumulation_steps: 2
learning_rate: 3e-4
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 20
dataloader_num_workers: 4
dataset_config:
streaming: true
max_samples: 100000 # Can process more with 8 GPUs
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts"
inherit_from: "math_reasoning"
adapter_config:
r: 128 # Maximum rank with multi-GPU
lora_alpha: 256
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 2e-4
warmup_steps: 300
max_length: 8192
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 200
logging_steps: 50
dataloader_num_workers: 4
dataset_config:
streaming: true
max_samples: 50000
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"
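
Each stage's `adapter_config` block corresponds field-for-field to a PEFT `LoraConfig`. A minimal sketch of how the `basic_cot` block above might be instantiated, assuming the loader passes the values through unchanged (the actual wiring lives in `src/progressive_model.py`, which is not part of this listing):

```python
from peft import LoraConfig, TaskType

# Values copied from the basic_cot adapter_config above.
stage_adapter = {
    "r": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.1,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "init_lora_weights": True,
}

# Sketch: turn the YAML block into a LoraConfig for a causal LM.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=stage_adapter["r"],
    lora_alpha=stage_adapter["lora_alpha"],
    lora_dropout=stage_adapter["lora_dropout"],
    target_modules=stage_adapter["target_modules"],
    init_lora_weights=stage_adapter["init_lora_weights"],
)
```

Later stages only grow `r`/`lora_alpha` and widen `target_modules`, so the same mapping applies to every stage.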


@@ -0,0 +1,138 @@
experiment:
name: "progressive_reasoning_8gpu_deepspeed"
base_model: "google/gemma-2-2b-it"
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false
device_map: null # Let DeepSpeed handle device placement
gradient_checkpointing: true
use_flash_attention_2: true
use_eager_attention: false
# DeepSpeed Configuration
deepspeed:
zero_optimization:
stage: 2 # ZeRO Stage 2 (partition optimizer states and gradients)
allgather_partitions: true
allgather_bucket_size: 200000000
overlap_comm: true
reduce_scatter: true
reduce_bucket_size: 200000000
contiguous_gradients: true
cpu_offload: false # Keep on GPU for speed
optimizer:
type: "AdamW"
params:
lr: 3e-4
betas: [0.9, 0.999]
eps: 1e-8
weight_decay: 0.001
scheduler:
type: "WarmupLR"
params:
warmup_min_lr: 0
warmup_max_lr: 3e-4
warmup_num_steps: 200
fp16:
enabled: false
bf16:
enabled: true
gradient_clipping: 1.0
train_batch_size: 512 # Total batch size across all GPUs
train_micro_batch_size_per_gpu: 64 # Per-GPU batch size
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 64
lora_alpha: 128
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 64 # Large batch with DeepSpeed
gradient_accumulation_steps: 1
learning_rate: 5e-4
warmup_steps: 100
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 50
logging_steps: 10
dataloader_num_workers: 8
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k"
inherit_from: "basic_cot"
adapter_config:
r: 128
lora_alpha: 256
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 32
gradient_accumulation_steps: 1
learning_rate: 3e-4
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 20
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 200000
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts"
inherit_from: "math_reasoning"
adapter_config:
r: 256
lora_alpha: 512
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 16
gradient_accumulation_steps: 2
learning_rate: 2e-4
warmup_steps: 300
max_length: 8192
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 200
logging_steps: 50
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 100000
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"
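
DeepSpeed enforces that `train_batch_size` equals `train_micro_batch_size_per_gpu × gradient_accumulation_steps × world_size`. With the values above (64 per GPU, no accumulation, 8 GPUs) that works out to 512, matching the configured total. A quick arithmetic check:

```python
# Sanity check of the DeepSpeed batch-size invariant used above.
world_size = 8                       # GPUs in this setup
micro_batch_per_gpu = 64             # train_micro_batch_size_per_gpu
gradient_accumulation_steps = 1      # implied by the stage configs above

train_batch_size = micro_batch_per_gpu * gradient_accumulation_steps * world_size
assert train_batch_size == 512, train_batch_size
print(f"effective train_batch_size = {train_batch_size}")
```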


@@ -0,0 +1,113 @@
experiment:
name: "progressive_reasoning_8gpu_fsdp"
base_model: "google/gemma-2-2b-it" # Can scale to much larger models with FSDP
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false
device_map: null # Let FSDP handle device placement
gradient_checkpointing: true
use_flash_attention_2: true
use_eager_attention: false
# FSDP Configuration
fsdp:
fsdp_transformer_layer_cls_to_wrap: "Gemma2DecoderLayer" # Wrap at layer level
fsdp_sharding_strategy: "FULL_SHARD" # Shard parameters, gradients, and optimizer states
fsdp_cpu_offload: false # Keep on GPU for speed
fsdp_mixed_precision: true # Use BF16 mixed precision
fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
fsdp_min_num_params: 1000000 # Wrap layers with >1M parameters
fsdp_sync_module_states: true
fsdp_forward_prefetch: true
fsdp_use_orig_params: true # Important for LoRA compatibility
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 64 # Can use larger ranks with FSDP
lora_alpha: 128
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 32 # Very large batch size with FSDP
gradient_accumulation_steps: 1
learning_rate: 5e-4
warmup_steps: 100
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 50
logging_steps: 10
dataloader_num_workers: 8
dataloader_pin_memory: true
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k"
inherit_from: "basic_cot"
adapter_config:
r: 128
lora_alpha: 256
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 16
gradient_accumulation_steps: 2
learning_rate: 3e-4
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 20
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 200000 # Process even more data
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts"
inherit_from: "math_reasoning"
adapter_config:
r: 256 # Very large rank possible with FSDP
lora_alpha: 512
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 8
gradient_accumulation_steps: 4
learning_rate: 2e-4
warmup_steps: 300
max_length: 8192
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 200
logging_steps: 50
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 100000
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"


@@ -0,0 +1,109 @@
experiment:
name: "progressive_reasoning_gemma3_1b_8gpu_ddp"
base_model: "google/gemma-3-1b-pt"
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false # Can use FP16/BF16 with multiple GPUs
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
device_map: "balanced" # Distribute across all GPUs
gradient_checkpointing: true
use_flash_attention_2: false
use_eager_attention: true
# Multi-GPU specific settings
distributed:
strategy: "ddp" # Distributed Data Parallel
find_unused_parameters: false
gradient_as_bucket_view: true
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 16 # Moderate rank for DDP
lora_alpha: 32
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 32 # 32 * 8 = 256 total batch size
gradient_accumulation_steps: 1
learning_rate: 5e-4
warmup_steps: 100
max_length: 1024
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 50
logging_steps: 10
dataloader_num_workers: 4
dataloader_pin_memory: true
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k"
inherit_from: "basic_cot"
adapter_config:
r: 32
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 16 # 16 * 8 = 128 total batch size
gradient_accumulation_steps: 2
learning_rate: 3e-4
warmup_steps: 200
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 20
dataloader_num_workers: 4
dataset_config:
streaming: true
max_samples: 400000 # Process substantial data
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts"
inherit_from: "math_reasoning"
adapter_config:
r: 64
lora_alpha: 128
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 8 # 8 * 8 = 64 total batch size
gradient_accumulation_steps: 4
learning_rate: 2e-4
warmup_steps: 300
max_length: 4096
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 200
logging_steps: 50
dataloader_num_workers: 4
dataset_config:
streaming: true
max_samples: 600000
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"
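
Stages chain through `inherit_from` (`math_reasoning` builds on `basic_cot`, and `complex_reasoning` on `math_reasoning`). How that inheritance is resolved is up to `src/progressive_model.py`, which is not shown in this commit; the sketch below is one plausible reading, assuming a shallow merge in which the child stage's top-level keys override its parent's.

```python
# Hypothetical sketch of inherit_from resolution; the real logic lives in
# src/progressive_model.py and may differ.
def resolve_stage(stages: list, name: str) -> dict:
    by_name = {s["name"]: s for s in stages}
    stage = by_name[name]
    parent_name = stage.get("inherit_from")
    if parent_name is None:
        return dict(stage)
    merged = resolve_stage(stages, parent_name)  # resolve the parent chain first
    merged.update(stage)                         # child keys win on conflict
    return merged
```

Under this reading, a child stage replaces whole top-level blocks such as `adapter_config` and `training` rather than merging them key by key.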


@@ -0,0 +1,139 @@
experiment:
name: "progressive_reasoning_gemma3_1b_8gpu_deepspeed"
base_model: "google/gemma-3-1b-pt"
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false # Disable quantization for DeepSpeed
device_map: null # Let DeepSpeed handle device placement
gradient_checkpointing: true # Enable for memory efficiency
use_flash_attention_2: false
use_eager_attention: true
# DeepSpeed Configuration
deepspeed:
zero_optimization:
stage: 2 # ZeRO Stage 2 (partition optimizer states and gradients)
allgather_partitions: true
allgather_bucket_size: 500000000 # 500MB buckets
overlap_comm: true
reduce_scatter: true
reduce_bucket_size: 500000000
contiguous_gradients: true
cpu_offload: false # Keep on GPU for speed with small model
optimizer:
type: "AdamW"
params:
lr: 5e-4
betas: [0.9, 0.999]
eps: 1e-8
weight_decay: 0.001
scheduler:
type: "WarmupLR"
params:
warmup_min_lr: 0
warmup_max_lr: 5e-4
warmup_num_steps: 100
fp16:
enabled: false
bf16:
enabled: true
gradient_clipping: 1.0
train_batch_size: 512 # Total batch size across all GPUs
train_micro_batch_size_per_gpu: 64 # Per-GPU batch size
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 32 # Larger rank with 8 GPUs
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 64 # Large batch with DeepSpeed
gradient_accumulation_steps: 1 # No accumulation needed
learning_rate: 5e-4
warmup_steps: 100
max_length: 1024
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 50
logging_steps: 10
dataloader_num_workers: 8
dataloader_pin_memory: true
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k"
inherit_from: "basic_cot"
adapter_config:
r: 64 # Larger rank for math reasoning
lora_alpha: 128
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 32 # Reduce for longer sequences
gradient_accumulation_steps: 1
learning_rate: 3e-4
warmup_steps: 200
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 20
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 500000 # Process more data with 8 GPUs
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts"
inherit_from: "math_reasoning"
adapter_config:
r: 128 # Maximum rank for complex reasoning
lora_alpha: 256
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 16 # Reduce for very long sequences
gradient_accumulation_steps: 2
learning_rate: 2e-4
warmup_steps: 300
max_length: 4096
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 200
logging_steps: 50
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 800000 # Process even more data
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"


@@ -0,0 +1,113 @@
experiment:
name: "progressive_reasoning_gemma3_1b_8gpu_fsdp"
base_model: "google/gemma-3-1b-pt"
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false
device_map: null # Let FSDP handle device placement
gradient_checkpointing: true
use_flash_attention_2: false
use_eager_attention: true
# FSDP Configuration
fsdp:
fsdp_transformer_layer_cls_to_wrap: "GemmaDecoderLayer" # Wrap at layer level
fsdp_sharding_strategy: "FULL_SHARD" # Shard parameters, gradients, and optimizer states
fsdp_cpu_offload: false # Keep on GPU for speed with small model
fsdp_mixed_precision: true # Use BF16 mixed precision
fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
fsdp_min_num_params: 1000000 # Wrap layers with >1M parameters
fsdp_sync_module_states: true
fsdp_forward_prefetch: true
fsdp_use_orig_params: true # Important for LoRA compatibility
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 32 # Can use larger ranks with FSDP
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 48 # Very large batch size with FSDP
gradient_accumulation_steps: 1
learning_rate: 5e-4
warmup_steps: 100
max_length: 1024
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 50
logging_steps: 10
dataloader_num_workers: 8
dataloader_pin_memory: true
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k"
inherit_from: "basic_cot"
adapter_config:
r: 64
lora_alpha: 128
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 24
gradient_accumulation_steps: 2
learning_rate: 3e-4
warmup_steps: 200
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 20
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 600000 # Process even more data with FSDP
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts"
inherit_from: "math_reasoning"
adapter_config:
r: 128 # Very large rank possible with FSDP
lora_alpha: 256
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 12
gradient_accumulation_steps: 4
learning_rate: 2e-4
warmup_steps: 300
max_length: 4096
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 200
logging_steps: 50
dataloader_num_workers: 8
dataset_config:
streaming: true
max_samples: 1000000 # Can process 1M samples efficiently
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"

37
docs/README.md Normal file

@@ -0,0 +1,37 @@
# Progressive LLM Training Documentation
## Setup
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
```
## Training
### Single GPU
```bash
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
```
### 8 GPUs
```bash
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```
## Configuration
- `config/training_config_gemma3_1b.yaml` - Single GPU
- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8 GPUs
## Environment
Copy `.env.example` to `.env` and set:
- `HF_TOKEN` - HuggingFace token
- `WANDB_API_KEY` - W&B API key (a loading sketch follows this list)
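
The scripts read these values from the process environment; whether they also load `.env` automatically is not shown in this commit. A minimal sketch that loads `.env` by hand (assuming plain `KEY=VALUE` lines) and verifies both keys:

```python
import os
from pathlib import Path

# Sketch: push .env entries into os.environ (simple KEY=VALUE lines only).
env_file = Path(".env")
if env_file.exists():
    for line in env_file.read_text().splitlines():
        line = line.strip()
        if line and not line.startswith("#") and "=" in line:
            key, _, value = line.partition("=")
            os.environ.setdefault(key.strip(), value.strip())

for required in ("HF_TOKEN", "WANDB_API_KEY"):
    if not os.environ.get(required):
        raise SystemExit(f"{required} is not set; see .env.example")
```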
## Troubleshooting
- Out-of-memory errors: reduce `per_device_batch_size` in the stage config
- NCCL errors: `export NCCL_DEBUG=INFO` for verbose logs
- GPU visibility: check with `nvidia-smi` (a PyTorch-side check is sketched below)
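
Beyond `nvidia-smi`, a PyTorch-side sketch (assuming `torch` from the project dependencies is installed) confirms that all eight devices are visible before launching the 8-GPU scripts:

```python
import torch

# List the GPUs PyTorch can see; the 8-GPU configs expect eight devices.
print("CUDA available:", torch.cuda.is_available())
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 2**30:.1f} GiB")
```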

26
pyproject.toml Normal file

@@ -0,0 +1,26 @@
[project]
name = "progressive-llm-training"
version = "0.1.0"
description = "Progressive LLM Training for 松尾研LLMコンペ2025"
requires-python = ">=3.9"
dependencies = [
"torch>=2.0.0",
"transformers>=4.40.0",
"accelerate>=0.27.0",
"peft>=0.11.0",
"trl>=0.9.0",
"datasets>=2.18.0",
"bitsandbytes>=0.43.0",
"wandb>=0.16.0",
"pyyaml>=6.0",
"jsonlines>=4.0.0",
"deepspeed>=0.12.0",
]
[project.optional-dependencies]
dev = ["pytest", "black", "isort"]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"


@@ -1,3 +1,5 @@
+# Use uv instead: uv sync
+torch>=2.0.0
 transformers>=4.40.0
 accelerate>=0.27.0
 peft>=0.11.0
@@ -7,7 +9,4 @@ bitsandbytes>=0.43.0
 wandb>=0.16.0
 pyyaml>=6.0
 jsonlines>=4.0.0
-scikit-learn>=1.3.0
-# flash-attn>=2.5.0  # Install separately with --no-build-isolation
-sentencepiece>=0.2.0
-protobuf>=4.25.0
+deepspeed>=0.12.0


@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Debug script to identify model loading issues
"""
import sys
import os
import torch
from pathlib import Path
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
def clear_accelerate_env():
"""Clear all ACCELERATE environment variables"""
print("Clearing ACCELERATE environment variables...")
env_vars_to_clear = []
for key in os.environ:
if 'ACCELERATE' in key:
env_vars_to_clear.append(key)
for var in env_vars_to_clear:
print(f" Removing {var}={os.environ[var]}")
del os.environ[var]
def test_basic_model_loading():
"""Test basic model loading without any configuration"""
print("Testing basic model loading...")
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "google/gemma-2-2b-it"
try:
print("Testing with absolutely minimal config...")
model = AutoModelForCausalLM.from_pretrained(
model_name,
trust_remote_code=True,
torch_dtype=torch.float32
)
print("✅ Basic loading successful!")
del model
return True
except Exception as e:
print(f"❌ Basic loading failed: {e}")
return False
def test_with_device_map():
"""Test with device_map auto"""
print("Testing with device_map='auto'...")
from transformers import AutoModelForCausalLM
model_name = "google/gemma-2-2b-it"
try:
model = AutoModelForCausalLM.from_pretrained(
model_name,
trust_remote_code=True,
torch_dtype=torch.float32,
device_map="auto"
)
print("✅ Device map loading successful!")
del model
return True
except Exception as e:
print(f"❌ Device map loading failed: {e}")
return False
def test_with_quantization():
"""Test with quantization"""
print("Testing with 4-bit quantization...")
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
model_name = "google/gemma-2-2b-it"
try:
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4"
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
trust_remote_code=True,
quantization_config=bnb_config
)
print("✅ Quantization loading successful!")
del model
return True
except Exception as e:
print(f"❌ Quantization loading failed: {e}")
return False
def print_environment_info():
"""Print detailed environment information"""
print("\n" + "="*50)
print("ENVIRONMENT INFORMATION")
print("="*50)
# Python version
print(f"Python version: {sys.version}")
# PyTorch info
try:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"CUDA device count: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f" Device {i}: {torch.cuda.get_device_name(i)}")
print(f"CUDA version: {torch.version.cuda}")
except Exception as e:
print(f"PyTorch info error: {e}")
# Transformers info
try:
from transformers import __version__ as tf_version
print(f"Transformers version: {tf_version}")
except Exception as e:
print(f"Transformers info error: {e}")
# Accelerate info
try:
from accelerate import __version__ as acc_version
print(f"Accelerate version: {acc_version}")
except Exception as e:
print(f"Accelerate info error: {e}")
# PEFT info
try:
from peft import __version__ as peft_version
print(f"PEFT version: {peft_version}")
except Exception as e:
print(f"PEFT info error: {e}")
# BitsAndBytes info
try:
import bitsandbytes as bnb
print(f"BitsAndBytes version: {bnb.__version__}")
except Exception as e:
print(f"BitsAndBytes info error: {e}")
# Environment variables
print("\nRelevant environment variables:")
for key, value in sorted(os.environ.items()):
if any(prefix in key for prefix in ['CUDA', 'TORCH', 'HF_', 'ACCELERATE', 'TRANSFORMERS']):
print(f" {key}={value}")
def main():
print("Progressive LLM Training - Model Loading Debug")
print("=" * 60)
# Print environment info first
print_environment_info()
# Clear environment variables
clear_accelerate_env()
# Test various loading methods
print("\n" + "="*50)
print("TESTING MODEL LOADING")
print("="*50)
results = []
# Test 1: Basic loading
results.append(("Basic loading", test_basic_model_loading()))
# Test 2: With device map
results.append(("Device map", test_with_device_map()))
# Test 3: With quantization
results.append(("Quantization", test_with_quantization()))
# Summary
print("\n" + "="*50)
print("SUMMARY")
print("="*50)
for test_name, success in results:
status = "✅ PASS" if success else "❌ FAIL"
print(f"{test_name}: {status}")
if any(result[1] for result in results):
print("\n✅ At least one loading method works!")
print("Use the successful method in your configuration.")
else:
print("\n❌ All loading methods failed!")
print("This indicates a fundamental environment issue.")
print("Consider:")
print("1. Reinstalling transformers, accelerate, torch")
print("2. Checking CUDA installation")
print("3. Using a different model")
if __name__ == "__main__":
main()

161
scripts/train_gemma3_1b_8gpu.sh Executable file

@@ -0,0 +1,161 @@
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)
# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}"
echo "======================================================="
# Check if uv is available
if command -v uv &> /dev/null; then
echo -e "${GREEN}Using uv for Python environment management${NC}"
UV_PREFIX="uv run"
else
echo -e "${YELLOW}uv not found, using standard python${NC}"
UV_PREFIX="python"
fi
# Default values
STRATEGY="deepspeed"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--strategy)
STRATEGY="$2"
shift 2
;;
--config)
CONFIG="$2"
shift 2
;;
--num-gpus)
NUM_GPUS="$2"
shift 2
;;
--dry-run)
DRY_RUN=true
shift
;;
-h|--help)
echo "Usage: $0 [options]"
echo ""
echo "Options:"
echo " --strategy <ddp|fsdp|deepspeed> Training strategy (default: deepspeed)"
echo " --config <path> Custom config file (optional)"
echo " --num-gpus <n> Number of GPUs to use (default: 8)"
echo " --dry-run Show command without executing"
echo ""
echo "Examples:"
echo " # Use DeepSpeed (recommended)"
echo " $0 --strategy deepspeed"
echo ""
echo " # Use DDP"
echo " $0 --strategy ddp"
echo ""
echo " # Use FSDP"
echo " $0 --strategy fsdp"
echo ""
echo " # Use custom config"
echo " $0 --strategy ddp --config config/my_config.yaml"
exit 0
;;
*)
echo -e "${RED}Error: Unknown option $1${NC}"
exit 1
;;
esac
done
# Check GPU availability
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"
if [ $GPU_COUNT -lt $NUM_GPUS ]; then
echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
exit 1
fi
# Set default config based on strategy if not provided
if [ -z "$CONFIG" ]; then
case $STRATEGY in
ddp)
CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
;;
fsdp)
CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
;;
deepspeed)
CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
;;
*)
echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
exit 1
;;
esac
fi
# Check if config file exists
if [ ! -f "$CONFIG" ]; then
echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
exit 1
fi
echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
echo -e "Config: ${YELLOW}$CONFIG${NC}"
echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
echo ""
# Build the command
CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"
if [ "$DRY_RUN" = true ]; then
echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
echo "$CMD"
exit 0
fi
# Show GPU memory before training
echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
echo ""
echo -e "${GREEN}Starting training...${NC}"
echo "Command: $CMD"
echo ""
# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_DEBUG=WARN # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1
# For DeepSpeed, set additional optimizations
if [ "$STRATEGY" = "deepspeed" ]; then
export DS_SKIP_CUDA_CHECK=1
export TOKENIZERS_PARALLELISM=false
fi
# Execute the training command
$CMD
# Check exit status
if [ $? -eq 0 ]; then
echo ""
echo -e "${GREEN}Training completed successfully!${NC}"
# Show GPU memory after training
echo ""
echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
echo ""
echo -e "${RED}Training failed!${NC}"
exit 1
fi

224
scripts/train_multi_gpu.py Executable file

@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Multi-GPU training launcher for progressive reasoning model
Supports DDP, FSDP, and DeepSpeed strategies
"""
import os
import sys
import argparse
import subprocess
import shutil
import torch
from pathlib import Path
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
def get_gpu_count():
"""Get the number of available GPUs"""
if torch.cuda.is_available():
return torch.cuda.device_count()
return 0
def setup_environment_for_strategy(strategy):
"""Set up environment variables for different strategies"""
if strategy == "deepspeed":
# DeepSpeed specific environment
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = str(get_gpu_count())
elif strategy in ["ddp", "fsdp"]:
# Standard distributed training environment
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
# Let torchrun handle the rest
# General optimizations
os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
def launch_ddp_training(config_path, num_gpus):
"""Launch DDP training using torchrun"""
print(f"Launching DDP training on {num_gpus} GPUs...")
setup_environment_for_strategy("ddp")
# Use torchrun for DDP
# Check if uv is available
python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]
cmd = [
"torchrun",
"--nproc_per_node", str(num_gpus),
"--master_port", "12355",
] + python_cmd + [
"--config", config_path,
"--distributed"
]
print(f"Running command: {' '.join(cmd)}")
return subprocess.run(cmd, cwd=Path(__file__).parent.parent)
def launch_fsdp_training(config_path, num_gpus):
"""Launch FSDP training using accelerate"""
print(f"Launching FSDP training on {num_gpus} GPUs...")
setup_environment_for_strategy("fsdp")
# Create accelerate config for FSDP
accelerate_config = f"""
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_backward_prefetch: BACKWARD_PRE
fsdp_cpu_ram_efficient_loading: true
fsdp_forward_prefetch: false
fsdp_offload_params: false
fsdp_sharding_strategy: FULL_SHARD
fsdp_state_dict_type: SHARDED_STATE_DICT
fsdp_sync_module_states: true
fsdp_transformer_layer_cls_to_wrap: Gemma2DecoderLayer
fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: {num_gpus}
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""
# Save config temporarily
config_file = Path(__file__).parent.parent / "accelerate_config.yaml"
with open(config_file, "w") as f:
f.write(accelerate_config)
# Check if uv is available
python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]
cmd = [
"accelerate", "launch",
"--config_file", str(config_file),
] + python_cmd + [
"--config", config_path
]
print(f"Running command: {' '.join(cmd)}")
result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)
# Clean up config file
config_file.unlink(missing_ok=True)
return result
def launch_deepspeed_training(config_path, num_gpus):
"""Launch DeepSpeed training"""
print(f"Launching DeepSpeed training on {num_gpus} GPUs...")
setup_environment_for_strategy("deepspeed")
# Create DeepSpeed hostfile
hostfile = Path(__file__).parent.parent / "hostfile"
with open(hostfile, "w") as f:
f.write(f"localhost slots={num_gpus}\n")
# Check if uv is available
python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]
cmd = [
"deepspeed",
"--hostfile", str(hostfile),
"--num_gpus", str(num_gpus),
] + python_cmd + [
"--config", config_path,
"--deepspeed"
]
print(f"Running command: {' '.join(cmd)}")
result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)
# Clean up hostfile
hostfile.unlink(missing_ok=True)
return result
def main():
parser = argparse.ArgumentParser(description="Multi-GPU Progressive LLM Training")
parser.add_argument("--config", type=str, required=True,
help="Path to training configuration file")
parser.add_argument("--strategy", type=str, default="ddp",
choices=["ddp", "fsdp", "deepspeed"],
help="Multi-GPU strategy to use")
parser.add_argument("--num_gpus", type=int, default=None,
help="Number of GPUs to use (default: all available)")
parser.add_argument("--dry_run", action="store_true",
help="Print commands without executing")
args = parser.parse_args()
# Get GPU count
available_gpus = get_gpu_count()
if available_gpus == 0:
print("❌ No GPUs available!")
sys.exit(1)
num_gpus = args.num_gpus or available_gpus
if num_gpus > available_gpus:
print(f"❌ Requested {num_gpus} GPUs but only {available_gpus} available")
sys.exit(1)
# Check config file exists
if not Path(args.config).exists():
print(f"❌ Config file not found: {args.config}")
sys.exit(1)
print("Progressive LLM Training - Multi-GPU Launcher")
print("=" * 60)
print(f"Strategy: {args.strategy}")
print(f"GPUs: {num_gpus} / {available_gpus}")
print(f"Config: {args.config}")
print("=" * 60)
if args.dry_run:
print("DRY RUN - Commands that would be executed:")
# Show what would be run
if args.strategy == "ddp":
print("torchrun --nproc_per_node", num_gpus, "scripts/train_progressive.py")
elif args.strategy == "fsdp":
print("accelerate launch --config_file accelerate_config.yaml scripts/train_progressive.py")
elif args.strategy == "deepspeed":
print("deepspeed --num_gpus", num_gpus, "scripts/train_progressive.py")
return
# Launch training
if args.strategy == "ddp":
result = launch_ddp_training(args.config, num_gpus)
elif args.strategy == "fsdp":
result = launch_fsdp_training(args.config, num_gpus)
elif args.strategy == "deepspeed":
result = launch_deepspeed_training(args.config, num_gpus)
if result.returncode == 0:
print("✅ Training completed successfully!")
else:
print("❌ Training failed!")
sys.exit(result.returncode)
if __name__ == "__main__":
main()


@@ -6,6 +6,7 @@ Main training script for progressive reasoning model
 import sys
 import yaml
 import argparse
+import os
 from pathlib import Path
 
 # Add src to path
@@ -56,6 +57,18 @@ Examples:
         help="Load config and model but skip training (for testing)"
     )
 
+    parser.add_argument(
+        "--distributed",
+        action="store_true",
+        help="Enable distributed training"
+    )
+
+    parser.add_argument(
+        "--deepspeed",
+        action="store_true",
+        help="Enable DeepSpeed training"
+    )
+
     return parser.parse_args()
@@ -74,9 +87,34 @@ def load_config(config_path: str) -> dict:
     return config
 
 
+def setup_distributed_training():
+    """Setup distributed training environment"""
+    # Check if we're in a distributed environment
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        import torch.distributed as dist
+        import torch
+
+        # Initialize distributed training
+        if not dist.is_initialized():
+            dist.init_process_group(backend="nccl")
+
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        torch.cuda.set_device(local_rank)
+
+        print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}")
+        return True
+
+    return False
+
+
 def main():
     args = parse_args()
+
+    # Setup distributed training if requested
+    is_distributed = False
+    if args.distributed or args.deepspeed:
+        is_distributed = setup_distributed_training()
 
     print("Progressive LLM Training for 松尾研LLMコンペ2025")
     print("=" * 50)
@@ -95,11 +133,26 @@ def main():
         print(f"Error loading config: {e}")
         sys.exit(1)
 
+    # Add distributed/deepspeed flags to config
+    config["training_args"] = config.get("training_args", {})
+    if args.distributed:
+        config["training_args"]["distributed"] = True
+    if args.deepspeed:
+        config["training_args"]["deepspeed"] = True
+        # Add DeepSpeed config from main config
+        if "deepspeed" in config:
+            config["training_args"]["deepspeed_config"] = config["deepspeed"]
+
     # Print configuration info
     print(f"Experiment: {config['experiment']['name']}")
     print(f"Base model: {config['experiment']['base_model']}")
     print(f"Output directory: {config['experiment']['output_dir']}")
     print(f"Stages: {len(config['progressive_stages'])}")
+    if is_distributed:
+        print("Mode: Distributed Training")
+        if args.deepspeed:
+            print("Backend: DeepSpeed")
+
+    print("=" * 50)
 
     # Prepare sample datasets if requested
     if args.prepare_data:


@@ -367,27 +367,55 @@ class ProgressiveTrainer:
         print(f"Final dataset size: {len(dataset)} examples")
 
-        # Training arguments - with CPU offload optimizations
-        training_args = TrainingArguments(
-            output_dir=f"./outputs/checkpoints/{stage_name}",
-            num_train_epochs=stage_config["training"]["num_epochs"],
-            per_device_train_batch_size=stage_config["training"]["per_device_batch_size"],
-            gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"],
-            learning_rate=float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
-            warmup_steps=stage_config["training"]["warmup_steps"],
-            logging_steps=stage_config["training"].get("logging_steps", 10),
-            save_strategy="epoch",
-            eval_strategy="no",
-            bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
-            gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False),
-            max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0),
-            report_to="wandb" if self.config["experiment"]["use_wandb"] else "none",
-            run_name=f"{self.config['experiment']['name']}_{stage_name}",
-            dataloader_pin_memory=False,  # Reduce memory usage
-            remove_unused_columns=False,  # Keep all columns
-            optim=stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
-            dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2),
-        )
+        # Training arguments - with multi-GPU and CPU offload optimizations
+        training_args_dict = {
+            "output_dir": f"./outputs/checkpoints/{stage_name}",
+            "num_train_epochs": stage_config["training"]["num_epochs"],
+            "per_device_train_batch_size": stage_config["training"]["per_device_batch_size"],
+            "gradient_accumulation_steps": stage_config["training"]["gradient_accumulation_steps"],
+            "learning_rate": float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
+            "warmup_steps": stage_config["training"]["warmup_steps"],
+            "logging_steps": stage_config["training"].get("logging_steps", 10),
+            "save_strategy": "epoch",
+            "eval_strategy": "no",
+            "bf16": torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
+            "gradient_checkpointing": self.config["model"].get("gradient_checkpointing", False),
+            "max_grad_norm": stage_config["training"].get("max_grad_norm", 1.0),
+            "report_to": "wandb" if self.config["experiment"]["use_wandb"] else "none",
+            "run_name": f"{self.config['experiment']['name']}_{stage_name}",
+            "dataloader_pin_memory": stage_config["training"].get("dataloader_pin_memory", False),
+            "remove_unused_columns": False,  # Keep all columns
+            "optim": stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
+            "dataloader_num_workers": stage_config["training"].get("dataloader_num_workers", 2),
+        }
+
+        # Add multi-GPU specific settings
+        if self.config.get("training_args", {}).get("distributed", False):
+            training_args_dict.update({
+                "ddp_find_unused_parameters": False,
+                "ddp_bucket_cap_mb": 200,
+                "ddp_broadcast_buffers": False,
+            })
+
+        # Add DeepSpeed configuration
+        if self.config.get("training_args", {}).get("deepspeed", False):
+            deepspeed_config = self.config.get("training_args", {}).get("deepspeed_config")
+            if deepspeed_config:
+                training_args_dict["deepspeed"] = deepspeed_config
+
+        # Add FSDP configuration
+        if "fsdp" in self.config:
+            fsdp_config = self.config["fsdp"]
+            training_args_dict.update({
+                "fsdp": fsdp_config.get("fsdp_sharding_strategy", "FULL_SHARD"),
+                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap"),
+                "fsdp_auto_wrap_policy": fsdp_config.get("fsdp_auto_wrap_policy", "TRANSFORMER_BASED_WRAP"),
+                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1000000),
+                "fsdp_cpu_ram_efficient_loading": fsdp_config.get("fsdp_cpu_offload", False),
+                "fsdp_sync_module_states": fsdp_config.get("fsdp_sync_module_states", True),
+            })
+
+        training_args = TrainingArguments(**training_args_dict)
 
         # Print dataset info for debugging
         print(f"Dataset columns: {dataset.column_names}")

7
uv.lock generated Normal file

@@ -0,0 +1,7 @@
# This file is automatically @generated by uv.
# It is not intended for manual editing.
version = 1
requires-python = ">=3.9"
# Note: This is a placeholder lock file.
# Run `uv lock` to generate the actual lock file with resolved dependencies.