From 5ca971b0a4bfdca54efb6e13857aa0e2c47a3301 Mon Sep 17 00:00:00 2001
From: Soma Nakamura
Date: Thu, 10 Jul 2025 22:25:11 +0900
Subject: [PATCH] Add 8-GPU multi-GPU training support (DDP, FSDP, DeepSpeed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.env.example | 13 +
.gitignore | 35 +--
.python-version | 1 +
README.md | 43 ++++
config/training_config_8gpu.yaml | 110 +++++++++
config/training_config_8gpu_deepspeed.yaml | 138 +++++++++++
config/training_config_8gpu_fsdp.yaml | 113 +++++++++
.../training_config_gemma3_1b_8gpu_ddp.yaml | 109 +++++++++
...ining_config_gemma3_1b_8gpu_deepspeed.yaml | 139 +++++++++++
.../training_config_gemma3_1b_8gpu_fsdp.yaml | 113 +++++++++
docs/README.md | 37 +++
pyproject.toml | 26 ++
requirements.txt | 7 +-
scripts/debug_model_loading.py | 201 ++++++++++++++++
scripts/train_gemma3_1b_8gpu.sh | 161 +++++++++++++
scripts/train_multi_gpu.py | 224 ++++++++++++++++++
scripts/train_progressive.py | 53 +++++
src/training.py | 70 ++++--
uv.lock | 7 +
19 files changed, 1559 insertions(+), 41 deletions(-)
create mode 100644 .env.example
create mode 100644 .python-version
create mode 100644 README.md
create mode 100644 config/training_config_8gpu.yaml
create mode 100644 config/training_config_8gpu_deepspeed.yaml
create mode 100644 config/training_config_8gpu_fsdp.yaml
create mode 100644 config/training_config_gemma3_1b_8gpu_ddp.yaml
create mode 100644 config/training_config_gemma3_1b_8gpu_deepspeed.yaml
create mode 100644 config/training_config_gemma3_1b_8gpu_fsdp.yaml
create mode 100644 docs/README.md
create mode 100644 pyproject.toml
create mode 100644 scripts/debug_model_loading.py
create mode 100755 scripts/train_gemma3_1b_8gpu.sh
create mode 100755 scripts/train_multi_gpu.py
create mode 100644 uv.lock
diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..a7fa5ef
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,13 @@
+# Environment variables for Progressive LLM Training
+# Copy this file to .env and fill in your values
+
+# HuggingFace
+HF_TOKEN=your_token_here
+
+# Weights & Biases
+WANDB_API_KEY=your_api_key_here
+WANDB_PROJECT=matsuo-llm-comp-2025
+
+# GPU Configuration
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+NCCL_DEBUG=WARN
\ No newline at end of file
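
These variables are read from the process environment at runtime. As a minimal, illustrative sketch (not part of this patch) of loading a `.env` file with only the standard library and checking the keys listed above:

```python
# Illustrative only: load KEY=VALUE pairs from .env into os.environ.
# The variable names (HF_TOKEN, WANDB_API_KEY, ...) come from .env.example above;
# the loader itself is an assumption, not code from this repository.
import os
from pathlib import Path

def load_dotenv(path: str = ".env") -> None:
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        os.environ.setdefault(key.strip(), value.strip())

load_dotenv()
for required in ("HF_TOKEN", "WANDB_API_KEY"):
    if not os.environ.get(required):
        print(f"Warning: {required} is not set")
```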
diff --git a/.gitignore b/.gitignore
index 20b8504..d8c85a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,32 +1,35 @@
# Python
__pycache__/
*.py[cod]
-*$py.class
-*.so
-.Python
-venv/
-ENV/
-env/
.venv/
+venv/
-# Nix
-result
-result-*
-
-# Project specific
+# Training outputs
outputs/
data/
-*.log
+!data/basic_cot/train.jsonl
wandb/
-.ipynb_checkpoints/
+*.log
+
+# Model files
*.pt
*.pth
*.bin
*.safetensors
+# Temporary
+*.tmp
+.cache/
+accelerate_config.yaml
+hostfile
+
# IDE
.vscode/
.idea/
-*.swp
-*.swo
-*~
\ No newline at end of file
+
+# OS
+.DS_Store
+*~
+
+# Keep lock files
+!uv.lock
\ No newline at end of file
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..902b2c9
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.11
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9ea360c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# Progressive LLM Training
+
+Progressive training for LLMs with 8-GPU support, developed for 松尾研LLMコンペ2025.
+
+## Quick Start
+
+```bash
+# Install uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Setup project
+git clone <repository-url>
+cd progressive-llm-training
+uv sync
+
+# Start training
+uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
+./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
+```
+
+## Training Stages
+
+1. **basic_cot** - Basic reasoning
+2. **math_reasoning** - Math with OpenR1-Math-220k
+3. **complex_reasoning** - Complex reasoning with Mixture-of-Thoughts
+
+## Commands
+
+```bash
+uv sync # Install dependencies
+uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml # Single GPU
+./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed # 8 GPUs
+uv run pytest # Run tests
+```
+
+## Key Files
+
+- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8-GPU config
+- `scripts/train_progressive.py` - Main training script
+- `scripts/train_gemma3_1b_8gpu.sh` - 8-GPU launcher
+- `src/progressive_model.py` - Core model implementation
+
+Ready to train! 🚀
\ No newline at end of file
diff --git a/config/training_config_8gpu.yaml b/config/training_config_8gpu.yaml
new file mode 100644
index 0000000..c532196
--- /dev/null
+++ b/config/training_config_8gpu.yaml
@@ -0,0 +1,110 @@
+experiment:
+ name: "progressive_reasoning_8gpu"
+ base_model: "google/gemma-2-2b-it" # Can scale up to larger models
+ output_dir: "./outputs"
+ use_wandb: true
+ wandb_project: "matsuo-llm-comp-2025"
+
+model:
+ load_in_4bit: false # Can use FP16/BF16 with multiple GPUs
+ bnb_4bit_compute_dtype: "bfloat16"
+ bnb_4bit_use_double_quant: true
+ device_map: "balanced" # Distribute across all GPUs
+ gradient_checkpointing: true
+ use_flash_attention_2: true # Enable if available for speed
+ use_eager_attention: false
+
+# Multi-GPU specific settings
+distributed:
+ strategy: "ddp" # Distributed Data Parallel
+ find_unused_parameters: false
+ gradient_as_bucket_view: true
+
+progressive_stages:
+ - name: "basic_cot"
+ description: "Basic Chain-of-Thought reasoning"
+ dataset_path: "./data/basic_cot/"
+ adapter_config:
+ r: 32 # Larger rank since we have more memory
+ lora_alpha: 64
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 2
+ per_device_batch_size: 16 # Large batch size per GPU
+ gradient_accumulation_steps: 1 # No need for accumulation with 8 GPUs
+ learning_rate: 5e-4
+ warmup_steps: 100
+ max_length: 2048
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 50
+ logging_steps: 10
+ dataloader_num_workers: 4 # More workers for data loading
+ dataloader_pin_memory: true
+ remove_unused_columns: false
+
+ - name: "math_reasoning"
+ description: "Mathematical reasoning with OpenR1-Math-220k dataset"
+ dataset_path: "open-r1/OpenR1-Math-220k"
+ inherit_from: "basic_cot"
+ adapter_config:
+ r: 64
+ lora_alpha: 128
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 8 # Reduce for larger model
+ gradient_accumulation_steps: 2
+ learning_rate: 3e-4
+ warmup_steps: 200
+ max_length: 4096
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 100
+ logging_steps: 20
+ dataloader_num_workers: 4
+ dataset_config:
+ streaming: true
+ max_samples: 100000 # Can process more with 8 GPUs
+ split: "train"
+
+ - name: "complex_reasoning"
+ description: "Complex multi-step reasoning with Mixture-of-Thoughts"
+ dataset_path: "open-r1/Mixture-of-Thoughts"
+ inherit_from: "math_reasoning"
+ adapter_config:
+ r: 128 # Maximum rank with multi-GPU
+ lora_alpha: 256
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 4
+ gradient_accumulation_steps: 4
+ learning_rate: 2e-4
+ warmup_steps: 300
+ max_length: 8192
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 200
+ logging_steps: 50
+ dataloader_num_workers: 4
+ dataset_config:
+ streaming: true
+ max_samples: 50000
+ split: "train"
+
+evaluation:
+ benchmarks:
+ - "HLE"
+ - "Do-Not-Answer"
+ save_results: true
+ results_dir: "./outputs/evaluation_results"
\ No newline at end of file
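
For reference, the two keys under `distributed:` above correspond directly to constructor arguments of `torch.nn.parallel.DistributedDataParallel`. A minimal sketch of that mapping, assuming the process group is already initialized (the HF Trainer performs this wrapping itself; the sketch is only illustrative):

```python
# Sketch: how distributed.find_unused_parameters and distributed.gradient_as_bucket_view
# map onto DDP constructor arguments. `model` is any torch.nn.Module.
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_ddp(model: torch.nn.Module, local_rank: int) -> DDP:
    assert dist.is_initialized(), "call dist.init_process_group('nccl') first"
    return DDP(
        model.to(local_rank),
        device_ids=[local_rank],
        find_unused_parameters=False,   # distributed.find_unused_parameters
        gradient_as_bucket_view=True,   # distributed.gradient_as_bucket_view
    )
```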
diff --git a/config/training_config_8gpu_deepspeed.yaml b/config/training_config_8gpu_deepspeed.yaml
new file mode 100644
index 0000000..248e4dc
--- /dev/null
+++ b/config/training_config_8gpu_deepspeed.yaml
@@ -0,0 +1,138 @@
+experiment:
+ name: "progressive_reasoning_8gpu_deepspeed"
+ base_model: "google/gemma-2-2b-it"
+ output_dir: "./outputs"
+ use_wandb: true
+ wandb_project: "matsuo-llm-comp-2025"
+
+model:
+ load_in_4bit: false
+ device_map: null # Let DeepSpeed handle device placement
+ gradient_checkpointing: true
+ use_flash_attention_2: true
+ use_eager_attention: false
+
+# DeepSpeed Configuration
+deepspeed:
+ zero_optimization:
+ stage: 2 # ZeRO Stage 2 (partition optimizer states and gradients)
+ allgather_partitions: true
+ allgather_bucket_size: 200000000
+ overlap_comm: true
+ reduce_scatter: true
+ reduce_bucket_size: 200000000
+ contiguous_gradients: true
+ cpu_offload: false # Keep on GPU for speed
+
+ optimizer:
+ type: "AdamW"
+ params:
+ lr: 3e-4
+ betas: [0.9, 0.999]
+ eps: 1e-8
+ weight_decay: 0.001
+
+ scheduler:
+ type: "WarmupLR"
+ params:
+ warmup_min_lr: 0
+ warmup_max_lr: 3e-4
+ warmup_num_steps: 200
+
+ fp16:
+ enabled: false
+
+ bf16:
+ enabled: true
+
+ gradient_clipping: 1.0
+
+ train_batch_size: 512 # Total batch size across all GPUs
+ train_micro_batch_size_per_gpu: 64 # Per-GPU batch size
+
+progressive_stages:
+ - name: "basic_cot"
+ description: "Basic Chain-of-Thought reasoning"
+ dataset_path: "./data/basic_cot/"
+ adapter_config:
+ r: 64
+ lora_alpha: 128
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 2
+ per_device_batch_size: 64 # Large batch with DeepSpeed
+ gradient_accumulation_steps: 1
+ learning_rate: 5e-4
+ warmup_steps: 100
+ max_length: 2048
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 50
+ logging_steps: 10
+ dataloader_num_workers: 8
+
+ - name: "math_reasoning"
+ description: "Mathematical reasoning with OpenR1-Math-220k dataset"
+ dataset_path: "open-r1/OpenR1-Math-220k"
+ inherit_from: "basic_cot"
+ adapter_config:
+ r: 128
+ lora_alpha: 256
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 32
+ gradient_accumulation_steps: 1
+ learning_rate: 3e-4
+ warmup_steps: 200
+ max_length: 4096
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 100
+ logging_steps: 20
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 200000
+ split: "train"
+
+ - name: "complex_reasoning"
+ description: "Complex multi-step reasoning with Mixture-of-Thoughts"
+ dataset_path: "open-r1/Mixture-of-Thoughts"
+ inherit_from: "math_reasoning"
+ adapter_config:
+ r: 256
+ lora_alpha: 512
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 16
+ gradient_accumulation_steps: 2
+ learning_rate: 2e-4
+ warmup_steps: 300
+ max_length: 8192
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 200
+ logging_steps: 50
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 100000
+ split: "train"
+
+evaluation:
+ benchmarks:
+ - "HLE"
+ - "Do-Not-Answer"
+ save_results: true
+ results_dir: "./outputs/evaluation_results"
\ No newline at end of file
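
The `deepspeed:` block above is not read by DeepSpeed directly; `src/training.py` (changed later in this patch) loads it from the YAML and forwards it to the HF Trainer, which accepts either a config dict or a path to a JSON file. A minimal sketch of that flow, assuming the YAML layout shown above:

```python
# Sketch: extract the deepspeed: section from the YAML config and pass it on as a
# plain dict via TrainingArguments(deepspeed=...), mirroring the src/training.py change.
import yaml

with open("config/training_config_8gpu_deepspeed.yaml") as f:
    cfg = yaml.safe_load(f)

ds_config = cfg["deepspeed"]            # ZeRO stage 2, bf16, batch sizes, optimizer, scheduler
training_args_dict = {
    "output_dir": "./outputs/checkpoints/basic_cot",
    "bf16": True,
    "deepspeed": ds_config,             # a dict or a ds_config.json path both work here
}
print(sorted(ds_config.keys()))
```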
diff --git a/config/training_config_8gpu_fsdp.yaml b/config/training_config_8gpu_fsdp.yaml
new file mode 100644
index 0000000..62d06cf
--- /dev/null
+++ b/config/training_config_8gpu_fsdp.yaml
@@ -0,0 +1,113 @@
+experiment:
+ name: "progressive_reasoning_8gpu_fsdp"
+ base_model: "google/gemma-2-2b-it" # Can scale to much larger models with FSDP
+ output_dir: "./outputs"
+ use_wandb: true
+ wandb_project: "matsuo-llm-comp-2025"
+
+model:
+ load_in_4bit: false
+ device_map: null # Let FSDP handle device placement
+ gradient_checkpointing: true
+ use_flash_attention_2: true
+ use_eager_attention: false
+
+# FSDP Configuration
+fsdp:
+ fsdp_transformer_layer_cls_to_wrap: "Gemma2DecoderLayer" # Wrap at layer level
+ fsdp_sharding_strategy: "FULL_SHARD" # Shard parameters, gradients, and optimizer states
+ fsdp_cpu_offload: false # Keep on GPU for speed
+ fsdp_mixed_precision: true # Use BF16 mixed precision
+ fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
+ fsdp_min_num_params: 1000000 # Wrap layers with >1M parameters
+ fsdp_sync_module_states: true
+ fsdp_forward_prefetch: true
+ fsdp_use_orig_params: true # Important for LoRA compatibility
+
+progressive_stages:
+ - name: "basic_cot"
+ description: "Basic Chain-of-Thought reasoning"
+ dataset_path: "./data/basic_cot/"
+ adapter_config:
+ r: 64 # Can use larger ranks with FSDP
+ lora_alpha: 128
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 2
+ per_device_batch_size: 32 # Very large batch size with FSDP
+ gradient_accumulation_steps: 1
+ learning_rate: 5e-4
+ warmup_steps: 100
+ max_length: 2048
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 50
+ logging_steps: 10
+ dataloader_num_workers: 8
+ dataloader_pin_memory: true
+
+ - name: "math_reasoning"
+ description: "Mathematical reasoning with OpenR1-Math-220k dataset"
+ dataset_path: "open-r1/OpenR1-Math-220k"
+ inherit_from: "basic_cot"
+ adapter_config:
+ r: 128
+ lora_alpha: 256
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 16
+ gradient_accumulation_steps: 2
+ learning_rate: 3e-4
+ warmup_steps: 200
+ max_length: 4096
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 100
+ logging_steps: 20
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 200000 # Process even more data
+ split: "train"
+
+ - name: "complex_reasoning"
+ description: "Complex multi-step reasoning with Mixture-of-Thoughts"
+ dataset_path: "open-r1/Mixture-of-Thoughts"
+ inherit_from: "math_reasoning"
+ adapter_config:
+ r: 256 # Very large rank possible with FSDP
+ lora_alpha: 512
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 8
+ gradient_accumulation_steps: 4
+ learning_rate: 2e-4
+ warmup_steps: 300
+ max_length: 8192
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 200
+ logging_steps: 50
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 100000
+ split: "train"
+
+evaluation:
+ benchmarks:
+ - "HLE"
+ - "Do-Not-Answer"
+ save_results: true
+ results_dir: "./outputs/evaluation_results"
\ No newline at end of file
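
The `fsdp:` keys above describe FULL_SHARD sharding with transformer-layer auto-wrapping, BF16 mixed precision, and `use_orig_params` for LoRA compatibility. As an illustration of what those options mean at the PyTorch level (the Trainer/Accelerate integration performs this wrapping internally; the sketch is not part of the patch, and the decoder-layer class is a placeholder):

```python
# Sketch: FULL_SHARD + transformer auto-wrap expressed with raw torch FSDP APIs.
import functools
import torch
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    MixedPrecision,
    ShardingStrategy,
)
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy

def wrap_with_fsdp(model: torch.nn.Module, decoder_layer_cls: type) -> FSDP:
    auto_wrap = functools.partial(
        transformer_auto_wrap_policy, transformer_layer_cls={decoder_layer_cls}
    )
    return FSDP(
        model,
        sharding_strategy=ShardingStrategy.FULL_SHARD,        # fsdp_sharding_strategy
        auto_wrap_policy=auto_wrap,                           # fsdp_auto_wrap_policy / cls_to_wrap
        mixed_precision=MixedPrecision(param_dtype=torch.bfloat16,
                                       reduce_dtype=torch.bfloat16),  # fsdp_mixed_precision
        forward_prefetch=True,                                # fsdp_forward_prefetch
        sync_module_states=True,                              # fsdp_sync_module_states
        use_orig_params=True,                                 # fsdp_use_orig_params (LoRA)
    )
```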
diff --git a/config/training_config_gemma3_1b_8gpu_ddp.yaml b/config/training_config_gemma3_1b_8gpu_ddp.yaml
new file mode 100644
index 0000000..847acfb
--- /dev/null
+++ b/config/training_config_gemma3_1b_8gpu_ddp.yaml
@@ -0,0 +1,109 @@
+experiment:
+ name: "progressive_reasoning_gemma3_1b_8gpu_ddp"
+ base_model: "google/gemma-3-1b-pt"
+ output_dir: "./outputs"
+ use_wandb: true
+ wandb_project: "matsuo-llm-comp-2025"
+
+model:
+ load_in_4bit: false # Can use FP16/BF16 with multiple GPUs
+ bnb_4bit_compute_dtype: "bfloat16"
+ bnb_4bit_use_double_quant: true
+ device_map: "balanced" # Distribute across all GPUs
+ gradient_checkpointing: true
+ use_flash_attention_2: false
+ use_eager_attention: true
+
+# Multi-GPU specific settings
+distributed:
+ strategy: "ddp" # Distributed Data Parallel
+ find_unused_parameters: false
+ gradient_as_bucket_view: true
+
+progressive_stages:
+ - name: "basic_cot"
+ description: "Basic Chain-of-Thought reasoning"
+ dataset_path: "./data/basic_cot/"
+ adapter_config:
+ r: 16 # Moderate rank for DDP
+ lora_alpha: 32
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 2
+ per_device_batch_size: 32 # 32 * 8 = 256 total batch size
+ gradient_accumulation_steps: 1
+ learning_rate: 5e-4
+ warmup_steps: 100
+ max_length: 1024
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 50
+ logging_steps: 10
+ dataloader_num_workers: 4
+ dataloader_pin_memory: true
+
+ - name: "math_reasoning"
+ description: "Mathematical reasoning with OpenR1-Math-220k dataset"
+ dataset_path: "open-r1/OpenR1-Math-220k"
+ inherit_from: "basic_cot"
+ adapter_config:
+ r: 32
+ lora_alpha: 64
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 16 # 16 * 8 = 128 total batch size
+ gradient_accumulation_steps: 2
+ learning_rate: 3e-4
+ warmup_steps: 200
+ max_length: 2048
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 100
+ logging_steps: 20
+ dataloader_num_workers: 4
+ dataset_config:
+ streaming: true
+ max_samples: 400000 # Process substantial data
+ split: "train"
+
+ - name: "complex_reasoning"
+ description: "Complex multi-step reasoning with Mixture-of-Thoughts"
+ dataset_path: "open-r1/Mixture-of-Thoughts"
+ inherit_from: "math_reasoning"
+ adapter_config:
+ r: 64
+ lora_alpha: 128
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 8 # 8 * 8 = 64 total batch size
+ gradient_accumulation_steps: 4
+ learning_rate: 2e-4
+ warmup_steps: 300
+ max_length: 4096
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 200
+ logging_steps: 50
+ dataloader_num_workers: 4
+ dataset_config:
+ streaming: true
+ max_samples: 600000
+ split: "train"
+
+evaluation:
+ benchmarks:
+ - "HLE"
+ - "Do-Not-Answer"
+ save_results: true
+ results_dir: "./outputs/evaluation_results"
\ No newline at end of file
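
The per-stage comments above quote the per-step global batch (per-device batch × 8 GPUs). Once gradient accumulation is included, the effective batch seen by the optimizer works out to the same 256 for all three stages; a quick check using the values from this file:

```python
# Effective optimizer batch = per_device_batch_size * gradient_accumulation_steps * num_gpus,
# with the stage values from training_config_gemma3_1b_8gpu_ddp.yaml above.
NUM_GPUS = 8
stages = {
    "basic_cot":          (32, 1),   # (per_device_batch_size, gradient_accumulation_steps)
    "math_reasoning":     (16, 2),
    "complex_reasoning":  (8, 4),
}
for name, (per_device, accum) in stages.items():
    print(f"{name}: {per_device * accum * NUM_GPUS}")   # 256 for every stage
```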
diff --git a/config/training_config_gemma3_1b_8gpu_deepspeed.yaml b/config/training_config_gemma3_1b_8gpu_deepspeed.yaml
new file mode 100644
index 0000000..521fdc5
--- /dev/null
+++ b/config/training_config_gemma3_1b_8gpu_deepspeed.yaml
@@ -0,0 +1,139 @@
+experiment:
+ name: "progressive_reasoning_gemma3_1b_8gpu_deepspeed"
+ base_model: "google/gemma-3-1b-pt"
+ output_dir: "./outputs"
+ use_wandb: true
+ wandb_project: "matsuo-llm-comp-2025"
+
+model:
+ load_in_4bit: false # Disable quantization for DeepSpeed
+ device_map: null # Let DeepSpeed handle device placement
+ gradient_checkpointing: true # Enable for memory efficiency
+ use_flash_attention_2: false
+ use_eager_attention: true
+
+# DeepSpeed Configuration
+deepspeed:
+ zero_optimization:
+ stage: 2 # ZeRO Stage 2 (partition optimizer states and gradients)
+ allgather_partitions: true
+ allgather_bucket_size: 500000000 # 500MB buckets
+ overlap_comm: true
+ reduce_scatter: true
+ reduce_bucket_size: 500000000
+ contiguous_gradients: true
+ cpu_offload: false # Keep on GPU for speed with small model
+
+ optimizer:
+ type: "AdamW"
+ params:
+ lr: 5e-4
+ betas: [0.9, 0.999]
+ eps: 1e-8
+ weight_decay: 0.001
+
+ scheduler:
+ type: "WarmupLR"
+ params:
+ warmup_min_lr: 0
+ warmup_max_lr: 5e-4
+ warmup_num_steps: 100
+
+ fp16:
+ enabled: false
+
+ bf16:
+ enabled: true
+
+ gradient_clipping: 1.0
+
+ train_batch_size: 512 # Total batch size across all GPUs
+ train_micro_batch_size_per_gpu: 64 # Per-GPU batch size
+
+progressive_stages:
+ - name: "basic_cot"
+ description: "Basic Chain-of-Thought reasoning"
+ dataset_path: "./data/basic_cot/"
+ adapter_config:
+ r: 32 # Larger rank with 8 GPUs
+ lora_alpha: 64
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 2
+ per_device_batch_size: 64 # Large batch with DeepSpeed
+ gradient_accumulation_steps: 1 # No accumulation needed
+ learning_rate: 5e-4
+ warmup_steps: 100
+ max_length: 1024
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 50
+ logging_steps: 10
+ dataloader_num_workers: 8
+ dataloader_pin_memory: true
+
+ - name: "math_reasoning"
+ description: "Mathematical reasoning with OpenR1-Math-220k dataset"
+ dataset_path: "open-r1/OpenR1-Math-220k"
+ inherit_from: "basic_cot"
+ adapter_config:
+ r: 64 # Larger rank for math reasoning
+ lora_alpha: 128
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 32 # Reduce for longer sequences
+ gradient_accumulation_steps: 1
+ learning_rate: 3e-4
+ warmup_steps: 200
+ max_length: 2048
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 100
+ logging_steps: 20
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 500000 # Process more data with 8 GPUs
+ split: "train"
+
+ - name: "complex_reasoning"
+ description: "Complex multi-step reasoning with Mixture-of-Thoughts"
+ dataset_path: "open-r1/Mixture-of-Thoughts"
+ inherit_from: "math_reasoning"
+ adapter_config:
+ r: 128 # Maximum rank for complex reasoning
+ lora_alpha: 256
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 16 # Reduce for very long sequences
+ gradient_accumulation_steps: 2
+ learning_rate: 2e-4
+ warmup_steps: 300
+ max_length: 4096
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 200
+ logging_steps: 50
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 800000 # Process even more data
+ split: "train"
+
+evaluation:
+ benchmarks:
+ - "HLE"
+ - "Do-Not-Answer"
+ save_results: true
+ results_dir: "./outputs/evaluation_results"
\ No newline at end of file
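
DeepSpeed requires `train_batch_size == train_micro_batch_size_per_gpu × gradient_accumulation_steps × world_size`. With the values above (512 total, 64 per GPU, 8 GPUs) the implied accumulation is 1, which matches the first stage; a quick check:

```python
# Sanity check for the DeepSpeed batch settings above (values copied from the YAML).
world_size = 8
train_batch_size = 512
micro_batch_per_gpu = 64

grad_accum, remainder = divmod(train_batch_size, micro_batch_per_gpu * world_size)
assert remainder == 0, "train_batch_size must be divisible by micro_batch * world_size"
print(f"implied gradient_accumulation_steps = {grad_accum}")   # 1
```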
diff --git a/config/training_config_gemma3_1b_8gpu_fsdp.yaml b/config/training_config_gemma3_1b_8gpu_fsdp.yaml
new file mode 100644
index 0000000..29bd242
--- /dev/null
+++ b/config/training_config_gemma3_1b_8gpu_fsdp.yaml
@@ -0,0 +1,113 @@
+experiment:
+ name: "progressive_reasoning_gemma3_1b_8gpu_fsdp"
+ base_model: "google/gemma-3-1b-pt"
+ output_dir: "./outputs"
+ use_wandb: true
+ wandb_project: "matsuo-llm-comp-2025"
+
+model:
+ load_in_4bit: false
+ device_map: null # Let FSDP handle device placement
+ gradient_checkpointing: true
+ use_flash_attention_2: false
+ use_eager_attention: true
+
+# FSDP Configuration
+fsdp:
+ fsdp_transformer_layer_cls_to_wrap: "GemmaDecoderLayer" # Wrap at layer level
+ fsdp_sharding_strategy: "FULL_SHARD" # Shard parameters, gradients, and optimizer states
+ fsdp_cpu_offload: false # Keep on GPU for speed with small model
+ fsdp_mixed_precision: true # Use BF16 mixed precision
+ fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
+ fsdp_min_num_params: 1000000 # Wrap layers with >1M parameters
+ fsdp_sync_module_states: true
+ fsdp_forward_prefetch: true
+ fsdp_use_orig_params: true # Important for LoRA compatibility
+
+progressive_stages:
+ - name: "basic_cot"
+ description: "Basic Chain-of-Thought reasoning"
+ dataset_path: "./data/basic_cot/"
+ adapter_config:
+ r: 32 # Can use larger ranks with FSDP
+ lora_alpha: 64
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 2
+ per_device_batch_size: 48 # Very large batch size with FSDP
+ gradient_accumulation_steps: 1
+ learning_rate: 5e-4
+ warmup_steps: 100
+ max_length: 1024
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 50
+ logging_steps: 10
+ dataloader_num_workers: 8
+ dataloader_pin_memory: true
+
+ - name: "math_reasoning"
+ description: "Mathematical reasoning with OpenR1-Math-220k dataset"
+ dataset_path: "open-r1/OpenR1-Math-220k"
+ inherit_from: "basic_cot"
+ adapter_config:
+ r: 64
+ lora_alpha: 128
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 24
+ gradient_accumulation_steps: 2
+ learning_rate: 3e-4
+ warmup_steps: 200
+ max_length: 2048
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 100
+ logging_steps: 20
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 600000 # Process even more data with FSDP
+ split: "train"
+
+ - name: "complex_reasoning"
+ description: "Complex multi-step reasoning with Mixture-of-Thoughts"
+ dataset_path: "open-r1/Mixture-of-Thoughts"
+ inherit_from: "math_reasoning"
+ adapter_config:
+ r: 128 # Very large rank possible with FSDP
+ lora_alpha: 256
+ lora_dropout: 0.1
+ target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ init_lora_weights: true
+ training:
+ num_epochs: 1
+ per_device_batch_size: 12
+ gradient_accumulation_steps: 4
+ learning_rate: 2e-4
+ warmup_steps: 300
+ max_length: 4096
+ bf16: true
+ max_grad_norm: 1.0
+ weight_decay: 0.001
+ save_steps: 200
+ logging_steps: 50
+ dataloader_num_workers: 8
+ dataset_config:
+ streaming: true
+ max_samples: 1000000 # Can process 1M samples efficiently
+ split: "train"
+
+evaluation:
+ benchmarks:
+ - "HLE"
+ - "Do-Not-Answer"
+ save_results: true
+ results_dir: "./outputs/evaluation_results"
\ No newline at end of file
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..6581458
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,37 @@
+# Progressive LLM Training Documentation
+
+## Setup
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+uv sync
+```
+
+## Training
+
+### Single GPU
+```bash
+uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
+```
+
+### 8 GPUs
+```bash
+./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
+```
+
+## Configuration
+
+- `config/training_config_gemma3_1b.yaml` - Single GPU
+- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8 GPUs
+
+## Environment
+
+Copy `.env.example` to `.env` and set:
+- `HF_TOKEN` - HuggingFace token
+- `WANDB_API_KEY` - W&B API key
+
+## Troubleshooting
+
+- Out-of-memory errors: reduce `per_device_batch_size`
+- NCCL errors: `export NCCL_DEBUG=INFO` for verbose logs
+- Check GPU visibility and utilization with `nvidia-smi`
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..573bccd
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,26 @@
+[project]
+name = "progressive-llm-training"
+version = "0.1.0"
+description = "Progressive LLM Training for 松尾研LLMコンペ2025"
+requires-python = ">=3.9"
+
+dependencies = [
+ "torch>=2.0.0",
+ "transformers>=4.40.0",
+ "accelerate>=0.27.0",
+ "peft>=0.11.0",
+ "trl>=0.9.0",
+ "datasets>=2.18.0",
+ "bitsandbytes>=0.43.0",
+ "wandb>=0.16.0",
+ "pyyaml>=6.0",
+ "jsonlines>=4.0.0",
+ "deepspeed>=0.12.0",
+]
+
+[project.optional-dependencies]
+dev = ["pytest", "black", "isort"]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 534ab7a..f521a17 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+# Use uv instead: uv sync
+torch>=2.0.0
transformers>=4.40.0
accelerate>=0.27.0
peft>=0.11.0
@@ -7,7 +9,4 @@ bitsandbytes>=0.43.0
wandb>=0.16.0
pyyaml>=6.0
jsonlines>=4.0.0
-scikit-learn>=1.3.0
-# flash-attn>=2.5.0 # Install separately with --no-build-isolation
-sentencepiece>=0.2.0
-protobuf>=4.25.0
+deepspeed>=0.12.0
diff --git a/scripts/debug_model_loading.py b/scripts/debug_model_loading.py
new file mode 100644
index 0000000..ad39c10
--- /dev/null
+++ b/scripts/debug_model_loading.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""
+Debug script to identify model loading issues
+"""
+
+import sys
+import os
+import torch
+from pathlib import Path
+
+# Add src to path
+sys.path.append(str(Path(__file__).parent.parent))
+
+def clear_accelerate_env():
+ """Clear all ACCELERATE environment variables"""
+ print("Clearing ACCELERATE environment variables...")
+ env_vars_to_clear = []
+ for key in os.environ:
+ if 'ACCELERATE' in key:
+ env_vars_to_clear.append(key)
+
+ for var in env_vars_to_clear:
+ print(f" Removing {var}={os.environ[var]}")
+ del os.environ[var]
+
+def test_basic_model_loading():
+ """Test basic model loading without any configuration"""
+ print("Testing basic model loading...")
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_name = "google/gemma-2-2b-it"
+
+ try:
+ print("Testing with absolutely minimal config...")
+ model = AutoModelForCausalLM.from_pretrained(
+ model_name,
+ trust_remote_code=True,
+ torch_dtype=torch.float32
+ )
+ print("✅ Basic loading successful!")
+ del model
+ return True
+ except Exception as e:
+ print(f"❌ Basic loading failed: {e}")
+ return False
+
+def test_with_device_map():
+ """Test with device_map auto"""
+ print("Testing with device_map='auto'...")
+
+ from transformers import AutoModelForCausalLM
+
+ model_name = "google/gemma-2-2b-it"
+
+ try:
+ model = AutoModelForCausalLM.from_pretrained(
+ model_name,
+ trust_remote_code=True,
+ torch_dtype=torch.float32,
+ device_map="auto"
+ )
+ print("✅ Device map loading successful!")
+ del model
+ return True
+ except Exception as e:
+ print(f"❌ Device map loading failed: {e}")
+ return False
+
+def test_with_quantization():
+ """Test with quantization"""
+ print("Testing with 4-bit quantization...")
+
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+
+ model_name = "google/gemma-2-2b-it"
+
+ try:
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=torch.bfloat16,
+ bnb_4bit_use_double_quant=True,
+ bnb_4bit_quant_type="nf4"
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+ model_name,
+ trust_remote_code=True,
+ quantization_config=bnb_config
+ )
+ print("✅ Quantization loading successful!")
+ del model
+ return True
+ except Exception as e:
+ print(f"❌ Quantization loading failed: {e}")
+ return False
+
+def print_environment_info():
+ """Print detailed environment information"""
+ print("\n" + "="*50)
+ print("ENVIRONMENT INFORMATION")
+ print("="*50)
+
+ # Python version
+ print(f"Python version: {sys.version}")
+
+ # PyTorch info
+ try:
+ import torch
+ print(f"PyTorch version: {torch.__version__}")
+ print(f"CUDA available: {torch.cuda.is_available()}")
+ if torch.cuda.is_available():
+ print(f"CUDA device count: {torch.cuda.device_count()}")
+ for i in range(torch.cuda.device_count()):
+ print(f" Device {i}: {torch.cuda.get_device_name(i)}")
+ print(f"CUDA version: {torch.version.cuda}")
+ except Exception as e:
+ print(f"PyTorch info error: {e}")
+
+ # Transformers info
+ try:
+ from transformers import __version__ as tf_version
+ print(f"Transformers version: {tf_version}")
+ except Exception as e:
+ print(f"Transformers info error: {e}")
+
+ # Accelerate info
+ try:
+ from accelerate import __version__ as acc_version
+ print(f"Accelerate version: {acc_version}")
+ except Exception as e:
+ print(f"Accelerate info error: {e}")
+
+ # PEFT info
+ try:
+ from peft import __version__ as peft_version
+ print(f"PEFT version: {peft_version}")
+ except Exception as e:
+ print(f"PEFT info error: {e}")
+
+ # BitsAndBytes info
+ try:
+ import bitsandbytes as bnb
+ print(f"BitsAndBytes version: {bnb.__version__}")
+ except Exception as e:
+ print(f"BitsAndBytes info error: {e}")
+
+ # Environment variables
+ print("\nRelevant environment variables:")
+ for key, value in sorted(os.environ.items()):
+ if any(prefix in key for prefix in ['CUDA', 'TORCH', 'HF_', 'ACCELERATE', 'TRANSFORMERS']):
+ print(f" {key}={value}")
+
+def main():
+ print("Progressive LLM Training - Model Loading Debug")
+ print("=" * 60)
+
+ # Print environment info first
+ print_environment_info()
+
+ # Clear environment variables
+ clear_accelerate_env()
+
+ # Test various loading methods
+ print("\n" + "="*50)
+ print("TESTING MODEL LOADING")
+ print("="*50)
+
+ results = []
+
+ # Test 1: Basic loading
+ results.append(("Basic loading", test_basic_model_loading()))
+
+ # Test 2: With device map
+ results.append(("Device map", test_with_device_map()))
+
+ # Test 3: With quantization
+ results.append(("Quantization", test_with_quantization()))
+
+ # Summary
+ print("\n" + "="*50)
+ print("SUMMARY")
+ print("="*50)
+
+ for test_name, success in results:
+ status = "✅ PASS" if success else "❌ FAIL"
+ print(f"{test_name}: {status}")
+
+ if any(result[1] for result in results):
+ print("\n✅ At least one loading method works!")
+ print("Use the successful method in your configuration.")
+ else:
+ print("\n❌ All loading methods failed!")
+ print("This indicates a fundamental environment issue.")
+ print("Consider:")
+ print("1. Reinstalling transformers, accelerate, torch")
+ print("2. Checking CUDA installation")
+ print("3. Using a different model")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/scripts/train_gemma3_1b_8gpu.sh b/scripts/train_gemma3_1b_8gpu.sh
new file mode 100755
index 0000000..0c4ca81
--- /dev/null
+++ b/scripts/train_gemma3_1b_8gpu.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}"
+echo "======================================================="
+
+# Check if uv is available
+if command -v uv &> /dev/null; then
+ echo -e "${GREEN}Using uv for Python environment management${NC}"
+ UV_PREFIX="uv run"
+else
+ echo -e "${YELLOW}uv not found, using standard python${NC}"
+ UV_PREFIX="python"
+fi
+
+# Default values
+STRATEGY="deepspeed"
+CONFIG=""
+NUM_GPUS=8
+DRY_RUN=false
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --strategy)
+ STRATEGY="$2"
+ shift 2
+ ;;
+ --config)
+ CONFIG="$2"
+ shift 2
+ ;;
+ --num-gpus)
+ NUM_GPUS="$2"
+ shift 2
+ ;;
+ --dry-run)
+ DRY_RUN=true
+ shift
+ ;;
+ -h|--help)
+ echo "Usage: $0 [options]"
+ echo ""
+ echo "Options:"
+ echo " --strategy Training strategy (default: deepspeed)"
+ echo " --config Custom config file (optional)"
+ echo " --num-gpus Number of GPUs to use (default: 8)"
+ echo " --dry-run Show command without executing"
+ echo ""
+ echo "Examples:"
+ echo " # Use DeepSpeed (recommended)"
+ echo " $0 --strategy deepspeed"
+ echo ""
+ echo " # Use DDP"
+ echo " $0 --strategy ddp"
+ echo ""
+ echo " # Use FSDP"
+ echo " $0 --strategy fsdp"
+ echo ""
+ echo " # Use custom config"
+ echo " $0 --strategy ddp --config config/my_config.yaml"
+ exit 0
+ ;;
+ *)
+ echo -e "${RED}Error: Unknown option $1${NC}"
+ exit 1
+ ;;
+ esac
+done
+
+# Check GPU availability
+GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"
+
+if [ $GPU_COUNT -lt $NUM_GPUS ]; then
+ echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
+ exit 1
+fi
+
+# Set default config based on strategy if not provided
+if [ -z "$CONFIG" ]; then
+ case $STRATEGY in
+ ddp)
+ CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
+ ;;
+ fsdp)
+ CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
+ ;;
+ deepspeed)
+ CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
+ ;;
+ *)
+ echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
+ exit 1
+ ;;
+ esac
+fi
+
+# Check if config file exists
+if [ ! -f "$CONFIG" ]; then
+ echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
+ exit 1
+fi
+
+echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
+echo -e "Config: ${YELLOW}$CONFIG${NC}"
+echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
+echo ""
+
+# Build the command
+CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"
+
+if [ "$DRY_RUN" = true ]; then
+ echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
+ echo "$CMD"
+ exit 0
+fi
+
+# Show GPU memory before training
+echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
+nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
+
+echo ""
+echo -e "${GREEN}Starting training...${NC}"
+echo "Command: $CMD"
+echo ""
+
+# Set environment variables for optimal performance
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export NCCL_DEBUG=WARN # Set to INFO for debugging
+export NCCL_ASYNC_ERROR_HANDLING=1
+
+# For DeepSpeed, set additional optimizations
+if [ "$STRATEGY" = "deepspeed" ]; then
+ export DS_SKIP_CUDA_CHECK=1
+ export TOKENIZERS_PARALLELISM=false
+fi
+
+# Execute the training command
+$CMD
+
+# Check exit status
+if [ $? -eq 0 ]; then
+ echo ""
+ echo -e "${GREEN}Training completed successfully!${NC}"
+
+ # Show GPU memory after training
+ echo ""
+ echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
+ nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
+else
+ echo ""
+ echo -e "${RED}Training failed!${NC}"
+ exit 1
+fi
\ No newline at end of file
diff --git a/scripts/train_multi_gpu.py b/scripts/train_multi_gpu.py
new file mode 100755
index 0000000..5cc278f
--- /dev/null
+++ b/scripts/train_multi_gpu.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+"""
+Multi-GPU training launcher for progressive reasoning model
+Supports DDP, FSDP, and DeepSpeed strategies
+"""
+
+import os
+import sys
+import argparse
+import subprocess
+import shutil
+import torch
+from pathlib import Path
+
+# Add src to path
+sys.path.append(str(Path(__file__).parent.parent))
+
+
+def get_gpu_count():
+ """Get the number of available GPUs"""
+ if torch.cuda.is_available():
+ return torch.cuda.device_count()
+ return 0
+
+
+def setup_environment_for_strategy(strategy):
+ """Set up environment variables for different strategies"""
+ if strategy == "deepspeed":
+ # DeepSpeed specific environment
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = "12355"
+ os.environ["RANK"] = "0"
+ os.environ["LOCAL_RANK"] = "0"
+ os.environ["WORLD_SIZE"] = str(get_gpu_count())
+ elif strategy in ["ddp", "fsdp"]:
+ # Standard distributed training environment
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = "12355"
+ # Let torchrun handle the rest
+
+    # Robust error handling and verbose NCCL/distributed logging (helpful when debugging)
+ os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
+ os.environ["NCCL_DEBUG"] = "INFO"
+ os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
+
+
+def launch_ddp_training(config_path, num_gpus):
+ """Launch DDP training using torchrun"""
+ print(f"Launching DDP training on {num_gpus} GPUs...")
+
+ setup_environment_for_strategy("ddp")
+
+    # Launch torchrun through uv when available so the project venv is used;
+    # torchrun must receive the training script path itself, not the uv binary.
+    launcher_prefix = ["uv", "run"] if shutil.which("uv") else []
+
+    cmd = launcher_prefix + [
+        "torchrun",
+        "--nproc_per_node", str(num_gpus),
+        "--master_port", "12355",
+        "scripts/train_progressive.py",
+        "--config", config_path,
+        "--distributed"
+    ]
+
+ print(f"Running command: {' '.join(cmd)}")
+ return subprocess.run(cmd, cwd=Path(__file__).parent.parent)
+
+
+def launch_fsdp_training(config_path, num_gpus):
+ """Launch FSDP training using accelerate"""
+ print(f"Launching FSDP training on {num_gpus} GPUs...")
+
+ setup_environment_for_strategy("fsdp")
+
+ # Create accelerate config for FSDP
+ accelerate_config = f"""
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+ fsdp_backward_prefetch: BACKWARD_PRE
+ fsdp_cpu_ram_efficient_loading: true
+ fsdp_forward_prefetch: false
+ fsdp_offload_params: false
+ fsdp_sharding_strategy: FULL_SHARD
+ fsdp_state_dict_type: SHARDED_STATE_DICT
+ fsdp_sync_module_states: true
+ fsdp_transformer_layer_cls_to_wrap: Gemma2DecoderLayer
+ fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: {num_gpus}
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+"""
+
+ # Save config temporarily
+ config_file = Path(__file__).parent.parent / "accelerate_config.yaml"
+ with open(config_file, "w") as f:
+ f.write(accelerate_config)
+
+    # Launch accelerate through uv when available; accelerate must get the script path, not the uv binary
+    launcher_prefix = ["uv", "run"] if shutil.which("uv") else []
+
+    cmd = launcher_prefix + [
+        "accelerate", "launch",
+        "--config_file", str(config_file),
+        "scripts/train_progressive.py",
+        "--config", config_path
+    ]
+
+ print(f"Running command: {' '.join(cmd)}")
+ result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)
+
+ # Clean up config file
+ config_file.unlink(missing_ok=True)
+
+ return result
+
+
+def launch_deepspeed_training(config_path, num_gpus):
+ """Launch DeepSpeed training"""
+ print(f"Launching DeepSpeed training on {num_gpus} GPUs...")
+
+ setup_environment_for_strategy("deepspeed")
+
+ # Create DeepSpeed hostfile
+ hostfile = Path(__file__).parent.parent / "hostfile"
+ with open(hostfile, "w") as f:
+ f.write(f"localhost slots={num_gpus}\n")
+
+    # Launch the deepspeed runner through uv when available; it must get the script path, not the uv binary
+    launcher_prefix = ["uv", "run"] if shutil.which("uv") else []
+
+    cmd = launcher_prefix + [
+        "deepspeed",
+        "--hostfile", str(hostfile),
+        "--num_gpus", str(num_gpus),
+        "scripts/train_progressive.py",
+        "--config", config_path,
+        "--deepspeed"
+    ]
+
+ print(f"Running command: {' '.join(cmd)}")
+ result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)
+
+ # Clean up hostfile
+ hostfile.unlink(missing_ok=True)
+
+ return result
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Multi-GPU Progressive LLM Training")
+ parser.add_argument("--config", type=str, required=True,
+ help="Path to training configuration file")
+ parser.add_argument("--strategy", type=str, default="ddp",
+ choices=["ddp", "fsdp", "deepspeed"],
+ help="Multi-GPU strategy to use")
+ parser.add_argument("--num_gpus", type=int, default=None,
+ help="Number of GPUs to use (default: all available)")
+ parser.add_argument("--dry_run", action="store_true",
+ help="Print commands without executing")
+
+ args = parser.parse_args()
+
+ # Get GPU count
+ available_gpus = get_gpu_count()
+ if available_gpus == 0:
+ print("❌ No GPUs available!")
+ sys.exit(1)
+
+ num_gpus = args.num_gpus or available_gpus
+ if num_gpus > available_gpus:
+ print(f"❌ Requested {num_gpus} GPUs but only {available_gpus} available")
+ sys.exit(1)
+
+ # Check config file exists
+ if not Path(args.config).exists():
+ print(f"❌ Config file not found: {args.config}")
+ sys.exit(1)
+
+ print("Progressive LLM Training - Multi-GPU Launcher")
+ print("=" * 60)
+ print(f"Strategy: {args.strategy}")
+ print(f"GPUs: {num_gpus} / {available_gpus}")
+ print(f"Config: {args.config}")
+ print("=" * 60)
+
+ if args.dry_run:
+ print("DRY RUN - Commands that would be executed:")
+ # Show what would be run
+ if args.strategy == "ddp":
+ print("torchrun --nproc_per_node", num_gpus, "scripts/train_progressive.py")
+ elif args.strategy == "fsdp":
+ print("accelerate launch --config_file accelerate_config.yaml scripts/train_progressive.py")
+ elif args.strategy == "deepspeed":
+ print("deepspeed --num_gpus", num_gpus, "scripts/train_progressive.py")
+ return
+
+ # Launch training
+ if args.strategy == "ddp":
+ result = launch_ddp_training(args.config, num_gpus)
+ elif args.strategy == "fsdp":
+ result = launch_fsdp_training(args.config, num_gpus)
+ elif args.strategy == "deepspeed":
+ result = launch_deepspeed_training(args.config, num_gpus)
+
+ if result.returncode == 0:
+ print("✅ Training completed successfully!")
+ else:
+ print("❌ Training failed!")
+ sys.exit(result.returncode)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/scripts/train_progressive.py b/scripts/train_progressive.py
index d3cd938..9b4df04 100755
--- a/scripts/train_progressive.py
+++ b/scripts/train_progressive.py
@@ -6,6 +6,7 @@ Main training script for progressive reasoning model
import sys
import yaml
import argparse
+import os
from pathlib import Path
# Add src to path
@@ -56,6 +57,18 @@ Examples:
help="Load config and model but skip training (for testing)"
)
+ parser.add_argument(
+ "--distributed",
+ action="store_true",
+ help="Enable distributed training"
+ )
+
+ parser.add_argument(
+ "--deepspeed",
+ action="store_true",
+ help="Enable DeepSpeed training"
+ )
+
return parser.parse_args()
@@ -74,9 +87,34 @@ def load_config(config_path: str) -> dict:
return config
+def setup_distributed_training():
+ """Setup distributed training environment"""
+ # Check if we're in a distributed environment
+ if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+ import torch.distributed as dist
+ import torch
+
+ # Initialize distributed training
+ if not dist.is_initialized():
+ dist.init_process_group(backend="nccl")
+
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
+ torch.cuda.set_device(local_rank)
+
+ print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}")
+ return True
+
+ return False
+
+
def main():
args = parse_args()
+ # Setup distributed training if requested
+ is_distributed = False
+ if args.distributed or args.deepspeed:
+ is_distributed = setup_distributed_training()
+
print("Progressive LLM Training for 松尾研LLMコンペ2025")
print("=" * 50)
@@ -95,11 +133,26 @@ def main():
print(f"Error loading config: {e}")
sys.exit(1)
+ # Add distributed/deepspeed flags to config
+ config["training_args"] = config.get("training_args", {})
+ if args.distributed:
+ config["training_args"]["distributed"] = True
+ if args.deepspeed:
+ config["training_args"]["deepspeed"] = True
+ # Add DeepSpeed config from main config
+ if "deepspeed" in config:
+ config["training_args"]["deepspeed_config"] = config["deepspeed"]
+
# Print configuration info
print(f"Experiment: {config['experiment']['name']}")
print(f"Base model: {config['experiment']['base_model']}")
print(f"Output directory: {config['experiment']['output_dir']}")
print(f"Stages: {len(config['progressive_stages'])}")
+ if is_distributed:
+ print("Mode: Distributed Training")
+ if args.deepspeed:
+ print("Backend: DeepSpeed")
+ print("=" * 50)
# Prepare sample datasets if requested
if args.prepare_data:
diff --git a/src/training.py b/src/training.py
index af6f63a..8f05f2e 100644
--- a/src/training.py
+++ b/src/training.py
@@ -367,27 +367,55 @@ class ProgressiveTrainer:
print(f"Final dataset size: {len(dataset)} examples")
- # Training arguments - with CPU offload optimizations
- training_args = TrainingArguments(
- output_dir=f"./outputs/checkpoints/{stage_name}",
- num_train_epochs=stage_config["training"]["num_epochs"],
- per_device_train_batch_size=stage_config["training"]["per_device_batch_size"],
- gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"],
- learning_rate=float(stage_config["training"]["learning_rate"]), # Ensure it's a float
- warmup_steps=stage_config["training"]["warmup_steps"],
- logging_steps=stage_config["training"].get("logging_steps", 10),
- save_strategy="epoch",
- eval_strategy="no",
- bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
- gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False),
- max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0),
- report_to="wandb" if self.config["experiment"]["use_wandb"] else "none",
- run_name=f"{self.config['experiment']['name']}_{stage_name}",
- dataloader_pin_memory=False, # Reduce memory usage
- remove_unused_columns=False, # Keep all columns
- optim=stage_config["training"].get("optim", "adamw_torch"), # Support 8-bit optimizers
- dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2),
- )
+ # Training arguments - with multi-GPU and CPU offload optimizations
+ training_args_dict = {
+ "output_dir": f"./outputs/checkpoints/{stage_name}",
+ "num_train_epochs": stage_config["training"]["num_epochs"],
+ "per_device_train_batch_size": stage_config["training"]["per_device_batch_size"],
+ "gradient_accumulation_steps": stage_config["training"]["gradient_accumulation_steps"],
+ "learning_rate": float(stage_config["training"]["learning_rate"]), # Ensure it's a float
+ "warmup_steps": stage_config["training"]["warmup_steps"],
+ "logging_steps": stage_config["training"].get("logging_steps", 10),
+ "save_strategy": "epoch",
+ "eval_strategy": "no",
+ "bf16": torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
+ "gradient_checkpointing": self.config["model"].get("gradient_checkpointing", False),
+ "max_grad_norm": stage_config["training"].get("max_grad_norm", 1.0),
+ "report_to": "wandb" if self.config["experiment"]["use_wandb"] else "none",
+ "run_name": f"{self.config['experiment']['name']}_{stage_name}",
+ "dataloader_pin_memory": stage_config["training"].get("dataloader_pin_memory", False),
+ "remove_unused_columns": False, # Keep all columns
+ "optim": stage_config["training"].get("optim", "adamw_torch"), # Support 8-bit optimizers
+ "dataloader_num_workers": stage_config["training"].get("dataloader_num_workers", 2),
+ }
+
+ # Add multi-GPU specific settings
+ if self.config.get("training_args", {}).get("distributed", False):
+ training_args_dict.update({
+ "ddp_find_unused_parameters": False,
+ "ddp_bucket_cap_mb": 200,
+ "ddp_broadcast_buffers": False,
+ })
+
+ # Add DeepSpeed configuration
+ if self.config.get("training_args", {}).get("deepspeed", False):
+ deepspeed_config = self.config.get("training_args", {}).get("deepspeed_config")
+ if deepspeed_config:
+ training_args_dict["deepspeed"] = deepspeed_config
+
+ # Add FSDP configuration
+ if "fsdp" in self.config:
+ fsdp_config = self.config["fsdp"]
+ training_args_dict.update({
+ "fsdp": fsdp_config.get("fsdp_sharding_strategy", "FULL_SHARD"),
+ "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap"),
+ "fsdp_auto_wrap_policy": fsdp_config.get("fsdp_auto_wrap_policy", "TRANSFORMER_BASED_WRAP"),
+ "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1000000),
+ "fsdp_cpu_ram_efficient_loading": fsdp_config.get("fsdp_cpu_offload", False),
+ "fsdp_sync_module_states": fsdp_config.get("fsdp_sync_module_states", True),
+ })
+
+ training_args = TrainingArguments(**training_args_dict)
# Print dataset info for debugging
print(f"Dataset columns: {dataset.column_names}")
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000..f5d5904
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by uv.
+# It is not intended for manual editing.
+version = 1
+requires-python = ">=3.9"
+
+# Note: This is a placeholder lock file.
+# Run `uv lock` to generate the actual lock file with resolved dependencies.
\ No newline at end of file