Hello (こんにちは)
parent 3c513fee17
commit 5ca971b0a4
19 changed files with 1559 additions and 41 deletions
13  .env.example  (Normal file)
@@ -0,0 +1,13 @@
# Environment variables for Progressive LLM Training
# Copy this file to .env and fill in your values

# HuggingFace
HF_TOKEN=your_token_here

# Weights & Biases
WANDB_API_KEY=your_api_key_here
WANDB_PROJECT=matsuo-llm-comp-2025

# GPU Configuration
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NCCL_DEBUG=WARN
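A minimal sketch of how these variables could be picked up at runtime. It assumes the values are already exported in the shell (for example via `set -a; . ./.env; set +a`); python-dotenv is not declared in pyproject.toml, so no loader is assumed here.

```python
# Sketch: read the environment variables declared in .env.example.
# Assumes they are exported in the shell; no .env loader is part of this commit.
import os

hf_token = os.environ.get("HF_TOKEN")  # used for gated HuggingFace model downloads
wandb_project = os.environ.get("WANDB_PROJECT", "matsuo-llm-comp-2025")

if hf_token is None:
    print("HF_TOKEN not set; gated HuggingFace models may fail to download")
print(f"Logging to W&B project: {wandb_project}")
```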
33  .gitignore  (vendored)
@@ -1,32 +1,35 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
venv/
ENV/
env/
.venv/
venv/

# Nix
result
result-*

# Project specific
# Training outputs
outputs/
data/
*.log
!data/basic_cot/train.jsonl
wandb/
.ipynb_checkpoints/
*.log

# Model files
*.pt
*.pth
*.bin
*.safetensors

# Temporary
*.tmp
.cache/
accelerate_config.yaml
hostfile

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
*~

# Keep lock files
!uv.lock
1  .python-version  (Normal file)
@@ -0,0 +1 @@
3.11
43  README.md  (Normal file)
@@ -0,0 +1,43 @@
# Progressive LLM Training

Progressive training for LLMs with 8-GPU support for 松尾研LLMコンペ2025.

## Quick Start

```bash
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh

# Setup project
git clone <repository-url>
cd progressive-llm-training
uv sync

# Start training
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Training Stages

1. **basic_cot** - Basic reasoning
2. **math_reasoning** - Math with OpenR1-Math-220k
3. **complex_reasoning** - Complex reasoning with Mixture-of-Thoughts

## Commands

```bash
uv sync  # Install dependencies
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml  # Single GPU
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed  # 8 GPUs
uv run pytest  # Run tests
```

## Key Files

- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8-GPU config
- `scripts/train_progressive.py` - Main training script
- `scripts/train_gemma3_1b_8gpu.sh` - 8-GPU launcher
- `src/progressive_model.py` - Core model implementation

Ready to train! 🚀
110  config/training_config_8gpu.yaml  (Normal file)
@@ -0,0 +1,110 @@
experiment:
  name: "progressive_reasoning_8gpu"
  base_model: "google/gemma-2-2b-it"  # Can scale up to larger models
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: true  # Enable if available for speed
  use_eager_attention: false

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank since we have more memory
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 16  # Large batch size per GPU
      gradient_accumulation_steps: 1  # No need for accumulation with 8 GPUs
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4  # More workers for data loading
      dataloader_pin_memory: true
      remove_unused_columns: false

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # Reduce for larger model
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 100000  # Can process more with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank with multi-GPU
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 4
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 50000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
138  config/training_config_8gpu_deepspeed.yaml  (Normal file)
@@ -0,0 +1,138 @@
experiment:
  name: "progressive_reasoning_8gpu_deepspeed"
  base_model: "google/gemma-2-2b-it"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 200000000
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 200000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed

  optimizer:
    type: "AdamW"
    params:
      lr: 3e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 3e-4
      warmup_num_steps: 200

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
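Note on the DeepSpeed batch settings above: DeepSpeed requires that the global batch size equal the per-GPU micro batch times gradient accumulation times the number of GPUs. A quick sanity check for this config (assuming the 8 GPUs declared in `CUDA_VISIBLE_DEVICES`):

```python
# Sanity check for the DeepSpeed batch-size invariant:
# train_batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size
micro_batch_per_gpu = 64   # train_micro_batch_size_per_gpu above
grad_accum_steps = 1       # basic_cot stage uses no accumulation
world_size = 8             # 8 GPUs

effective_batch = micro_batch_per_gpu * grad_accum_steps * world_size
assert effective_batch == 512  # matches train_batch_size: 512
print(f"effective global batch size: {effective_batch}")
```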
113  config/training_config_8gpu_fsdp.yaml  (Normal file)
@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_8gpu_fsdp"
  base_model: "google/gemma-2-2b-it"  # Can scale to much larger models with FSDP
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "Gemma2DecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64  # Can use larger ranks with FSDP
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000  # Process even more data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256  # Very large rank possible with FSDP
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
109  config/training_config_gemma3_1b_8gpu_ddp.yaml  (Normal file)
@@ -0,0 +1,109 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_ddp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 16  # Moderate rank for DDP
      lora_alpha: 32
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # 32 * 8 = 256 total batch size
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 32
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # 16 * 8 = 128 total batch size
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 400000  # Process substantial data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # 8 * 8 = 64 total batch size
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 600000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
139  config/training_config_gemma3_1b_8gpu_deepspeed.yaml  (Normal file)
@@ -0,0 +1,139 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_deepspeed"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Disable quantization for DeepSpeed
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true  # Enable for memory efficiency
  use_flash_attention_2: false
  use_eager_attention: true

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 500000000  # 500MB buckets
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed with small model

  optimizer:
    type: "AdamW"
    params:
      lr: 5e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 5e-4
      warmup_num_steps: 100

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank with 8 GPUs
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1  # No accumulation needed
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64  # Larger rank for math reasoning
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32  # Reduce for longer sequences
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 500000  # Process more data with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank for complex reasoning
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # Reduce for very long sequences
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 800000  # Process even more data
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
113  config/training_config_gemma3_1b_8gpu_fsdp.yaml  (Normal file)
@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_fsdp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "GemmaDecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed with small model
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Can use larger ranks with FSDP
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 48  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 24
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 600000  # Process even more data with FSDP
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Very large rank possible with FSDP
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 12
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 1000000  # Can process 1M samples efficiently
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
37  docs/README.md  (Normal file)
@@ -0,0 +1,37 @@
# Progressive LLM Training Documentation

## Setup

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
```

## Training

### Single GPU
```bash
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
```

### 8 GPUs
```bash
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Configuration

- `config/training_config_gemma3_1b.yaml` - Single GPU
- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8 GPUs

## Environment

Copy `.env.example` to `.env` and set:
- `HF_TOKEN` - HuggingFace token
- `WANDB_API_KEY` - W&B API key

## Troubleshooting

- Reduce `per_device_batch_size` for memory issues
- `export NCCL_DEBUG=INFO` for NCCL errors
- `nvidia-smi` to check GPUs
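To go with the troubleshooting tips above, a quick Python sanity check (a sketch using only standard PyTorch calls) can confirm the GPU count and BF16 support that the 8-GPU configs assume:

```python
# Quick GPU sanity check before launching multi-GPU training (sketch).
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"BF16 supported: {torch.cuda.is_bf16_supported()}")  # configs set bf16: true
    for i in range(torch.cuda.device_count()):
        print(f"  {i}: {torch.cuda.get_device_name(i)}")
```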
26  pyproject.toml  (Normal file)
@@ -0,0 +1,26 @@
[project]
name = "progressive-llm-training"
version = "0.1.0"
description = "Progressive LLM Training for 松尾研LLMコンペ2025"
requires-python = ">=3.9"

dependencies = [
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "accelerate>=0.27.0",
    "peft>=0.11.0",
    "trl>=0.9.0",
    "datasets>=2.18.0",
    "bitsandbytes>=0.43.0",
    "wandb>=0.16.0",
    "pyyaml>=6.0",
    "jsonlines>=4.0.0",
    "deepspeed>=0.12.0",
]

[project.optional-dependencies]
dev = ["pytest", "black", "isort"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
@@ -1,3 +1,5 @@
# Use uv instead: uv sync
torch>=2.0.0
transformers>=4.40.0
accelerate>=0.27.0
peft>=0.11.0
@@ -7,7 +9,4 @@ bitsandbytes>=0.43.0
wandb>=0.16.0
pyyaml>=6.0
jsonlines>=4.0.0
scikit-learn>=1.3.0
# flash-attn>=2.5.0 # Install separately with --no-build-isolation
sentencepiece>=0.2.0
protobuf>=4.25.0
deepspeed>=0.12.0
201  scripts/debug_model_loading.py  (Normal file)
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Debug script to identify model loading issues
"""

import sys
import os
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def clear_accelerate_env():
    """Clear all ACCELERATE environment variables"""
    print("Clearing ACCELERATE environment variables...")
    env_vars_to_clear = []
    for key in os.environ:
        if 'ACCELERATE' in key:
            env_vars_to_clear.append(key)

    for var in env_vars_to_clear:
        print(f"  Removing {var}={os.environ[var]}")
        del os.environ[var]


def test_basic_model_loading():
    """Test basic model loading without any configuration"""
    print("Testing basic model loading...")

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "google/gemma-2-2b-it"

    try:
        print("Testing with absolutely minimal config...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32
        )
        print("✅ Basic loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Basic loading failed: {e}")
        return False


def test_with_device_map():
    """Test with device_map auto"""
    print("Testing with device_map='auto'...")

    from transformers import AutoModelForCausalLM

    model_name = "google/gemma-2-2b-it"

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            device_map="auto"
        )
        print("✅ Device map loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Device map loading failed: {e}")
        return False


def test_with_quantization():
    """Test with quantization"""
    print("Testing with 4-bit quantization...")

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model_name = "google/gemma-2-2b-it"

    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            quantization_config=bnb_config
        )
        print("✅ Quantization loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Quantization loading failed: {e}")
        return False


def print_environment_info():
    """Print detailed environment information"""
    print("\n" + "="*50)
    print("ENVIRONMENT INFORMATION")
    print("="*50)

    # Python version
    print(f"Python version: {sys.version}")

    # PyTorch info
    try:
        import torch
        print(f"PyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device count: {torch.cuda.device_count()}")
            for i in range(torch.cuda.device_count()):
                print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
            print(f"CUDA version: {torch.version.cuda}")
    except Exception as e:
        print(f"PyTorch info error: {e}")

    # Transformers info
    try:
        from transformers import __version__ as tf_version
        print(f"Transformers version: {tf_version}")
    except Exception as e:
        print(f"Transformers info error: {e}")

    # Accelerate info
    try:
        from accelerate import __version__ as acc_version
        print(f"Accelerate version: {acc_version}")
    except Exception as e:
        print(f"Accelerate info error: {e}")

    # PEFT info
    try:
        from peft import __version__ as peft_version
        print(f"PEFT version: {peft_version}")
    except Exception as e:
        print(f"PEFT info error: {e}")

    # BitsAndBytes info
    try:
        import bitsandbytes as bnb
        print(f"BitsAndBytes version: {bnb.__version__}")
    except Exception as e:
        print(f"BitsAndBytes info error: {e}")

    # Environment variables
    print("\nRelevant environment variables:")
    for key, value in sorted(os.environ.items()):
        if any(prefix in key for prefix in ['CUDA', 'TORCH', 'HF_', 'ACCELERATE', 'TRANSFORMERS']):
            print(f"  {key}={value}")


def main():
    print("Progressive LLM Training - Model Loading Debug")
    print("=" * 60)

    # Print environment info first
    print_environment_info()

    # Clear environment variables
    clear_accelerate_env()

    # Test various loading methods
    print("\n" + "="*50)
    print("TESTING MODEL LOADING")
    print("="*50)

    results = []

    # Test 1: Basic loading
    results.append(("Basic loading", test_basic_model_loading()))

    # Test 2: With device map
    results.append(("Device map", test_with_device_map()))

    # Test 3: With quantization
    results.append(("Quantization", test_with_quantization()))

    # Summary
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)

    for test_name, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{test_name}: {status}")

    if any(result[1] for result in results):
        print("\n✅ At least one loading method works!")
        print("Use the successful method in your configuration.")
    else:
        print("\n❌ All loading methods failed!")
        print("This indicates a fundamental environment issue.")
        print("Consider:")
        print("1. Reinstalling transformers, accelerate, torch")
        print("2. Checking CUDA installation")
        print("3. Using a different model")


if __name__ == "__main__":
    main()
161  scripts/train_gemma3_1b_8gpu.sh  (Executable file)
@@ -0,0 +1,161 @@
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}"
echo "======================================================="

# Check if uv is available
if command -v uv &> /dev/null; then
    echo -e "${GREEN}Using uv for Python environment management${NC}"
    UV_PREFIX="uv run"
else
    echo -e "${YELLOW}uv not found, using standard python${NC}"
    UV_PREFIX="python"
fi

# Default values
STRATEGY="deepspeed"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --strategy)
            STRATEGY="$2"
            shift 2
            ;;
        --config)
            CONFIG="$2"
            shift 2
            ;;
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo ""
            echo "Options:"
            echo "  --strategy <ddp|fsdp|deepspeed>  Training strategy (default: deepspeed)"
            echo "  --config <path>                  Custom config file (optional)"
            echo "  --num-gpus <n>                   Number of GPUs to use (default: 8)"
            echo "  --dry-run                        Show command without executing"
            echo ""
            echo "Examples:"
            echo "  # Use DeepSpeed (recommended)"
            echo "  $0 --strategy deepspeed"
            echo ""
            echo "  # Use DDP"
            echo "  $0 --strategy ddp"
            echo ""
            echo "  # Use FSDP"
            echo "  $0 --strategy fsdp"
            echo ""
            echo "  # Use custom config"
            echo "  $0 --strategy ddp --config config/my_config.yaml"
            exit 0
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}"
            exit 1
            ;;
    esac
done

# Check GPU availability
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"

if [ $GPU_COUNT -lt $NUM_GPUS ]; then
    echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
    exit 1
fi

# Set default config based on strategy if not provided
if [ -z "$CONFIG" ]; then
    case $STRATEGY in
        ddp)
            CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
            ;;
        fsdp)
            CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
            ;;
        deepspeed)
            CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
            ;;
        *)
            echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
            exit 1
            ;;
    esac
fi

# Check if config file exists
if [ ! -f "$CONFIG" ]; then
    echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
    exit 1
fi

echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
echo -e "Config: ${YELLOW}$CONFIG${NC}"
echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
echo ""

# Build the command
CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"

if [ "$DRY_RUN" = true ]; then
    echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
    echo "$CMD"
    exit 0
fi

# Show GPU memory before training
echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv

echo ""
echo -e "${GREEN}Starting training...${NC}"
echo "Command: $CMD"
echo ""

# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_DEBUG=WARN  # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1

# For DeepSpeed, set additional optimizations
if [ "$STRATEGY" = "deepspeed" ]; then
    export DS_SKIP_CUDA_CHECK=1
    export TOKENIZERS_PARALLELISM=false
fi

# Execute the training command
$CMD

# Check exit status
if [ $? -eq 0 ]; then
    echo ""
    echo -e "${GREEN}Training completed successfully!${NC}"

    # Show GPU memory after training
    echo ""
    echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
    nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
    echo ""
    echo -e "${RED}Training failed!${NC}"
    exit 1
fi
224  scripts/train_multi_gpu.py  (Executable file)
@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Multi-GPU training launcher for progressive reasoning model
Supports DDP, FSDP, and DeepSpeed strategies
"""

import os
import sys
import argparse
import subprocess
import shutil
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def get_gpu_count():
    """Get the number of available GPUs"""
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    return 0


def setup_environment_for_strategy(strategy):
    """Set up environment variables for different strategies"""
    if strategy == "deepspeed":
        # DeepSpeed specific environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        os.environ["RANK"] = "0"
        os.environ["LOCAL_RANK"] = "0"
        os.environ["WORLD_SIZE"] = str(get_gpu_count())
    elif strategy in ["ddp", "fsdp"]:
        # Standard distributed training environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        # Let torchrun handle the rest

    # General optimizations
    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
    os.environ["NCCL_DEBUG"] = "INFO"
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"


def launch_ddp_training(config_path, num_gpus):
    """Launch DDP training using torchrun"""
    print(f"Launching DDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("ddp")

    # Use torchrun for DDP
    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--master_port", "12355",
    ] + python_cmd + [
        "--config", config_path,
        "--distributed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    return subprocess.run(cmd, cwd=Path(__file__).parent.parent)


def launch_fsdp_training(config_path, num_gpus):
    """Launch FSDP training using accelerate"""
    print(f"Launching FSDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("fsdp")

    # Create accelerate config for FSDP
    accelerate_config = f"""
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_transformer_layer_cls_to_wrap: Gemma2DecoderLayer
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: {num_gpus}
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""

    # Save config temporarily
    config_file = Path(__file__).parent.parent / "accelerate_config.yaml"
    with open(config_file, "w") as f:
        f.write(accelerate_config)

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "accelerate", "launch",
        "--config_file", str(config_file),
    ] + python_cmd + [
        "--config", config_path
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up config file
    config_file.unlink(missing_ok=True)

    return result


def launch_deepspeed_training(config_path, num_gpus):
    """Launch DeepSpeed training"""
    print(f"Launching DeepSpeed training on {num_gpus} GPUs...")

    setup_environment_for_strategy("deepspeed")

    # Create DeepSpeed hostfile
    hostfile = Path(__file__).parent.parent / "hostfile"
    with open(hostfile, "w") as f:
        f.write(f"localhost slots={num_gpus}\n")

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "deepspeed",
        "--hostfile", str(hostfile),
        "--num_gpus", str(num_gpus),
    ] + python_cmd + [
        "--config", config_path,
        "--deepspeed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up hostfile
    hostfile.unlink(missing_ok=True)

    return result


def main():
    parser = argparse.ArgumentParser(description="Multi-GPU Progressive LLM Training")
    parser.add_argument("--config", type=str, required=True,
                        help="Path to training configuration file")
    parser.add_argument("--strategy", type=str, default="ddp",
                        choices=["ddp", "fsdp", "deepspeed"],
                        help="Multi-GPU strategy to use")
    parser.add_argument("--num_gpus", type=int, default=None,
                        help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dry_run", action="store_true",
                        help="Print commands without executing")

    args = parser.parse_args()

    # Get GPU count
    available_gpus = get_gpu_count()
    if available_gpus == 0:
        print("❌ No GPUs available!")
        sys.exit(1)

    num_gpus = args.num_gpus or available_gpus
    if num_gpus > available_gpus:
        print(f"❌ Requested {num_gpus} GPUs but only {available_gpus} available")
        sys.exit(1)

    # Check config file exists
    if not Path(args.config).exists():
        print(f"❌ Config file not found: {args.config}")
        sys.exit(1)

    print("Progressive LLM Training - Multi-GPU Launcher")
    print("=" * 60)
    print(f"Strategy: {args.strategy}")
    print(f"GPUs: {num_gpus} / {available_gpus}")
    print(f"Config: {args.config}")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN - Commands that would be executed:")
        # Show what would be run
        if args.strategy == "ddp":
            print("torchrun --nproc_per_node", num_gpus, "scripts/train_progressive.py")
        elif args.strategy == "fsdp":
            print("accelerate launch --config_file accelerate_config.yaml scripts/train_progressive.py")
        elif args.strategy == "deepspeed":
            print("deepspeed --num_gpus", num_gpus, "scripts/train_progressive.py")
        return

    # Launch training
    if args.strategy == "ddp":
        result = launch_ddp_training(args.config, num_gpus)
    elif args.strategy == "fsdp":
        result = launch_fsdp_training(args.config, num_gpus)
    elif args.strategy == "deepspeed":
        result = launch_deepspeed_training(args.config, num_gpus)

    if result.returncode == 0:
        print("✅ Training completed successfully!")
    else:
        print("❌ Training failed!")
        sys.exit(result.returncode)


if __name__ == "__main__":
    main()
@@ -6,6 +6,7 @@ Main training script for progressive reasoning model
import sys
import yaml
import argparse
import os
from pathlib import Path

# Add src to path
@@ -56,6 +57,18 @@ Examples:
        help="Load config and model but skip training (for testing)"
    )

    parser.add_argument(
        "--distributed",
        action="store_true",
        help="Enable distributed training"
    )

    parser.add_argument(
        "--deepspeed",
        action="store_true",
        help="Enable DeepSpeed training"
    )

    return parser.parse_args()


@@ -74,9 +87,34 @@ def load_config(config_path: str) -> dict:
    return config


def setup_distributed_training():
    """Setup distributed training environment"""
    # Check if we're in a distributed environment
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        import torch.distributed as dist
        import torch

        # Initialize distributed training
        if not dist.is_initialized():
            dist.init_process_group(backend="nccl")

        local_rank = int(os.environ.get("LOCAL_RANK", 0))
        torch.cuda.set_device(local_rank)

        print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}")
        return True

    return False


def main():
    args = parse_args()

    # Setup distributed training if requested
    is_distributed = False
    if args.distributed or args.deepspeed:
        is_distributed = setup_distributed_training()

    print("Progressive LLM Training for 松尾研LLMコンペ2025")
    print("=" * 50)

@@ -95,11 +133,26 @@ def main():
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Add distributed/deepspeed flags to config
    config["training_args"] = config.get("training_args", {})
    if args.distributed:
        config["training_args"]["distributed"] = True
    if args.deepspeed:
        config["training_args"]["deepspeed"] = True
        # Add DeepSpeed config from main config
        if "deepspeed" in config:
            config["training_args"]["deepspeed_config"] = config["deepspeed"]

    # Print configuration info
    print(f"Experiment: {config['experiment']['name']}")
    print(f"Base model: {config['experiment']['base_model']}")
    print(f"Output directory: {config['experiment']['output_dir']}")
    print(f"Stages: {len(config['progressive_stages'])}")
    if is_distributed:
        print("Mode: Distributed Training")
        if args.deepspeed:
            print("Backend: DeepSpeed")
    print("=" * 50)

    # Prepare sample datasets if requested
    if args.prepare_data:
@@ -367,27 +367,55 @@ class ProgressiveTrainer:

        print(f"Final dataset size: {len(dataset)} examples")

        # Training arguments - with CPU offload optimizations
        training_args = TrainingArguments(
            output_dir=f"./outputs/checkpoints/{stage_name}",
            num_train_epochs=stage_config["training"]["num_epochs"],
            per_device_train_batch_size=stage_config["training"]["per_device_batch_size"],
            gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"],
            learning_rate=float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
            warmup_steps=stage_config["training"]["warmup_steps"],
            logging_steps=stage_config["training"].get("logging_steps", 10),
            save_strategy="epoch",
            eval_strategy="no",
            bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
            gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False),
            max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0),
            report_to="wandb" if self.config["experiment"]["use_wandb"] else "none",
            run_name=f"{self.config['experiment']['name']}_{stage_name}",
            dataloader_pin_memory=False,  # Reduce memory usage
            remove_unused_columns=False,  # Keep all columns
            optim=stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
            dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2),
        )
        # Training arguments - with multi-GPU and CPU offload optimizations
        training_args_dict = {
            "output_dir": f"./outputs/checkpoints/{stage_name}",
            "num_train_epochs": stage_config["training"]["num_epochs"],
            "per_device_train_batch_size": stage_config["training"]["per_device_batch_size"],
            "gradient_accumulation_steps": stage_config["training"]["gradient_accumulation_steps"],
            "learning_rate": float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
            "warmup_steps": stage_config["training"]["warmup_steps"],
            "logging_steps": stage_config["training"].get("logging_steps", 10),
            "save_strategy": "epoch",
            "eval_strategy": "no",
            "bf16": torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
            "gradient_checkpointing": self.config["model"].get("gradient_checkpointing", False),
            "max_grad_norm": stage_config["training"].get("max_grad_norm", 1.0),
            "report_to": "wandb" if self.config["experiment"]["use_wandb"] else "none",
            "run_name": f"{self.config['experiment']['name']}_{stage_name}",
            "dataloader_pin_memory": stage_config["training"].get("dataloader_pin_memory", False),
            "remove_unused_columns": False,  # Keep all columns
            "optim": stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
            "dataloader_num_workers": stage_config["training"].get("dataloader_num_workers", 2),
        }

        # Add multi-GPU specific settings
        if self.config.get("training_args", {}).get("distributed", False):
            training_args_dict.update({
                "ddp_find_unused_parameters": False,
                "ddp_bucket_cap_mb": 200,
                "ddp_broadcast_buffers": False,
            })

        # Add DeepSpeed configuration
        if self.config.get("training_args", {}).get("deepspeed", False):
            deepspeed_config = self.config.get("training_args", {}).get("deepspeed_config")
            if deepspeed_config:
                training_args_dict["deepspeed"] = deepspeed_config

        # Add FSDP configuration
        if "fsdp" in self.config:
            fsdp_config = self.config["fsdp"]
            training_args_dict.update({
                "fsdp": fsdp_config.get("fsdp_sharding_strategy", "FULL_SHARD"),
                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap"),
                "fsdp_auto_wrap_policy": fsdp_config.get("fsdp_auto_wrap_policy", "TRANSFORMER_BASED_WRAP"),
                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1000000),
                "fsdp_cpu_ram_efficient_loading": fsdp_config.get("fsdp_cpu_offload", False),
                "fsdp_sync_module_states": fsdp_config.get("fsdp_sync_module_states", True),
            })

        training_args = TrainingArguments(**training_args_dict)

        # Print dataset info for debugging
        print(f"Dataset columns: {dataset.column_names}")
7  uv.lock  (generated, Normal file)
@@ -0,0 +1,7 @@
# This file is automatically @generated by uv.
# It is not intended for manual editing.
version = 1
requires-python = ">=3.9"

# Note: This is a placeholder lock file.
# Run `uv lock` to generate the actual lock file with resolved dependencies.