Hello

parent 3c513fee17
commit 5ca971b0a4
19 changed files with 1559 additions and 41 deletions
13  .env.example  Normal file

@@ -0,0 +1,13 @@
# Environment variables for Progressive LLM Training
# Copy this file to .env and fill in your values

# HuggingFace
HF_TOKEN=your_token_here

# Weights & Biases
WANDB_API_KEY=your_api_key_here
WANDB_PROJECT=matsuo-llm-comp-2025

# GPU Configuration
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NCCL_DEBUG=WARN
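Illustrative aside (not from this commit): the variables above are plain environment variables, with HF_TOKEN read by huggingface_hub, WANDB_API_KEY and WANDB_PROJECT by wandb, and the last two by CUDA/NCCL. A minimal sketch of a pre-run check, assuming the values have been exported (or loaded from .env by your shell or python-dotenv, which is not among this project's dependencies):

```python
import os

# Hypothetical pre-run check that the expected variables are set.
required = ["HF_TOKEN", "WANDB_API_KEY", "WANDB_PROJECT"]
missing = [name for name in required if not os.environ.get(name)]
if missing:
    print("Missing environment variables:", ", ".join(missing),
          "- copy .env.example to .env and fill in your values")
else:
    print("GPUs visible:", os.environ.get("CUDA_VISIBLE_DEVICES", "<not set>"))
```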

35  .gitignore  vendored

@@ -1,32 +1,35 @@
 # Python
 __pycache__/
 *.py[cod]
-*$py.class
-*.so
-.Python
-venv/
-ENV/
-env/
 .venv/
+venv/

-# Nix
-result
-result-*
-
-# Project specific
+# Training outputs
 outputs/
 data/
-*.log
+!data/basic_cot/train.jsonl
 wandb/
-.ipynb_checkpoints/
+*.log

+# Model files
 *.pt
 *.pth
 *.bin
 *.safetensors

+# Temporary
+*.tmp
+.cache/
+accelerate_config.yaml
+hostfile

 # IDE
 .vscode/
 .idea/
-*.swp
-*.swo
-*~
+
+# OS
+.DS_Store
+*~
+
+# Keep lock files
+!uv.lock

1  .python-version  Normal file

@@ -0,0 +1 @@
3.11

43  README.md  Normal file

@@ -0,0 +1,43 @@
# Progressive LLM Training

Progressive training for LLMs with 8-GPU support for 松尾研LLMコンペ2025.

## Quick Start

```bash
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh

# Setup project
git clone <repository-url>
cd progressive-llm-training
uv sync

# Start training
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Training Stages

1. **basic_cot** - Basic reasoning
2. **math_reasoning** - Math with OpenR1-Math-220k
3. **complex_reasoning** - Complex reasoning with Mixture-of-Thoughts

## Commands

```bash
uv sync                                                                             # Install dependencies
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml  # Single GPU
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed                              # 8 GPUs
uv run pytest                                                                       # Run tests
```

## Key Files

- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8-GPU config
- `scripts/train_progressive.py` - Main training script
- `scripts/train_gemma3_1b_8gpu.sh` - 8-GPU launcher
- `src/progressive_model.py` - Core model implementation

Ready to train! 🚀
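Illustrative aside (not from this commit): the three stages listed above correspond to the `progressive_stages` entries in the YAML configs added below. A minimal sketch of reading them with PyYAML, using config/training_config_8gpu.yaml from this commit as the example path:

```python
import yaml

# List the progressive training stages defined in one of the configs below.
with open("config/training_config_8gpu.yaml") as f:
    config = yaml.safe_load(f)

for stage in config["progressive_stages"]:
    print(f"{stage['name']}: {stage['description']} (dataset: {stage['dataset_path']})")
```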

110  config/training_config_8gpu.yaml  Normal file

@@ -0,0 +1,110 @@
experiment:
  name: "progressive_reasoning_8gpu"
  base_model: "google/gemma-2-2b-it"  # Can scale up to larger models
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: true  # Enable if available for speed
  use_eager_attention: false

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank since we have more memory
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 16  # Large batch size per GPU
      gradient_accumulation_steps: 1  # No need for accumulation with 8 GPUs
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4  # More workers for data loading
      dataloader_pin_memory: true
      remove_unused_columns: false

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # Reduce for larger model
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 100000  # Can process more with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank with multi-GPU
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 4
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 50000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
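Illustrative aside (not from this commit): the `adapter_config` blocks above use standard PEFT LoRA fields, so they map directly onto `peft.LoraConfig`. A rough sketch, assuming `peft` is installed; the `task_type` value is an assumption, not something this config specifies:

```python
from peft import LoraConfig

def lora_config_from_stage(adapter_config: dict) -> LoraConfig:
    # Hypothetical helper: build a LoraConfig from one stage's adapter_config block.
    return LoraConfig(
        r=adapter_config["r"],
        lora_alpha=adapter_config["lora_alpha"],
        lora_dropout=adapter_config["lora_dropout"],
        target_modules=adapter_config["target_modules"],
        init_lora_weights=adapter_config["init_lora_weights"],
        task_type="CAUSAL_LM",  # assumed: causal-LM fine-tuning
    )

# Example with the basic_cot values from the config above.
cfg = lora_config_from_stage({
    "r": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.1,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "init_lora_weights": True,
})
```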

138  config/training_config_8gpu_deepspeed.yaml  Normal file

@@ -0,0 +1,138 @@
experiment:
  name: "progressive_reasoning_8gpu_deepspeed"
  base_model: "google/gemma-2-2b-it"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 200000000
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 200000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed

  optimizer:
    type: "AdamW"
    params:
      lr: 3e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 3e-4
      warmup_num_steps: 200

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
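Illustrative aside (not from this commit): later in this diff, train_progressive.py copies the `deepspeed:` block into `training_args` and the trainer forwards it to `TrainingArguments`, which accepts either a dict or a path to a JSON file. A minimal sketch of that hand-off, assuming `transformers` and `deepspeed` are installed:

```python
import yaml
from transformers import TrainingArguments

# Load the YAML above and hand its deepspeed block to the HF Trainer arguments.
with open("config/training_config_8gpu_deepspeed.yaml") as f:
    config = yaml.safe_load(f)

args = TrainingArguments(
    output_dir="./outputs/checkpoints/basic_cot",
    per_device_train_batch_size=64,
    bf16=True,
    deepspeed=config["deepspeed"],  # a dict or a JSON path are both accepted
)
```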

113  config/training_config_8gpu_fsdp.yaml  Normal file

@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_8gpu_fsdp"
  base_model: "google/gemma-2-2b-it"  # Can scale to much larger models with FSDP
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: true
  use_eager_attention: false

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "Gemma2DecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 64  # Can use larger ranks with FSDP
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 128
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 200000  # Process even more data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 256  # Very large rank possible with FSDP
      lora_alpha: 512
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 8192
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 100000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
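Illustrative aside (not from this commit): the math_reasoning and complex_reasoning stages above declare `inherit_from`, so each stage only overrides part of its parent's settings. The real resolution logic lives in src/progressive_model.py, which this diff shows only in part; one plausible way to resolve such inheritance is a recursive merge (names below are illustrative, not the project's actual API):

```python
import copy

def resolve_stage(stages_by_name: dict, name: str) -> dict:
    # Hypothetical helper: merge a stage onto its inherit_from parent, recursively.
    stage = stages_by_name[name]
    parent_name = stage.get("inherit_from")
    if parent_name is None:
        return copy.deepcopy(stage)
    merged = resolve_stage(stages_by_name, parent_name)
    for key, value in stage.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = {**merged[key], **value}  # merge nested blocks like adapter_config
        else:
            merged[key] = value
    return merged
```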

109  config/training_config_gemma3_1b_8gpu_ddp.yaml  Normal file

@@ -0,0 +1,109 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_ddp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Can use FP16/BF16 with multiple GPUs
  bnb_4bit_compute_dtype: "bfloat16"
  bnb_4bit_use_double_quant: true
  device_map: "balanced"  # Distribute across all GPUs
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# Multi-GPU specific settings
distributed:
  strategy: "ddp"  # Distributed Data Parallel
  find_unused_parameters: false
  gradient_as_bucket_view: true

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 16  # Moderate rank for DDP
      lora_alpha: 32
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 32  # 32 * 8 = 256 total batch size
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 4
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 32
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # 16 * 8 = 128 total batch size
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 400000  # Process substantial data
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 8  # 8 * 8 = 64 total batch size
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 600000
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"

139  config/training_config_gemma3_1b_8gpu_deepspeed.yaml  Normal file

@@ -0,0 +1,139 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_deepspeed"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false  # Disable quantization for DeepSpeed
  device_map: null  # Let DeepSpeed handle device placement
  gradient_checkpointing: true  # Enable for memory efficiency
  use_flash_attention_2: false
  use_eager_attention: true

# DeepSpeed Configuration
deepspeed:
  zero_optimization:
    stage: 2  # ZeRO Stage 2 (partition optimizer states and gradients)
    allgather_partitions: true
    allgather_bucket_size: 500000000  # 500MB buckets
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000
    contiguous_gradients: true
    cpu_offload: false  # Keep on GPU for speed with small model

  optimizer:
    type: "AdamW"
    params:
      lr: 5e-4
      betas: [0.9, 0.999]
      eps: 1e-8
      weight_decay: 0.001

  scheduler:
    type: "WarmupLR"
    params:
      warmup_min_lr: 0
      warmup_max_lr: 5e-4
      warmup_num_steps: 100

  fp16:
    enabled: false

  bf16:
    enabled: true

  gradient_clipping: 1.0

  train_batch_size: 512  # Total batch size across all GPUs
  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Larger rank with 8 GPUs
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 64  # Large batch with DeepSpeed
      gradient_accumulation_steps: 1  # No accumulation needed
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64  # Larger rank for math reasoning
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 32  # Reduce for longer sequences
      gradient_accumulation_steps: 1
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 500000  # Process more data with 8 GPUs
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Maximum rank for complex reasoning
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 16  # Reduce for very long sequences
      gradient_accumulation_steps: 2
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 800000  # Process even more data
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"
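Illustrative aside (not from this commit): DeepSpeed expects `train_batch_size` to equal the per-GPU micro-batch size times gradient accumulation times the number of GPUs; the basic_cot stage above satisfies this with 64 * 1 * 8 = 512. A trivial sketch of that sanity check:

```python
# Sanity check for the DeepSpeed batch-size invariant used in the config above.
micro_batch_per_gpu = 64
gradient_accumulation_steps = 1
num_gpus = 8
train_batch_size = 512

assert train_batch_size == micro_batch_per_gpu * gradient_accumulation_steps * num_gpus
```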

113  config/training_config_gemma3_1b_8gpu_fsdp.yaml  Normal file

@@ -0,0 +1,113 @@
experiment:
  name: "progressive_reasoning_gemma3_1b_8gpu_fsdp"
  base_model: "google/gemma-3-1b-pt"
  output_dir: "./outputs"
  use_wandb: true
  wandb_project: "matsuo-llm-comp-2025"

model:
  load_in_4bit: false
  device_map: null  # Let FSDP handle device placement
  gradient_checkpointing: true
  use_flash_attention_2: false
  use_eager_attention: true

# FSDP Configuration
fsdp:
  fsdp_transformer_layer_cls_to_wrap: "GemmaDecoderLayer"  # Wrap at layer level
  fsdp_sharding_strategy: "FULL_SHARD"  # Shard parameters, gradients, and optimizer states
  fsdp_cpu_offload: false  # Keep on GPU for speed with small model
  fsdp_mixed_precision: true  # Use BF16 mixed precision
  fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  fsdp_min_num_params: 1000000  # Wrap layers with >1M parameters
  fsdp_sync_module_states: true
  fsdp_forward_prefetch: true
  fsdp_use_orig_params: true  # Important for LoRA compatibility

progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
      r: 32  # Can use larger ranks with FSDP
      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
      per_device_batch_size: 48  # Very large batch size with FSDP
      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
      dataloader_num_workers: 8
      dataloader_pin_memory: true

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
      r: 64
      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 24
      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 600000  # Process even more data with FSDP
      split: "train"

  - name: "complex_reasoning"
    description: "Complex multi-step reasoning with Mixture-of-Thoughts"
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
      r: 128  # Very large rank possible with FSDP
      lora_alpha: 256
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
      per_device_batch_size: 12
      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
      bf16: true
      max_grad_norm: 1.0
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
      dataloader_num_workers: 8
    dataset_config:
      streaming: true
      max_samples: 1000000  # Can process 1M samples efficiently
      split: "train"

evaluation:
  benchmarks:
    - "HLE"
    - "Do-Not-Answer"
  save_results: true
  results_dir: "./outputs/evaluation_results"

37  docs/README.md  Normal file

@@ -0,0 +1,37 @@
# Progressive LLM Training Documentation

## Setup

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync
```

## Training

### Single GPU
```bash
uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
```

### 8 GPUs
```bash
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```

## Configuration

- `config/training_config_gemma3_1b.yaml` - Single GPU
- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8 GPUs

## Environment

Copy `.env.example` to `.env` and set:
- `HF_TOKEN` - HuggingFace token
- `WANDB_API_KEY` - W&B API key

## Troubleshooting

- Reduce `per_device_batch_size` for memory issues
- `export NCCL_DEBUG=INFO` for NCCL errors
- `nvidia-smi` to check GPUs
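Illustrative aside (not from this commit): to go with the troubleshooting list above, a small pre-flight check in Python, assuming torch is installed:

```python
import torch

# Verify GPU visibility and BF16 support before an 8-GPU launch.
if not torch.cuda.is_available():
    raise SystemExit("No CUDA devices visible; check drivers and CUDA_VISIBLE_DEVICES")

count = torch.cuda.device_count()
print(f"Visible GPUs: {count}")
print(f"BF16 supported: {torch.cuda.is_bf16_supported()}")
if count < 8:
    print("Fewer than 8 GPUs visible; pass --num-gpus to the launcher or use the single-GPU config")
```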

26  pyproject.toml  Normal file

@@ -0,0 +1,26 @@
[project]
name = "progressive-llm-training"
version = "0.1.0"
description = "Progressive LLM Training for 松尾研LLMコンペ2025"
requires-python = ">=3.9"

dependencies = [
    "torch>=2.0.0",
    "transformers>=4.40.0",
    "accelerate>=0.27.0",
    "peft>=0.11.0",
    "trl>=0.9.0",
    "datasets>=2.18.0",
    "bitsandbytes>=0.43.0",
    "wandb>=0.16.0",
    "pyyaml>=6.0",
    "jsonlines>=4.0.0",
    "deepspeed>=0.12.0",
]

[project.optional-dependencies]
dev = ["pytest", "black", "isort"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

requirements.txt

@@ -1,3 +1,5 @@
+# Use uv instead: uv sync
+torch>=2.0.0
 transformers>=4.40.0
 accelerate>=0.27.0
 peft>=0.11.0

@@ -7,7 +9,4 @@ bitsandbytes>=0.43.0
 wandb>=0.16.0
 pyyaml>=6.0
 jsonlines>=4.0.0
-scikit-learn>=1.3.0
-# flash-attn>=2.5.0  # Install separately with --no-build-isolation
-sentencepiece>=0.2.0
-protobuf>=4.25.0
+deepspeed>=0.12.0

201  scripts/debug_model_loading.py  Normal file

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Debug script to identify model loading issues
"""

import sys
import os
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def clear_accelerate_env():
    """Clear all ACCELERATE environment variables"""
    print("Clearing ACCELERATE environment variables...")
    env_vars_to_clear = []
    for key in os.environ:
        if 'ACCELERATE' in key:
            env_vars_to_clear.append(key)

    for var in env_vars_to_clear:
        print(f"  Removing {var}={os.environ[var]}")
        del os.environ[var]


def test_basic_model_loading():
    """Test basic model loading without any configuration"""
    print("Testing basic model loading...")

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "google/gemma-2-2b-it"

    try:
        print("Testing with absolutely minimal config...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32
        )
        print("✅ Basic loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Basic loading failed: {e}")
        return False


def test_with_device_map():
    """Test with device_map auto"""
    print("Testing with device_map='auto'...")

    from transformers import AutoModelForCausalLM

    model_name = "google/gemma-2-2b-it"

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            device_map="auto"
        )
        print("✅ Device map loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Device map loading failed: {e}")
        return False


def test_with_quantization():
    """Test with quantization"""
    print("Testing with 4-bit quantization...")

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model_name = "google/gemma-2-2b-it"

    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            quantization_config=bnb_config
        )
        print("✅ Quantization loading successful!")
        del model
        return True
    except Exception as e:
        print(f"❌ Quantization loading failed: {e}")
        return False


def print_environment_info():
    """Print detailed environment information"""
    print("\n" + "="*50)
    print("ENVIRONMENT INFORMATION")
    print("="*50)

    # Python version
    print(f"Python version: {sys.version}")

    # PyTorch info
    try:
        import torch
        print(f"PyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device count: {torch.cuda.device_count()}")
            for i in range(torch.cuda.device_count()):
                print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
            print(f"CUDA version: {torch.version.cuda}")
    except Exception as e:
        print(f"PyTorch info error: {e}")

    # Transformers info
    try:
        from transformers import __version__ as tf_version
        print(f"Transformers version: {tf_version}")
    except Exception as e:
        print(f"Transformers info error: {e}")

    # Accelerate info
    try:
        from accelerate import __version__ as acc_version
        print(f"Accelerate version: {acc_version}")
    except Exception as e:
        print(f"Accelerate info error: {e}")

    # PEFT info
    try:
        from peft import __version__ as peft_version
        print(f"PEFT version: {peft_version}")
    except Exception as e:
        print(f"PEFT info error: {e}")

    # BitsAndBytes info
    try:
        import bitsandbytes as bnb
        print(f"BitsAndBytes version: {bnb.__version__}")
    except Exception as e:
        print(f"BitsAndBytes info error: {e}")

    # Environment variables
    print("\nRelevant environment variables:")
    for key, value in sorted(os.environ.items()):
        if any(prefix in key for prefix in ['CUDA', 'TORCH', 'HF_', 'ACCELERATE', 'TRANSFORMERS']):
            print(f"  {key}={value}")


def main():
    print("Progressive LLM Training - Model Loading Debug")
    print("=" * 60)

    # Print environment info first
    print_environment_info()

    # Clear environment variables
    clear_accelerate_env()

    # Test various loading methods
    print("\n" + "="*50)
    print("TESTING MODEL LOADING")
    print("="*50)

    results = []

    # Test 1: Basic loading
    results.append(("Basic loading", test_basic_model_loading()))

    # Test 2: With device map
    results.append(("Device map", test_with_device_map()))

    # Test 3: With quantization
    results.append(("Quantization", test_with_quantization()))

    # Summary
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)

    for test_name, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{test_name}: {status}")

    if any(result[1] for result in results):
        print("\n✅ At least one loading method works!")
        print("Use the successful method in your configuration.")
    else:
        print("\n❌ All loading methods failed!")
        print("This indicates a fundamental environment issue.")
        print("Consider:")
        print("1. Reinstalling transformers, accelerate, torch")
        print("2. Checking CUDA installation")
        print("3. Using a different model")


if __name__ == "__main__":
    main()

161  scripts/train_gemma3_1b_8gpu.sh  Executable file

@@ -0,0 +1,161 @@
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}"
echo "======================================================="

# Check if uv is available
if command -v uv &> /dev/null; then
    echo -e "${GREEN}Using uv for Python environment management${NC}"
    UV_PREFIX="uv run"
else
    echo -e "${YELLOW}uv not found, using standard python${NC}"
    UV_PREFIX="python"
fi

# Default values
STRATEGY="deepspeed"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --strategy)
            STRATEGY="$2"
            shift 2
            ;;
        --config)
            CONFIG="$2"
            shift 2
            ;;
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo ""
            echo "Options:"
            echo "  --strategy <ddp|fsdp|deepspeed>  Training strategy (default: deepspeed)"
            echo "  --config <path>                  Custom config file (optional)"
            echo "  --num-gpus <n>                   Number of GPUs to use (default: 8)"
            echo "  --dry-run                        Show command without executing"
            echo ""
            echo "Examples:"
            echo "  # Use DeepSpeed (recommended)"
            echo "  $0 --strategy deepspeed"
            echo ""
            echo "  # Use DDP"
            echo "  $0 --strategy ddp"
            echo ""
            echo "  # Use FSDP"
            echo "  $0 --strategy fsdp"
            echo ""
            echo "  # Use custom config"
            echo "  $0 --strategy ddp --config config/my_config.yaml"
            exit 0
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}"
            exit 1
            ;;
    esac
done

# Check GPU availability
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"

if [ $GPU_COUNT -lt $NUM_GPUS ]; then
    echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
    exit 1
fi

# Set default config based on strategy if not provided
if [ -z "$CONFIG" ]; then
    case $STRATEGY in
        ddp)
            CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
            ;;
        fsdp)
            CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
            ;;
        deepspeed)
            CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
            ;;
        *)
            echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
            exit 1
            ;;
    esac
fi

# Check if config file exists
if [ ! -f "$CONFIG" ]; then
    echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
    exit 1
fi

echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
echo -e "Config: ${YELLOW}$CONFIG${NC}"
echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
echo ""

# Build the command
CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"

if [ "$DRY_RUN" = true ]; then
    echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
    echo "$CMD"
    exit 0
fi

# Show GPU memory before training
echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv

echo ""
echo -e "${GREEN}Starting training...${NC}"
echo "Command: $CMD"
echo ""

# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_DEBUG=WARN  # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1

# For DeepSpeed, set additional optimizations
if [ "$STRATEGY" = "deepspeed" ]; then
    export DS_SKIP_CUDA_CHECK=1
    export TOKENIZERS_PARALLELISM=false
fi

# Execute the training command
$CMD

# Check exit status
if [ $? -eq 0 ]; then
    echo ""
    echo -e "${GREEN}Training completed successfully!${NC}"

    # Show GPU memory after training
    echo ""
    echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
    nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
    echo ""
    echo -e "${RED}Training failed!${NC}"
    exit 1
fi

224  scripts/train_multi_gpu.py  Executable file

@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Multi-GPU training launcher for progressive reasoning model
Supports DDP, FSDP, and DeepSpeed strategies
"""

import os
import sys
import argparse
import subprocess
import shutil
import torch
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))


def get_gpu_count():
    """Get the number of available GPUs"""
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    return 0


def setup_environment_for_strategy(strategy):
    """Set up environment variables for different strategies"""
    if strategy == "deepspeed":
        # DeepSpeed specific environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        os.environ["RANK"] = "0"
        os.environ["LOCAL_RANK"] = "0"
        os.environ["WORLD_SIZE"] = str(get_gpu_count())
    elif strategy in ["ddp", "fsdp"]:
        # Standard distributed training environment
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        # Let torchrun handle the rest

    # General optimizations
    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
    os.environ["NCCL_DEBUG"] = "INFO"
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"


def launch_ddp_training(config_path, num_gpus):
    """Launch DDP training using torchrun"""
    print(f"Launching DDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("ddp")

    # Use torchrun for DDP
    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--master_port", "12355",
    ] + python_cmd + [
        "--config", config_path,
        "--distributed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    return subprocess.run(cmd, cwd=Path(__file__).parent.parent)


def launch_fsdp_training(config_path, num_gpus):
    """Launch FSDP training using accelerate"""
    print(f"Launching FSDP training on {num_gpus} GPUs...")

    setup_environment_for_strategy("fsdp")

    # Create accelerate config for FSDP
    accelerate_config = f"""
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_transformer_layer_cls_to_wrap: Gemma2DecoderLayer
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: {num_gpus}
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""

    # Save config temporarily
    config_file = Path(__file__).parent.parent / "accelerate_config.yaml"
    with open(config_file, "w") as f:
        f.write(accelerate_config)

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "accelerate", "launch",
        "--config_file", str(config_file),
    ] + python_cmd + [
        "--config", config_path
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up config file
    config_file.unlink(missing_ok=True)

    return result


def launch_deepspeed_training(config_path, num_gpus):
    """Launch DeepSpeed training"""
    print(f"Launching DeepSpeed training on {num_gpus} GPUs...")

    setup_environment_for_strategy("deepspeed")

    # Create DeepSpeed hostfile
    hostfile = Path(__file__).parent.parent / "hostfile"
    with open(hostfile, "w") as f:
        f.write(f"localhost slots={num_gpus}\n")

    # Check if uv is available
    python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"]

    cmd = [
        "deepspeed",
        "--hostfile", str(hostfile),
        "--num_gpus", str(num_gpus),
    ] + python_cmd + [
        "--config", config_path,
        "--deepspeed"
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)

    # Clean up hostfile
    hostfile.unlink(missing_ok=True)

    return result


def main():
    parser = argparse.ArgumentParser(description="Multi-GPU Progressive LLM Training")
    parser.add_argument("--config", type=str, required=True,
                        help="Path to training configuration file")
    parser.add_argument("--strategy", type=str, default="ddp",
                        choices=["ddp", "fsdp", "deepspeed"],
                        help="Multi-GPU strategy to use")
    parser.add_argument("--num_gpus", type=int, default=None,
                        help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dry_run", action="store_true",
                        help="Print commands without executing")

    args = parser.parse_args()

    # Get GPU count
    available_gpus = get_gpu_count()
    if available_gpus == 0:
        print("❌ No GPUs available!")
        sys.exit(1)

    num_gpus = args.num_gpus or available_gpus
    if num_gpus > available_gpus:
        print(f"❌ Requested {num_gpus} GPUs but only {available_gpus} available")
        sys.exit(1)

    # Check config file exists
    if not Path(args.config).exists():
        print(f"❌ Config file not found: {args.config}")
        sys.exit(1)

    print("Progressive LLM Training - Multi-GPU Launcher")
    print("=" * 60)
    print(f"Strategy: {args.strategy}")
    print(f"GPUs: {num_gpus} / {available_gpus}")
    print(f"Config: {args.config}")
    print("=" * 60)

    if args.dry_run:
        print("DRY RUN - Commands that would be executed:")
        # Show what would be run
        if args.strategy == "ddp":
            print("torchrun --nproc_per_node", num_gpus, "scripts/train_progressive.py")
        elif args.strategy == "fsdp":
            print("accelerate launch --config_file accelerate_config.yaml scripts/train_progressive.py")
        elif args.strategy == "deepspeed":
            print("deepspeed --num_gpus", num_gpus, "scripts/train_progressive.py")
        return

    # Launch training
    if args.strategy == "ddp":
        result = launch_ddp_training(args.config, num_gpus)
    elif args.strategy == "fsdp":
        result = launch_fsdp_training(args.config, num_gpus)
    elif args.strategy == "deepspeed":
        result = launch_deepspeed_training(args.config, num_gpus)

    if result.returncode == 0:
        print("✅ Training completed successfully!")
    else:
        print("❌ Training failed!")
        sys.exit(result.returncode)


if __name__ == "__main__":
    main()

scripts/train_progressive.py

@@ -6,6 +6,7 @@ Main training script for progressive reasoning model
 import sys
 import yaml
 import argparse
+import os
 from pathlib import Path

 # Add src to path

@@ -56,6 +57,18 @@ Examples:
         help="Load config and model but skip training (for testing)"
     )

+    parser.add_argument(
+        "--distributed",
+        action="store_true",
+        help="Enable distributed training"
+    )
+
+    parser.add_argument(
+        "--deepspeed",
+        action="store_true",
+        help="Enable DeepSpeed training"
+    )
+
     return parser.parse_args()

@@ -74,9 +87,34 @@ def load_config(config_path: str) -> dict:
     return config


+def setup_distributed_training():
+    """Setup distributed training environment"""
+    # Check if we're in a distributed environment
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        import torch.distributed as dist
+        import torch
+
+        # Initialize distributed training
+        if not dist.is_initialized():
+            dist.init_process_group(backend="nccl")
+
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        torch.cuda.set_device(local_rank)
+
+        print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}")
+        return True
+
+    return False
+
+
 def main():
     args = parse_args()

+    # Setup distributed training if requested
+    is_distributed = False
+    if args.distributed or args.deepspeed:
+        is_distributed = setup_distributed_training()
+
     print("Progressive LLM Training for 松尾研LLMコンペ2025")
     print("=" * 50)

@@ -95,11 +133,26 @@
         print(f"Error loading config: {e}")
         sys.exit(1)

+    # Add distributed/deepspeed flags to config
+    config["training_args"] = config.get("training_args", {})
+    if args.distributed:
+        config["training_args"]["distributed"] = True
+    if args.deepspeed:
+        config["training_args"]["deepspeed"] = True
+        # Add DeepSpeed config from main config
+        if "deepspeed" in config:
+            config["training_args"]["deepspeed_config"] = config["deepspeed"]
+
     # Print configuration info
     print(f"Experiment: {config['experiment']['name']}")
     print(f"Base model: {config['experiment']['base_model']}")
     print(f"Output directory: {config['experiment']['output_dir']}")
     print(f"Stages: {len(config['progressive_stages'])}")
+    if is_distributed:
+        print("Mode: Distributed Training")
+        if args.deepspeed:
+            print("Backend: DeepSpeed")
+    print("=" * 50)

     # Prepare sample datasets if requested
     if args.prepare_data:

@@ -367,27 +367,55 @@ class ProgressiveTrainer:

         print(f"Final dataset size: {len(dataset)} examples")

-        # Training arguments - with CPU offload optimizations
-        training_args = TrainingArguments(
-            output_dir=f"./outputs/checkpoints/{stage_name}",
-            num_train_epochs=stage_config["training"]["num_epochs"],
-            per_device_train_batch_size=stage_config["training"]["per_device_batch_size"],
-            gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"],
-            learning_rate=float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
-            warmup_steps=stage_config["training"]["warmup_steps"],
-            logging_steps=stage_config["training"].get("logging_steps", 10),
-            save_strategy="epoch",
-            eval_strategy="no",
-            bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
-            gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False),
-            max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0),
-            report_to="wandb" if self.config["experiment"]["use_wandb"] else "none",
-            run_name=f"{self.config['experiment']['name']}_{stage_name}",
-            dataloader_pin_memory=False,  # Reduce memory usage
-            remove_unused_columns=False,  # Keep all columns
-            optim=stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
-            dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2),
-        )
+        # Training arguments - with multi-GPU and CPU offload optimizations
+        training_args_dict = {
+            "output_dir": f"./outputs/checkpoints/{stage_name}",
+            "num_train_epochs": stage_config["training"]["num_epochs"],
+            "per_device_train_batch_size": stage_config["training"]["per_device_batch_size"],
+            "gradient_accumulation_steps": stage_config["training"]["gradient_accumulation_steps"],
+            "learning_rate": float(stage_config["training"]["learning_rate"]),  # Ensure it's a float
+            "warmup_steps": stage_config["training"]["warmup_steps"],
+            "logging_steps": stage_config["training"].get("logging_steps", 10),
+            "save_strategy": "epoch",
+            "eval_strategy": "no",
+            "bf16": torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
+            "gradient_checkpointing": self.config["model"].get("gradient_checkpointing", False),
+            "max_grad_norm": stage_config["training"].get("max_grad_norm", 1.0),
+            "report_to": "wandb" if self.config["experiment"]["use_wandb"] else "none",
+            "run_name": f"{self.config['experiment']['name']}_{stage_name}",
+            "dataloader_pin_memory": stage_config["training"].get("dataloader_pin_memory", False),
+            "remove_unused_columns": False,  # Keep all columns
+            "optim": stage_config["training"].get("optim", "adamw_torch"),  # Support 8-bit optimizers
+            "dataloader_num_workers": stage_config["training"].get("dataloader_num_workers", 2),
+        }
+
+        # Add multi-GPU specific settings
+        if self.config.get("training_args", {}).get("distributed", False):
+            training_args_dict.update({
+                "ddp_find_unused_parameters": False,
+                "ddp_bucket_cap_mb": 200,
+                "ddp_broadcast_buffers": False,
+            })
+
+        # Add DeepSpeed configuration
+        if self.config.get("training_args", {}).get("deepspeed", False):
+            deepspeed_config = self.config.get("training_args", {}).get("deepspeed_config")
+            if deepspeed_config:
+                training_args_dict["deepspeed"] = deepspeed_config
+
+        # Add FSDP configuration
+        if "fsdp" in self.config:
+            fsdp_config = self.config["fsdp"]
+            training_args_dict.update({
+                "fsdp": fsdp_config.get("fsdp_sharding_strategy", "FULL_SHARD"),
+                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap"),
+                "fsdp_auto_wrap_policy": fsdp_config.get("fsdp_auto_wrap_policy", "TRANSFORMER_BASED_WRAP"),
+                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1000000),
+                "fsdp_cpu_ram_efficient_loading": fsdp_config.get("fsdp_cpu_offload", False),
+                "fsdp_sync_module_states": fsdp_config.get("fsdp_sync_module_states", True),
+            })
+
+        training_args = TrainingArguments(**training_args_dict)

         # Print dataset info for debugging
         print(f"Dataset columns: {dataset.column_names}")

7  uv.lock  generated  Normal file

@@ -0,0 +1,7 @@
# This file is automatically @generated by uv.
# It is not intended for manual editing.
version = 1
requires-python = ">=3.9"

# Note: This is a placeholder lock file.
# Run `uv lock` to generate the actual lock file with resolved dependencies.