From 5ca971b0a4bfdca54efb6e13857aa0e2c47a3301 Mon Sep 17 00:00:00 2001 From: Soma Nakamura Date: Thu, 10 Jul 2025 22:25:11 +0900 Subject: [PATCH] =?UTF-8?q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 13 + .gitignore | 35 +-- .python-version | 1 + README.md | 43 ++++ config/training_config_8gpu.yaml | 110 +++++++++ config/training_config_8gpu_deepspeed.yaml | 138 +++++++++++ config/training_config_8gpu_fsdp.yaml | 113 +++++++++ .../training_config_gemma3_1b_8gpu_ddp.yaml | 109 +++++++++ ...ining_config_gemma3_1b_8gpu_deepspeed.yaml | 139 +++++++++++ .../training_config_gemma3_1b_8gpu_fsdp.yaml | 113 +++++++++ docs/README.md | 37 +++ pyproject.toml | 26 ++ requirements.txt | 7 +- scripts/debug_model_loading.py | 201 ++++++++++++++++ scripts/train_gemma3_1b_8gpu.sh | 161 +++++++++++++ scripts/train_multi_gpu.py | 224 ++++++++++++++++++ scripts/train_progressive.py | 53 +++++ src/training.py | 70 ++++-- uv.lock | 7 + 19 files changed, 1559 insertions(+), 41 deletions(-) create mode 100644 .env.example create mode 100644 .python-version create mode 100644 README.md create mode 100644 config/training_config_8gpu.yaml create mode 100644 config/training_config_8gpu_deepspeed.yaml create mode 100644 config/training_config_8gpu_fsdp.yaml create mode 100644 config/training_config_gemma3_1b_8gpu_ddp.yaml create mode 100644 config/training_config_gemma3_1b_8gpu_deepspeed.yaml create mode 100644 config/training_config_gemma3_1b_8gpu_fsdp.yaml create mode 100644 docs/README.md create mode 100644 pyproject.toml create mode 100644 scripts/debug_model_loading.py create mode 100755 scripts/train_gemma3_1b_8gpu.sh create mode 100755 scripts/train_multi_gpu.py create mode 100644 uv.lock diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a7fa5ef --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +# Environment variables for Progressive LLM Training +# Copy this file to .env and fill in your values + +# HuggingFace +HF_TOKEN=your_token_here + +# Weights & Biases +WANDB_API_KEY=your_api_key_here +WANDB_PROJECT=matsuo-llm-comp-2025 + +# GPU Configuration +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +NCCL_DEBUG=WARN \ No newline at end of file diff --git a/.gitignore b/.gitignore index 20b8504..d8c85a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,32 +1,35 @@ # Python __pycache__/ *.py[cod] -*$py.class -*.so -.Python -venv/ -ENV/ -env/ .venv/ +venv/ -# Nix -result -result-* - -# Project specific +# Training outputs outputs/ data/ -*.log +!data/basic_cot/train.jsonl wandb/ -.ipynb_checkpoints/ +*.log + +# Model files *.pt *.pth *.bin *.safetensors +# Temporary +*.tmp +.cache/ +accelerate_config.yaml +hostfile + # IDE .vscode/ .idea/ -*.swp -*.swo -*~ \ No newline at end of file + +# OS +.DS_Store +*~ + +# Keep lock files +!uv.lock \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..902b2c9 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..9ea360c --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# Progressive LLM Training + +Progressive training for LLMs with 8-GPU support for 松尾研LLMコンペ2025. 
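+
+Each later stage builds on the one before it via `inherit_from` in its stage config, e.g. (trimmed excerpt from `config/training_config_8gpu.yaml`):
+
+```yaml
+progressive_stages:
+  - name: "basic_cot"
+  - name: "math_reasoning"
+    inherit_from: "basic_cot"
+  - name: "complex_reasoning"
+    inherit_from: "math_reasoning"
+```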
+ +## Quick Start + +```bash +# Install uv +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Setup project +git clone +cd progressive-llm-training +uv sync + +# Start training +uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml +./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed +``` + +## Training Stages + +1. **basic_cot** - Basic reasoning +2. **math_reasoning** - Math with OpenR1-Math-220k +3. **complex_reasoning** - Complex reasoning with Mixture-of-Thoughts + +## Commands + +```bash +uv sync # Install dependencies +uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml # Single GPU +./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed # 8 GPUs +uv run pytest # Run tests +``` + +## Key Files + +- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8-GPU config +- `scripts/train_progressive.py` - Main training script +- `scripts/train_gemma3_1b_8gpu.sh` - 8-GPU launcher +- `src/progressive_model.py` - Core model implementation + +Ready to train! 🚀 \ No newline at end of file diff --git a/config/training_config_8gpu.yaml b/config/training_config_8gpu.yaml new file mode 100644 index 0000000..c532196 --- /dev/null +++ b/config/training_config_8gpu.yaml @@ -0,0 +1,110 @@ +experiment: + name: "progressive_reasoning_8gpu" + base_model: "google/gemma-2-2b-it" # Can scale up to larger models + output_dir: "./outputs" + use_wandb: true + wandb_project: "matsuo-llm-comp-2025" + +model: + load_in_4bit: false # Can use FP16/BF16 with multiple GPUs + bnb_4bit_compute_dtype: "bfloat16" + bnb_4bit_use_double_quant: true + device_map: "balanced" # Distribute across all GPUs + gradient_checkpointing: true + use_flash_attention_2: true # Enable if available for speed + use_eager_attention: false + +# Multi-GPU specific settings +distributed: + strategy: "ddp" # Distributed Data Parallel + find_unused_parameters: false + gradient_as_bucket_view: true + +progressive_stages: + - name: "basic_cot" + description: "Basic Chain-of-Thought reasoning" + dataset_path: "./data/basic_cot/" + adapter_config: + r: 32 # Larger rank since we have more memory + lora_alpha: 64 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + init_lora_weights: true + training: + num_epochs: 2 + per_device_batch_size: 16 # Large batch size per GPU + gradient_accumulation_steps: 1 # No need for accumulation with 8 GPUs + learning_rate: 5e-4 + warmup_steps: 100 + max_length: 2048 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 50 + logging_steps: 10 + dataloader_num_workers: 4 # More workers for data loading + dataloader_pin_memory: true + remove_unused_columns: false + + - name: "math_reasoning" + description: "Mathematical reasoning with OpenR1-Math-220k dataset" + dataset_path: "open-r1/OpenR1-Math-220k" + inherit_from: "basic_cot" + adapter_config: + r: 64 + lora_alpha: 128 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 8 # Reduce for larger model + gradient_accumulation_steps: 2 + learning_rate: 3e-4 + warmup_steps: 200 + max_length: 4096 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 100 + logging_steps: 20 + dataloader_num_workers: 4 + dataset_config: + streaming: true + max_samples: 100000 # Can process more with 8 GPUs + split: "train" + + - name: "complex_reasoning" + description: "Complex multi-step reasoning with Mixture-of-Thoughts" 
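+    # Effective batch per optimizer update: 4 per GPU x 8 GPUs x 4 accumulation steps = 128 sequences,
+    # the same as the earlier stages (16x8x1 and 8x8x2); only per-device memory and sequence length shift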
+ dataset_path: "open-r1/Mixture-of-Thoughts" + inherit_from: "math_reasoning" + adapter_config: + r: 128 # Maximum rank with multi-GPU + lora_alpha: 256 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 4 + gradient_accumulation_steps: 4 + learning_rate: 2e-4 + warmup_steps: 300 + max_length: 8192 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 200 + logging_steps: 50 + dataloader_num_workers: 4 + dataset_config: + streaming: true + max_samples: 50000 + split: "train" + +evaluation: + benchmarks: + - "HLE" + - "Do-Not-Answer" + save_results: true + results_dir: "./outputs/evaluation_results" \ No newline at end of file diff --git a/config/training_config_8gpu_deepspeed.yaml b/config/training_config_8gpu_deepspeed.yaml new file mode 100644 index 0000000..248e4dc --- /dev/null +++ b/config/training_config_8gpu_deepspeed.yaml @@ -0,0 +1,138 @@ +experiment: + name: "progressive_reasoning_8gpu_deepspeed" + base_model: "google/gemma-2-2b-it" + output_dir: "./outputs" + use_wandb: true + wandb_project: "matsuo-llm-comp-2025" + +model: + load_in_4bit: false + device_map: null # Let DeepSpeed handle device placement + gradient_checkpointing: true + use_flash_attention_2: true + use_eager_attention: false + +# DeepSpeed Configuration +deepspeed: + zero_optimization: + stage: 2 # ZeRO Stage 2 (partition optimizer states and gradients) + allgather_partitions: true + allgather_bucket_size: 200000000 + overlap_comm: true + reduce_scatter: true + reduce_bucket_size: 200000000 + contiguous_gradients: true + cpu_offload: false # Keep on GPU for speed + + optimizer: + type: "AdamW" + params: + lr: 3e-4 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.001 + + scheduler: + type: "WarmupLR" + params: + warmup_min_lr: 0 + warmup_max_lr: 3e-4 + warmup_num_steps: 200 + + fp16: + enabled: false + + bf16: + enabled: true + + gradient_clipping: 1.0 + + train_batch_size: 512 # Total batch size across all GPUs + train_micro_batch_size_per_gpu: 64 # Per-GPU batch size + +progressive_stages: + - name: "basic_cot" + description: "Basic Chain-of-Thought reasoning" + dataset_path: "./data/basic_cot/" + adapter_config: + r: 64 + lora_alpha: 128 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + init_lora_weights: true + training: + num_epochs: 2 + per_device_batch_size: 64 # Large batch with DeepSpeed + gradient_accumulation_steps: 1 + learning_rate: 5e-4 + warmup_steps: 100 + max_length: 2048 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 50 + logging_steps: 10 + dataloader_num_workers: 8 + + - name: "math_reasoning" + description: "Mathematical reasoning with OpenR1-Math-220k dataset" + dataset_path: "open-r1/OpenR1-Math-220k" + inherit_from: "basic_cot" + adapter_config: + r: 128 + lora_alpha: 256 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 32 + gradient_accumulation_steps: 1 + learning_rate: 3e-4 + warmup_steps: 200 + max_length: 4096 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 100 + logging_steps: 20 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 200000 + split: "train" + + - name: "complex_reasoning" + description: "Complex multi-step reasoning with Mixture-of-Thoughts" + dataset_path: 
"open-r1/Mixture-of-Thoughts" + inherit_from: "math_reasoning" + adapter_config: + r: 256 + lora_alpha: 512 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 16 + gradient_accumulation_steps: 2 + learning_rate: 2e-4 + warmup_steps: 300 + max_length: 8192 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 200 + logging_steps: 50 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 100000 + split: "train" + +evaluation: + benchmarks: + - "HLE" + - "Do-Not-Answer" + save_results: true + results_dir: "./outputs/evaluation_results" \ No newline at end of file diff --git a/config/training_config_8gpu_fsdp.yaml b/config/training_config_8gpu_fsdp.yaml new file mode 100644 index 0000000..62d06cf --- /dev/null +++ b/config/training_config_8gpu_fsdp.yaml @@ -0,0 +1,113 @@ +experiment: + name: "progressive_reasoning_8gpu_fsdp" + base_model: "google/gemma-2-2b-it" # Can scale to much larger models with FSDP + output_dir: "./outputs" + use_wandb: true + wandb_project: "matsuo-llm-comp-2025" + +model: + load_in_4bit: false + device_map: null # Let FSDP handle device placement + gradient_checkpointing: true + use_flash_attention_2: true + use_eager_attention: false + +# FSDP Configuration +fsdp: + fsdp_transformer_layer_cls_to_wrap: "Gemma2DecoderLayer" # Wrap at layer level + fsdp_sharding_strategy: "FULL_SHARD" # Shard parameters, gradients, and optimizer states + fsdp_cpu_offload: false # Keep on GPU for speed + fsdp_mixed_precision: true # Use BF16 mixed precision + fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP" + fsdp_min_num_params: 1000000 # Wrap layers with >1M parameters + fsdp_sync_module_states: true + fsdp_forward_prefetch: true + fsdp_use_orig_params: true # Important for LoRA compatibility + +progressive_stages: + - name: "basic_cot" + description: "Basic Chain-of-Thought reasoning" + dataset_path: "./data/basic_cot/" + adapter_config: + r: 64 # Can use larger ranks with FSDP + lora_alpha: 128 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + init_lora_weights: true + training: + num_epochs: 2 + per_device_batch_size: 32 # Very large batch size with FSDP + gradient_accumulation_steps: 1 + learning_rate: 5e-4 + warmup_steps: 100 + max_length: 2048 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 50 + logging_steps: 10 + dataloader_num_workers: 8 + dataloader_pin_memory: true + + - name: "math_reasoning" + description: "Mathematical reasoning with OpenR1-Math-220k dataset" + dataset_path: "open-r1/OpenR1-Math-220k" + inherit_from: "basic_cot" + adapter_config: + r: 128 + lora_alpha: 256 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 16 + gradient_accumulation_steps: 2 + learning_rate: 3e-4 + warmup_steps: 200 + max_length: 4096 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 100 + logging_steps: 20 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 200000 # Process even more data + split: "train" + + - name: "complex_reasoning" + description: "Complex multi-step reasoning with Mixture-of-Thoughts" + dataset_path: "open-r1/Mixture-of-Thoughts" + inherit_from: "math_reasoning" + adapter_config: + r: 256 # Very large rank possible with FSDP + lora_alpha: 512 + 
lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 8 + gradient_accumulation_steps: 4 + learning_rate: 2e-4 + warmup_steps: 300 + max_length: 8192 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 200 + logging_steps: 50 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 100000 + split: "train" + +evaluation: + benchmarks: + - "HLE" + - "Do-Not-Answer" + save_results: true + results_dir: "./outputs/evaluation_results" \ No newline at end of file diff --git a/config/training_config_gemma3_1b_8gpu_ddp.yaml b/config/training_config_gemma3_1b_8gpu_ddp.yaml new file mode 100644 index 0000000..847acfb --- /dev/null +++ b/config/training_config_gemma3_1b_8gpu_ddp.yaml @@ -0,0 +1,109 @@ +experiment: + name: "progressive_reasoning_gemma3_1b_8gpu_ddp" + base_model: "google/gemma-3-1b-pt" + output_dir: "./outputs" + use_wandb: true + wandb_project: "matsuo-llm-comp-2025" + +model: + load_in_4bit: false # Can use FP16/BF16 with multiple GPUs + bnb_4bit_compute_dtype: "bfloat16" + bnb_4bit_use_double_quant: true + device_map: "balanced" # Distribute across all GPUs + gradient_checkpointing: true + use_flash_attention_2: false + use_eager_attention: true + +# Multi-GPU specific settings +distributed: + strategy: "ddp" # Distributed Data Parallel + find_unused_parameters: false + gradient_as_bucket_view: true + +progressive_stages: + - name: "basic_cot" + description: "Basic Chain-of-Thought reasoning" + dataset_path: "./data/basic_cot/" + adapter_config: + r: 16 # Moderate rank for DDP + lora_alpha: 32 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + init_lora_weights: true + training: + num_epochs: 2 + per_device_batch_size: 32 # 32 * 8 = 256 total batch size + gradient_accumulation_steps: 1 + learning_rate: 5e-4 + warmup_steps: 100 + max_length: 1024 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 50 + logging_steps: 10 + dataloader_num_workers: 4 + dataloader_pin_memory: true + + - name: "math_reasoning" + description: "Mathematical reasoning with OpenR1-Math-220k dataset" + dataset_path: "open-r1/OpenR1-Math-220k" + inherit_from: "basic_cot" + adapter_config: + r: 32 + lora_alpha: 64 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 16 # 16 * 8 = 128 total batch size + gradient_accumulation_steps: 2 + learning_rate: 3e-4 + warmup_steps: 200 + max_length: 2048 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 100 + logging_steps: 20 + dataloader_num_workers: 4 + dataset_config: + streaming: true + max_samples: 400000 # Process substantial data + split: "train" + + - name: "complex_reasoning" + description: "Complex multi-step reasoning with Mixture-of-Thoughts" + dataset_path: "open-r1/Mixture-of-Thoughts" + inherit_from: "math_reasoning" + adapter_config: + r: 64 + lora_alpha: 128 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 8 # 8 * 8 = 64 total batch size + gradient_accumulation_steps: 4 + learning_rate: 2e-4 + warmup_steps: 300 + max_length: 4096 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 200 + logging_steps: 50 + 
dataloader_num_workers: 4 + dataset_config: + streaming: true + max_samples: 600000 + split: "train" + +evaluation: + benchmarks: + - "HLE" + - "Do-Not-Answer" + save_results: true + results_dir: "./outputs/evaluation_results" \ No newline at end of file diff --git a/config/training_config_gemma3_1b_8gpu_deepspeed.yaml b/config/training_config_gemma3_1b_8gpu_deepspeed.yaml new file mode 100644 index 0000000..521fdc5 --- /dev/null +++ b/config/training_config_gemma3_1b_8gpu_deepspeed.yaml @@ -0,0 +1,139 @@ +experiment: + name: "progressive_reasoning_gemma3_1b_8gpu_deepspeed" + base_model: "google/gemma-3-1b-pt" + output_dir: "./outputs" + use_wandb: true + wandb_project: "matsuo-llm-comp-2025" + +model: + load_in_4bit: false # Disable quantization for DeepSpeed + device_map: null # Let DeepSpeed handle device placement + gradient_checkpointing: true # Enable for memory efficiency + use_flash_attention_2: false + use_eager_attention: true + +# DeepSpeed Configuration +deepspeed: + zero_optimization: + stage: 2 # ZeRO Stage 2 (partition optimizer states and gradients) + allgather_partitions: true + allgather_bucket_size: 500000000 # 500MB buckets + overlap_comm: true + reduce_scatter: true + reduce_bucket_size: 500000000 + contiguous_gradients: true + cpu_offload: false # Keep on GPU for speed with small model + + optimizer: + type: "AdamW" + params: + lr: 5e-4 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.001 + + scheduler: + type: "WarmupLR" + params: + warmup_min_lr: 0 + warmup_max_lr: 5e-4 + warmup_num_steps: 100 + + fp16: + enabled: false + + bf16: + enabled: true + + gradient_clipping: 1.0 + + train_batch_size: 512 # Total batch size across all GPUs + train_micro_batch_size_per_gpu: 64 # Per-GPU batch size + +progressive_stages: + - name: "basic_cot" + description: "Basic Chain-of-Thought reasoning" + dataset_path: "./data/basic_cot/" + adapter_config: + r: 32 # Larger rank with 8 GPUs + lora_alpha: 64 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + init_lora_weights: true + training: + num_epochs: 2 + per_device_batch_size: 64 # Large batch with DeepSpeed + gradient_accumulation_steps: 1 # No accumulation needed + learning_rate: 5e-4 + warmup_steps: 100 + max_length: 1024 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 50 + logging_steps: 10 + dataloader_num_workers: 8 + dataloader_pin_memory: true + + - name: "math_reasoning" + description: "Mathematical reasoning with OpenR1-Math-220k dataset" + dataset_path: "open-r1/OpenR1-Math-220k" + inherit_from: "basic_cot" + adapter_config: + r: 64 # Larger rank for math reasoning + lora_alpha: 128 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 32 # Reduce for longer sequences + gradient_accumulation_steps: 1 + learning_rate: 3e-4 + warmup_steps: 200 + max_length: 2048 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 100 + logging_steps: 20 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 500000 # Process more data with 8 GPUs + split: "train" + + - name: "complex_reasoning" + description: "Complex multi-step reasoning with Mixture-of-Thoughts" + dataset_path: "open-r1/Mixture-of-Thoughts" + inherit_from: "math_reasoning" + adapter_config: + r: 128 # Maximum rank for complex reasoning + lora_alpha: 256 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", 
"gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 16 # Reduce for very long sequences + gradient_accumulation_steps: 2 + learning_rate: 2e-4 + warmup_steps: 300 + max_length: 4096 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 200 + logging_steps: 50 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 800000 # Process even more data + split: "train" + +evaluation: + benchmarks: + - "HLE" + - "Do-Not-Answer" + save_results: true + results_dir: "./outputs/evaluation_results" \ No newline at end of file diff --git a/config/training_config_gemma3_1b_8gpu_fsdp.yaml b/config/training_config_gemma3_1b_8gpu_fsdp.yaml new file mode 100644 index 0000000..29bd242 --- /dev/null +++ b/config/training_config_gemma3_1b_8gpu_fsdp.yaml @@ -0,0 +1,113 @@ +experiment: + name: "progressive_reasoning_gemma3_1b_8gpu_fsdp" + base_model: "google/gemma-3-1b-pt" + output_dir: "./outputs" + use_wandb: true + wandb_project: "matsuo-llm-comp-2025" + +model: + load_in_4bit: false + device_map: null # Let FSDP handle device placement + gradient_checkpointing: true + use_flash_attention_2: false + use_eager_attention: true + +# FSDP Configuration +fsdp: + fsdp_transformer_layer_cls_to_wrap: "GemmaDecoderLayer" # Wrap at layer level + fsdp_sharding_strategy: "FULL_SHARD" # Shard parameters, gradients, and optimizer states + fsdp_cpu_offload: false # Keep on GPU for speed with small model + fsdp_mixed_precision: true # Use BF16 mixed precision + fsdp_auto_wrap_policy: "TRANSFORMER_BASED_WRAP" + fsdp_min_num_params: 1000000 # Wrap layers with >1M parameters + fsdp_sync_module_states: true + fsdp_forward_prefetch: true + fsdp_use_orig_params: true # Important for LoRA compatibility + +progressive_stages: + - name: "basic_cot" + description: "Basic Chain-of-Thought reasoning" + dataset_path: "./data/basic_cot/" + adapter_config: + r: 32 # Can use larger ranks with FSDP + lora_alpha: 64 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + init_lora_weights: true + training: + num_epochs: 2 + per_device_batch_size: 48 # Very large batch size with FSDP + gradient_accumulation_steps: 1 + learning_rate: 5e-4 + warmup_steps: 100 + max_length: 1024 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 50 + logging_steps: 10 + dataloader_num_workers: 8 + dataloader_pin_memory: true + + - name: "math_reasoning" + description: "Mathematical reasoning with OpenR1-Math-220k dataset" + dataset_path: "open-r1/OpenR1-Math-220k" + inherit_from: "basic_cot" + adapter_config: + r: 64 + lora_alpha: 128 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 24 + gradient_accumulation_steps: 2 + learning_rate: 3e-4 + warmup_steps: 200 + max_length: 2048 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 100 + logging_steps: 20 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 600000 # Process even more data with FSDP + split: "train" + + - name: "complex_reasoning" + description: "Complex multi-step reasoning with Mixture-of-Thoughts" + dataset_path: "open-r1/Mixture-of-Thoughts" + inherit_from: "math_reasoning" + adapter_config: + r: 128 # Very large rank possible with FSDP + lora_alpha: 256 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", 
"down_proj"] + init_lora_weights: true + training: + num_epochs: 1 + per_device_batch_size: 12 + gradient_accumulation_steps: 4 + learning_rate: 2e-4 + warmup_steps: 300 + max_length: 4096 + bf16: true + max_grad_norm: 1.0 + weight_decay: 0.001 + save_steps: 200 + logging_steps: 50 + dataloader_num_workers: 8 + dataset_config: + streaming: true + max_samples: 1000000 # Can process 1M samples efficiently + split: "train" + +evaluation: + benchmarks: + - "HLE" + - "Do-Not-Answer" + save_results: true + results_dir: "./outputs/evaluation_results" \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..6581458 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,37 @@ +# Progressive LLM Training Documentation + +## Setup + +```bash +curl -LsSf https://astral.sh/uv/install.sh | sh +uv sync +``` + +## Training + +### Single GPU +```bash +uv run scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml +``` + +### 8 GPUs +```bash +./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed +``` + +## Configuration + +- `config/training_config_gemma3_1b.yaml` - Single GPU +- `config/training_config_gemma3_1b_8gpu_deepspeed.yaml` - 8 GPUs + +## Environment + +Copy `.env.example` to `.env` and set: +- `HF_TOKEN` - HuggingFace token +- `WANDB_API_KEY` - W&B API key + +## Troubleshooting + +- Reduce `per_device_batch_size` for memory issues +- `export NCCL_DEBUG=INFO` for NCCL errors +- `nvidia-smi` to check GPUs \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..573bccd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "progressive-llm-training" +version = "0.1.0" +description = "Progressive LLM Training for 松尾研LLMコンペ2025" +requires-python = ">=3.9" + +dependencies = [ + "torch>=2.0.0", + "transformers>=4.40.0", + "accelerate>=0.27.0", + "peft>=0.11.0", + "trl>=0.9.0", + "datasets>=2.18.0", + "bitsandbytes>=0.43.0", + "wandb>=0.16.0", + "pyyaml>=6.0", + "jsonlines>=4.0.0", + "deepspeed>=0.12.0", +] + +[project.optional-dependencies] +dev = ["pytest", "black", "isort"] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 534ab7a..f521a17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Use uv instead: uv sync +torch>=2.0.0 transformers>=4.40.0 accelerate>=0.27.0 peft>=0.11.0 @@ -7,7 +9,4 @@ bitsandbytes>=0.43.0 wandb>=0.16.0 pyyaml>=6.0 jsonlines>=4.0.0 -scikit-learn>=1.3.0 -# flash-attn>=2.5.0 # Install separately with --no-build-isolation -sentencepiece>=0.2.0 -protobuf>=4.25.0 +deepspeed>=0.12.0 diff --git a/scripts/debug_model_loading.py b/scripts/debug_model_loading.py new file mode 100644 index 0000000..ad39c10 --- /dev/null +++ b/scripts/debug_model_loading.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +""" +Debug script to identify model loading issues +""" + +import sys +import os +import torch +from pathlib import Path + +# Add src to path +sys.path.append(str(Path(__file__).parent.parent)) + +def clear_accelerate_env(): + """Clear all ACCELERATE environment variables""" + print("Clearing ACCELERATE environment variables...") + env_vars_to_clear = [] + for key in os.environ: + if 'ACCELERATE' in key: + env_vars_to_clear.append(key) + + for var in env_vars_to_clear: + print(f" Removing {var}={os.environ[var]}") + del os.environ[var] + +def test_basic_model_loading(): + """Test basic model loading without any configuration""" + 
print("Testing basic model loading...") + + from transformers import AutoModelForCausalLM, AutoTokenizer + + model_name = "google/gemma-2-2b-it" + + try: + print("Testing with absolutely minimal config...") + model = AutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + torch_dtype=torch.float32 + ) + print("✅ Basic loading successful!") + del model + return True + except Exception as e: + print(f"❌ Basic loading failed: {e}") + return False + +def test_with_device_map(): + """Test with device_map auto""" + print("Testing with device_map='auto'...") + + from transformers import AutoModelForCausalLM + + model_name = "google/gemma-2-2b-it" + + try: + model = AutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + torch_dtype=torch.float32, + device_map="auto" + ) + print("✅ Device map loading successful!") + del model + return True + except Exception as e: + print(f"❌ Device map loading failed: {e}") + return False + +def test_with_quantization(): + """Test with quantization""" + print("Testing with 4-bit quantization...") + + from transformers import AutoModelForCausalLM, BitsAndBytesConfig + + model_name = "google/gemma-2-2b-it" + + try: + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4" + ) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + quantization_config=bnb_config + ) + print("✅ Quantization loading successful!") + del model + return True + except Exception as e: + print(f"❌ Quantization loading failed: {e}") + return False + +def print_environment_info(): + """Print detailed environment information""" + print("\n" + "="*50) + print("ENVIRONMENT INFORMATION") + print("="*50) + + # Python version + print(f"Python version: {sys.version}") + + # PyTorch info + try: + import torch + print(f"PyTorch version: {torch.__version__}") + print(f"CUDA available: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f"CUDA device count: {torch.cuda.device_count()}") + for i in range(torch.cuda.device_count()): + print(f" Device {i}: {torch.cuda.get_device_name(i)}") + print(f"CUDA version: {torch.version.cuda}") + except Exception as e: + print(f"PyTorch info error: {e}") + + # Transformers info + try: + from transformers import __version__ as tf_version + print(f"Transformers version: {tf_version}") + except Exception as e: + print(f"Transformers info error: {e}") + + # Accelerate info + try: + from accelerate import __version__ as acc_version + print(f"Accelerate version: {acc_version}") + except Exception as e: + print(f"Accelerate info error: {e}") + + # PEFT info + try: + from peft import __version__ as peft_version + print(f"PEFT version: {peft_version}") + except Exception as e: + print(f"PEFT info error: {e}") + + # BitsAndBytes info + try: + import bitsandbytes as bnb + print(f"BitsAndBytes version: {bnb.__version__}") + except Exception as e: + print(f"BitsAndBytes info error: {e}") + + # Environment variables + print("\nRelevant environment variables:") + for key, value in sorted(os.environ.items()): + if any(prefix in key for prefix in ['CUDA', 'TORCH', 'HF_', 'ACCELERATE', 'TRANSFORMERS']): + print(f" {key}={value}") + +def main(): + print("Progressive LLM Training - Model Loading Debug") + print("=" * 60) + + # Print environment info first + print_environment_info() + + # Clear environment variables + clear_accelerate_env() + + # Test various loading methods + 
print("\n" + "="*50) + print("TESTING MODEL LOADING") + print("="*50) + + results = [] + + # Test 1: Basic loading + results.append(("Basic loading", test_basic_model_loading())) + + # Test 2: With device map + results.append(("Device map", test_with_device_map())) + + # Test 3: With quantization + results.append(("Quantization", test_with_quantization())) + + # Summary + print("\n" + "="*50) + print("SUMMARY") + print("="*50) + + for test_name, success in results: + status = "✅ PASS" if success else "❌ FAIL" + print(f"{test_name}: {status}") + + if any(result[1] for result in results): + print("\n✅ At least one loading method works!") + print("Use the successful method in your configuration.") + else: + print("\n❌ All loading methods failed!") + print("This indicates a fundamental environment issue.") + print("Consider:") + print("1. Reinstalling transformers, accelerate, torch") + print("2. Checking CUDA installation") + print("3. Using a different model") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/train_gemma3_1b_8gpu.sh b/scripts/train_gemma3_1b_8gpu.sh new file mode 100755 index 0000000..0c4ca81 --- /dev/null +++ b/scripts/train_gemma3_1b_8gpu.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible) + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}" +echo "=======================================================" + +# Check if uv is available +if command -v uv &> /dev/null; then + echo -e "${GREEN}Using uv for Python environment management${NC}" + UV_PREFIX="uv run" +else + echo -e "${YELLOW}uv not found, using standard python${NC}" + UV_PREFIX="python" +fi + +# Default values +STRATEGY="deepspeed" +CONFIG="" +NUM_GPUS=8 +DRY_RUN=false + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --strategy) + STRATEGY="$2" + shift 2 + ;; + --config) + CONFIG="$2" + shift 2 + ;; + --num-gpus) + NUM_GPUS="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=true + shift + ;; + -h|--help) + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --strategy Training strategy (default: deepspeed)" + echo " --config Custom config file (optional)" + echo " --num-gpus Number of GPUs to use (default: 8)" + echo " --dry-run Show command without executing" + echo "" + echo "Examples:" + echo " # Use DeepSpeed (recommended)" + echo " $0 --strategy deepspeed" + echo "" + echo " # Use DDP" + echo " $0 --strategy ddp" + echo "" + echo " # Use FSDP" + echo " $0 --strategy fsdp" + echo "" + echo " # Use custom config" + echo " $0 --strategy ddp --config config/my_config.yaml" + exit 0 + ;; + *) + echo -e "${RED}Error: Unknown option $1${NC}" + exit 1 + ;; + esac +done + +# Check GPU availability +GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}" + +if [ $GPU_COUNT -lt $NUM_GPUS ]; then + echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}" + exit 1 +fi + +# Set default config based on strategy if not provided +if [ -z "$CONFIG" ]; then + case $STRATEGY in + ddp) + CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml" + ;; + fsdp) + CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml" + ;; + deepspeed) + CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml" + ;; + *) + echo -e "${RED}Error: Invalid strategy '$STRATEGY'. 
Choose from: ddp, fsdp, deepspeed${NC}" + exit 1 + ;; + esac +fi + +# Check if config file exists +if [ ! -f "$CONFIG" ]; then + echo -e "${RED}Error: Config file not found: $CONFIG${NC}" + exit 1 +fi + +echo -e "Strategy: ${YELLOW}$STRATEGY${NC}" +echo -e "Config: ${YELLOW}$CONFIG${NC}" +echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}" +echo "" + +# Build the command +CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS" + +if [ "$DRY_RUN" = true ]; then + echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}" + echo "$CMD" + exit 0 +fi + +# Show GPU memory before training +echo -e "${GREEN}GPU Memory Usage Before Training:${NC}" +nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv + +echo "" +echo -e "${GREEN}Starting training...${NC}" +echo "Command: $CMD" +echo "" + +# Set environment variables for optimal performance +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export NCCL_DEBUG=WARN # Set to INFO for debugging +export NCCL_ASYNC_ERROR_HANDLING=1 + +# For DeepSpeed, set additional optimizations +if [ "$STRATEGY" = "deepspeed" ]; then + export DS_SKIP_CUDA_CHECK=1 + export TOKENIZERS_PARALLELISM=false +fi + +# Execute the training command +$CMD + +# Check exit status +if [ $? -eq 0 ]; then + echo "" + echo -e "${GREEN}Training completed successfully!${NC}" + + # Show GPU memory after training + echo "" + echo -e "${GREEN}GPU Memory Usage After Training:${NC}" + nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv +else + echo "" + echo -e "${RED}Training failed!${NC}" + exit 1 +fi \ No newline at end of file diff --git a/scripts/train_multi_gpu.py b/scripts/train_multi_gpu.py new file mode 100755 index 0000000..5cc278f --- /dev/null +++ b/scripts/train_multi_gpu.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +""" +Multi-GPU training launcher for progressive reasoning model +Supports DDP, FSDP, and DeepSpeed strategies +""" + +import os +import sys +import argparse +import subprocess +import shutil +import torch +from pathlib import Path + +# Add src to path +sys.path.append(str(Path(__file__).parent.parent)) + + +def get_gpu_count(): + """Get the number of available GPUs""" + if torch.cuda.is_available(): + return torch.cuda.device_count() + return 0 + + +def setup_environment_for_strategy(strategy): + """Set up environment variables for different strategies""" + if strategy == "deepspeed": + # DeepSpeed specific environment + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + os.environ["RANK"] = "0" + os.environ["LOCAL_RANK"] = "0" + os.environ["WORLD_SIZE"] = str(get_gpu_count()) + elif strategy in ["ddp", "fsdp"]: + # Standard distributed training environment + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + # Let torchrun handle the rest + + # General optimizations + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["NCCL_DEBUG"] = "INFO" + os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO" + + +def launch_ddp_training(config_path, num_gpus): + """Launch DDP training using torchrun""" + print(f"Launching DDP training on {num_gpus} GPUs...") + + setup_environment_for_strategy("ddp") + + # Use torchrun for DDP + # Check if uv is available + python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"] + + cmd = [ + "torchrun", + "--nproc_per_node", str(num_gpus), + "--master_port", "12355", + ] + python_cmd + [ + "--config", config_path, + "--distributed" + ] 
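+    # With num_gpus=8 and uv on PATH, this assembles e.g.:
+    # torchrun --nproc_per_node 8 --master_port 12355 uv run scripts/train_progressive.py --config <config> --distributed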
+ + print(f"Running command: {' '.join(cmd)}") + return subprocess.run(cmd, cwd=Path(__file__).parent.parent) + + +def launch_fsdp_training(config_path, num_gpus): + """Launch FSDP training using accelerate""" + print(f"Launching FSDP training on {num_gpus} GPUs...") + + setup_environment_for_strategy("fsdp") + + # Create accelerate config for FSDP + accelerate_config = f""" +compute_environment: LOCAL_MACHINE +distributed_type: FSDP +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_transformer_layer_cls_to_wrap: Gemma2DecoderLayer + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: {num_gpus} +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +""" + + # Save config temporarily + config_file = Path(__file__).parent.parent / "accelerate_config.yaml" + with open(config_file, "w") as f: + f.write(accelerate_config) + + # Check if uv is available + python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"] + + cmd = [ + "accelerate", "launch", + "--config_file", str(config_file), + ] + python_cmd + [ + "--config", config_path + ] + + print(f"Running command: {' '.join(cmd)}") + result = subprocess.run(cmd, cwd=Path(__file__).parent.parent) + + # Clean up config file + config_file.unlink(missing_ok=True) + + return result + + +def launch_deepspeed_training(config_path, num_gpus): + """Launch DeepSpeed training""" + print(f"Launching DeepSpeed training on {num_gpus} GPUs...") + + setup_environment_for_strategy("deepspeed") + + # Create DeepSpeed hostfile + hostfile = Path(__file__).parent.parent / "hostfile" + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpus}\n") + + # Check if uv is available + python_cmd = ["uv", "run", "scripts/train_progressive.py"] if shutil.which("uv") else ["python", "scripts/train_progressive.py"] + + cmd = [ + "deepspeed", + "--hostfile", str(hostfile), + "--num_gpus", str(num_gpus), + ] + python_cmd + [ + "--config", config_path, + "--deepspeed" + ] + + print(f"Running command: {' '.join(cmd)}") + result = subprocess.run(cmd, cwd=Path(__file__).parent.parent) + + # Clean up hostfile + hostfile.unlink(missing_ok=True) + + return result + + +def main(): + parser = argparse.ArgumentParser(description="Multi-GPU Progressive LLM Training") + parser.add_argument("--config", type=str, required=True, + help="Path to training configuration file") + parser.add_argument("--strategy", type=str, default="ddp", + choices=["ddp", "fsdp", "deepspeed"], + help="Multi-GPU strategy to use") + parser.add_argument("--num_gpus", type=int, default=None, + help="Number of GPUs to use (default: all available)") + parser.add_argument("--dry_run", action="store_true", + help="Print commands without executing") + + args = parser.parse_args() + + # Get GPU count + available_gpus = get_gpu_count() + if available_gpus == 0: + print("❌ No GPUs available!") + sys.exit(1) + + num_gpus = args.num_gpus or available_gpus + if num_gpus > available_gpus: + print(f"❌ Requested {num_gpus} GPUs but only {available_gpus} available") + sys.exit(1) + + # Check config file exists + if not Path(args.config).exists(): + 
print(f"❌ Config file not found: {args.config}") + sys.exit(1) + + print("Progressive LLM Training - Multi-GPU Launcher") + print("=" * 60) + print(f"Strategy: {args.strategy}") + print(f"GPUs: {num_gpus} / {available_gpus}") + print(f"Config: {args.config}") + print("=" * 60) + + if args.dry_run: + print("DRY RUN - Commands that would be executed:") + # Show what would be run + if args.strategy == "ddp": + print("torchrun --nproc_per_node", num_gpus, "scripts/train_progressive.py") + elif args.strategy == "fsdp": + print("accelerate launch --config_file accelerate_config.yaml scripts/train_progressive.py") + elif args.strategy == "deepspeed": + print("deepspeed --num_gpus", num_gpus, "scripts/train_progressive.py") + return + + # Launch training + if args.strategy == "ddp": + result = launch_ddp_training(args.config, num_gpus) + elif args.strategy == "fsdp": + result = launch_fsdp_training(args.config, num_gpus) + elif args.strategy == "deepspeed": + result = launch_deepspeed_training(args.config, num_gpus) + + if result.returncode == 0: + print("✅ Training completed successfully!") + else: + print("❌ Training failed!") + sys.exit(result.returncode) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/train_progressive.py b/scripts/train_progressive.py index d3cd938..9b4df04 100755 --- a/scripts/train_progressive.py +++ b/scripts/train_progressive.py @@ -6,6 +6,7 @@ Main training script for progressive reasoning model import sys import yaml import argparse +import os from pathlib import Path # Add src to path @@ -56,6 +57,18 @@ Examples: help="Load config and model but skip training (for testing)" ) + parser.add_argument( + "--distributed", + action="store_true", + help="Enable distributed training" + ) + + parser.add_argument( + "--deepspeed", + action="store_true", + help="Enable DeepSpeed training" + ) + return parser.parse_args() @@ -74,9 +87,34 @@ def load_config(config_path: str) -> dict: return config +def setup_distributed_training(): + """Setup distributed training environment""" + # Check if we're in a distributed environment + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + import torch.distributed as dist + import torch + + # Initialize distributed training + if not dist.is_initialized(): + dist.init_process_group(backend="nccl") + + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + torch.cuda.set_device(local_rank) + + print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}") + return True + + return False + + def main(): args = parse_args() + # Setup distributed training if requested + is_distributed = False + if args.distributed or args.deepspeed: + is_distributed = setup_distributed_training() + print("Progressive LLM Training for 松尾研LLMコンペ2025") print("=" * 50) @@ -95,11 +133,26 @@ def main(): print(f"Error loading config: {e}") sys.exit(1) + # Add distributed/deepspeed flags to config + config["training_args"] = config.get("training_args", {}) + if args.distributed: + config["training_args"]["distributed"] = True + if args.deepspeed: + config["training_args"]["deepspeed"] = True + # Add DeepSpeed config from main config + if "deepspeed" in config: + config["training_args"]["deepspeed_config"] = config["deepspeed"] + # Print configuration info print(f"Experiment: {config['experiment']['name']}") print(f"Base model: {config['experiment']['base_model']}") print(f"Output directory: {config['experiment']['output_dir']}") print(f"Stages: {len(config['progressive_stages'])}") + if is_distributed: 
+ print("Mode: Distributed Training") + if args.deepspeed: + print("Backend: DeepSpeed") + print("=" * 50) # Prepare sample datasets if requested if args.prepare_data: diff --git a/src/training.py b/src/training.py index af6f63a..8f05f2e 100644 --- a/src/training.py +++ b/src/training.py @@ -367,27 +367,55 @@ class ProgressiveTrainer: print(f"Final dataset size: {len(dataset)} examples") - # Training arguments - with CPU offload optimizations - training_args = TrainingArguments( - output_dir=f"./outputs/checkpoints/{stage_name}", - num_train_epochs=stage_config["training"]["num_epochs"], - per_device_train_batch_size=stage_config["training"]["per_device_batch_size"], - gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"], - learning_rate=float(stage_config["training"]["learning_rate"]), # Ensure it's a float - warmup_steps=stage_config["training"]["warmup_steps"], - logging_steps=stage_config["training"].get("logging_steps", 10), - save_strategy="epoch", - eval_strategy="no", - bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(), - gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False), - max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0), - report_to="wandb" if self.config["experiment"]["use_wandb"] else "none", - run_name=f"{self.config['experiment']['name']}_{stage_name}", - dataloader_pin_memory=False, # Reduce memory usage - remove_unused_columns=False, # Keep all columns - optim=stage_config["training"].get("optim", "adamw_torch"), # Support 8-bit optimizers - dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2), - ) + # Training arguments - with multi-GPU and CPU offload optimizations + training_args_dict = { + "output_dir": f"./outputs/checkpoints/{stage_name}", + "num_train_epochs": stage_config["training"]["num_epochs"], + "per_device_train_batch_size": stage_config["training"]["per_device_batch_size"], + "gradient_accumulation_steps": stage_config["training"]["gradient_accumulation_steps"], + "learning_rate": float(stage_config["training"]["learning_rate"]), # Ensure it's a float + "warmup_steps": stage_config["training"]["warmup_steps"], + "logging_steps": stage_config["training"].get("logging_steps", 10), + "save_strategy": "epoch", + "eval_strategy": "no", + "bf16": torch.cuda.is_available() and torch.cuda.is_bf16_supported(), + "gradient_checkpointing": self.config["model"].get("gradient_checkpointing", False), + "max_grad_norm": stage_config["training"].get("max_grad_norm", 1.0), + "report_to": "wandb" if self.config["experiment"]["use_wandb"] else "none", + "run_name": f"{self.config['experiment']['name']}_{stage_name}", + "dataloader_pin_memory": stage_config["training"].get("dataloader_pin_memory", False), + "remove_unused_columns": False, # Keep all columns + "optim": stage_config["training"].get("optim", "adamw_torch"), # Support 8-bit optimizers + "dataloader_num_workers": stage_config["training"].get("dataloader_num_workers", 2), + } + + # Add multi-GPU specific settings + if self.config.get("training_args", {}).get("distributed", False): + training_args_dict.update({ + "ddp_find_unused_parameters": False, + "ddp_bucket_cap_mb": 200, + "ddp_broadcast_buffers": False, + }) + + # Add DeepSpeed configuration + if self.config.get("training_args", {}).get("deepspeed", False): + deepspeed_config = self.config.get("training_args", {}).get("deepspeed_config") + if deepspeed_config: + training_args_dict["deepspeed"] = deepspeed_config + + # Add FSDP configuration + 
if "fsdp" in self.config: + fsdp_config = self.config["fsdp"] + training_args_dict.update({ + "fsdp": fsdp_config.get("fsdp_sharding_strategy", "FULL_SHARD"), + "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap"), + "fsdp_auto_wrap_policy": fsdp_config.get("fsdp_auto_wrap_policy", "TRANSFORMER_BASED_WRAP"), + "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1000000), + "fsdp_cpu_ram_efficient_loading": fsdp_config.get("fsdp_cpu_offload", False), + "fsdp_sync_module_states": fsdp_config.get("fsdp_sync_module_states", True), + }) + + training_args = TrainingArguments(**training_args_dict) # Print dataset info for debugging print(f"Dataset columns: {dataset.column_names}") diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..f5d5904 --- /dev/null +++ b/uv.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by uv. +# It is not intended for manual editing. +version = 1 +requires-python = ">=3.9" + +# Note: This is a placeholder lock file. +# Run `uv lock` to generate the actual lock file with resolved dependencies. \ No newline at end of file