Hello
parent 6d823eb371
commit 4799392e24

4 changed files with 50 additions and 14 deletions
@@ -14,7 +14,7 @@ pip install -r requirements.txt
 
 # Start training
 python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
-./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
+./scripts/train_gemma3_1b_8gpu.sh --strategy ddp
 ```
 
 ## Training Stages
@@ -28,7 +28,8 @@ python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
 ```bash
 pip install -r requirements.txt  # Install dependencies
 python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml  # Single GPU
-./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed  # 8 GPUs
+./scripts/train_gemma3_1b_8gpu.sh --strategy ddp  # 8 GPUs (DDP)
+python scripts/train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml  # 8 GPUs (Simple)
 pytest  # Run tests
 ```
 
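The newly added `scripts/train_ddp_simple.py` wrapper also accepts an optional GPU count after the config path (its usage message appears in the file below); a minimal sketch, assuming a machine with 4 visible GPUs rather than the default of 8:

```bash
# Simple DDP wrapper with an explicit GPU count (4 is illustrative; it defaults to 8)
python scripts/train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml 4
```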
scripts/train_ddp_simple.py  (Executable file, 42 lines added)

@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Simple DDP training script without complex launcher
+"""
+
+import os
+import sys
+import subprocess
+from pathlib import Path
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python train_ddp_simple.py <config_file> [num_gpus]")
+        print("Example: python train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml 8")
+        sys.exit(1)
+
+    config_file = sys.argv[1]
+    num_gpus = int(sys.argv[2]) if len(sys.argv) > 2 else 8
+
+    # Set environment variables
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+
+    # Build torchrun command
+    cmd = [
+        "torchrun",
+        "--nproc_per_node", str(num_gpus),
+        "--master_port", "12355",
+        "scripts/train_progressive.py",
+        "--config", config_file,
+        "--distributed"
+    ]
+
+    print(f"Running DDP training with {num_gpus} GPUs")
+    print(f"Command: {' '.join(cmd)}")
+
+    # Execute
+    result = subprocess.run(cmd)
+    sys.exit(result.returncode)
+
+if __name__ == "__main__":
+    main()
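For reference, a sketch of the command this wrapper assembles, assuming the default of 8 GPUs and the example config from its usage message; the wrapper also sets MASTER_ADDR and MASTER_PORT before launching:

```bash
# Roughly equivalent direct torchrun invocation built by scripts/train_ddp_simple.py
MASTER_ADDR=localhost MASTER_PORT=12355 \
torchrun --nproc_per_node 8 --master_port 12355 \
    scripts/train_progressive.py \
    --config config/training_config_gemma3_1b_8gpu_ddp.yaml \
    --distributed
```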
@@ -14,7 +14,7 @@ echo "=================================================="
 UV_PREFIX="python"
 
 # Default values
-STRATEGY="deepspeed"
+STRATEGY="ddp"
 CONFIG=""
 NUM_GPUS=8
 DRY_RUN=false
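With the default strategy switched to DDP, the launcher can be sanity-checked without arguments; a sketch, assuming these hunks belong to scripts/train_gemma3_1b_8gpu.sh (the launcher referenced in the README above) and that --dry-run behaves as described in the help text in the next hunk:

```bash
# Preview the launch command without executing it; DDP is now the default strategy
./scripts/train_gemma3_1b_8gpu.sh --dry-run

# DeepSpeed remains available when requested explicitly
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```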
@@ -48,8 +48,8 @@ while [[ $# -gt 0 ]]; do
 echo "  --dry-run    Show command without executing"
 echo ""
 echo "Examples:"
-echo "  # Use DeepSpeed (recommended)"
-echo "  $0 --strategy deepspeed"
+echo "  # Use DDP (recommended)"
+echo "  $0 --strategy ddp"
 echo ""
 echo "  # Use DDP"
 echo "  $0 --strategy ddp"
@@ -91,6 +91,7 @@ if [ -z "$CONFIG" ]; then
 ;;
 *)
 echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
+echo -e "${YELLOW}Note: DDP is recommended for single-node training${NC}"
 exit 1
 ;;
 esac
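For completeness, the strategies this case statement accepts (per the error message) can each be selected explicitly; a sketch under the same assumption about which launcher script these hunks come from:

```bash
# Valid --strategy values checked above: ddp, fsdp, deepspeed
./scripts/train_gemma3_1b_8gpu.sh --strategy ddp        # recommended for single-node training
./scripts/train_gemma3_1b_8gpu.sh --strategy fsdp
./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
```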
@@ -128,16 +128,11 @@ def launch_deepspeed_training(config_path, num_gpus):
 
     setup_environment_for_strategy("deepspeed")
 
-    # Create DeepSpeed hostfile
-    hostfile = Path(__file__).parent.parent / "hostfile"
-    with open(hostfile, "w") as f:
-        f.write(f"localhost slots={num_gpus}\n")
-
     python_cmd = ["python", "scripts/train_progressive.py"]
 
+    # Use --num_gpus without hostfile for single node
     cmd = [
         "deepspeed",
-        "--hostfile", str(hostfile),
         "--num_gpus", str(num_gpus),
     ] + python_cmd + [
         "--config", config_path,
@@ -147,9 +142,6 @@ def launch_deepspeed_training(config_path, num_gpus):
     print(f"Running command: {' '.join(cmd)}")
     result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)
 
-    # Clean up hostfile
-    hostfile.unlink(missing_ok=True)
-
     return result
 
 
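With the hostfile creation and cleanup removed, launch_deepspeed_training now issues a plain single-node DeepSpeed launch; roughly the following, where the config path is illustrative and any arguments after --config fall outside these hunks:

```bash
# Approximate command now built by launch_deepspeed_training (single node, no hostfile)
deepspeed --num_gpus 8 scripts/train_progressive.py \
    --config config/training_config_gemma3_1b.yaml
```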