From 4799392e240630c81573df2589c16e5803a0bd84 Mon Sep 17 00:00:00 2001
From: Soma Nakamura
Date: Thu, 10 Jul 2025 22:47:07 +0900
Subject: [PATCH] Switch default training strategy to DDP and add a simple DDP launcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
README.md | 5 ++--
scripts/train_ddp_simple.py | 42 +++++++++++++++++++++++++++++++++
scripts/train_gemma3_1b_8gpu.sh | 7 +++---
scripts/train_multi_gpu.py | 10 +-------
4 files changed, 50 insertions(+), 14 deletions(-)
create mode 100755 scripts/train_ddp_simple.py
diff --git a/README.md b/README.md
index 58e2f74..c8f0272 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ pip install -r requirements.txt
# Start training
python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
-./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
+./scripts/train_gemma3_1b_8gpu.sh --strategy ddp
```
## Training Stages
@@ -28,7 +28,8 @@ python scripts/train_progressive.py --config config/training_config_gemma3_1b.ya
```bash
pip install -r requirements.txt # Install dependencies
python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml # Single GPU
-./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed # 8 GPUs
+./scripts/train_gemma3_1b_8gpu.sh --strategy ddp # 8 GPUs (DDP)
+python scripts/train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml # 8 GPUs (Simple)
pytest # Run tests
```
diff --git a/scripts/train_ddp_simple.py b/scripts/train_ddp_simple.py
new file mode 100755
index 0000000..8b9d661
--- /dev/null
+++ b/scripts/train_ddp_simple.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Simple DDP training script without complex launcher
+"""
+
+import os
+import sys
+import subprocess
+from pathlib import Path
+
+def main():
+ if len(sys.argv) < 2:
+        print("Usage: python train_ddp_simple.py <config_file> [num_gpus]")
+ print("Example: python train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml 8")
+ sys.exit(1)
+
+ config_file = sys.argv[1]
+ num_gpus = int(sys.argv[2]) if len(sys.argv) > 2 else 8
+
+ # Set environment variables
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = "12355"
+
+ # Build torchrun command
+ cmd = [
+ "torchrun",
+ "--nproc_per_node", str(num_gpus),
+ "--master_port", "12355",
+ "scripts/train_progressive.py",
+ "--config", config_file,
+ "--distributed"
+ ]
+
+ print(f"Running DDP training with {num_gpus} GPUs")
+ print(f"Command: {' '.join(cmd)}")
+
+ # Execute
+ result = subprocess.run(cmd)
+ sys.exit(result.returncode)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/scripts/train_gemma3_1b_8gpu.sh b/scripts/train_gemma3_1b_8gpu.sh
index fbda9d7..5347274 100755
--- a/scripts/train_gemma3_1b_8gpu.sh
+++ b/scripts/train_gemma3_1b_8gpu.sh
@@ -14,7 +14,7 @@ echo "=================================================="
UV_PREFIX="python"
# Default values
-STRATEGY="deepspeed"
+STRATEGY="ddp"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false
@@ -48,8 +48,8 @@ while [[ $# -gt 0 ]]; do
echo " --dry-run Show command without executing"
echo ""
echo "Examples:"
- echo " # Use DeepSpeed (recommended)"
- echo " $0 --strategy deepspeed"
+ echo " # Use DeepSpeed (alternative)"
+ echo " $0 --strategy deepspeed"
echo ""
echo " # Use DDP"
echo " $0 --strategy ddp"
@@ -91,6 +91,7 @@ if [ -z "$CONFIG" ]; then
;;
*)
echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
+ echo -e "${YELLOW}Note: DDP is recommended for single-node training${NC}"
exit 1
;;
esac
diff --git a/scripts/train_multi_gpu.py b/scripts/train_multi_gpu.py
index 2e40abc..93b606d 100755
--- a/scripts/train_multi_gpu.py
+++ b/scripts/train_multi_gpu.py
@@ -128,16 +128,11 @@ def launch_deepspeed_training(config_path, num_gpus):
setup_environment_for_strategy("deepspeed")
- # Create DeepSpeed hostfile
- hostfile = Path(__file__).parent.parent / "hostfile"
- with open(hostfile, "w") as f:
- f.write(f"localhost slots={num_gpus}\n")
-
python_cmd = ["python", "scripts/train_progressive.py"]
+ # Use --num_gpus without hostfile for single node
cmd = [
"deepspeed",
- "--hostfile", str(hostfile),
"--num_gpus", str(num_gpus),
] + python_cmd + [
"--config", config_path,
@@ -147,9 +142,6 @@ def launch_deepspeed_training(config_path, num_gpus):
print(f"Running command: {' '.join(cmd)}")
result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)
- # Clean up hostfile
- hostfile.unlink(missing_ok=True)
-
return result