From 4799392e240630c81573df2589c16e5803a0bd84 Mon Sep 17 00:00:00 2001
From: Soma Nakamura
Date: Thu, 10 Jul 2025 22:47:07 +0900
Subject: [PATCH] Switch default training strategy to DDP and add a simple DDP launcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
README.md | 5 ++--
scripts/train_ddp_simple.py | 42 +++++++++++++++++++++++++++++++++
scripts/train_gemma3_1b_8gpu.sh | 7 +++---
scripts/train_multi_gpu.py | 10 +-------
4 files changed, 50 insertions(+), 14 deletions(-)
create mode 100755 scripts/train_ddp_simple.py
diff --git a/README.md b/README.md
index 58e2f74..c8f0272 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ pip install -r requirements.txt
# Start training
python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml
-./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed
+./scripts/train_gemma3_1b_8gpu.sh --strategy ddp
```
## Training Stages
@@ -28,7 +28,8 @@ python scripts/train_progressive.py --config config/training_config_gemma3_1b.ya
```bash
pip install -r requirements.txt # Install dependencies
python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml # Single GPU
-./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed # 8 GPUs
+./scripts/train_gemma3_1b_8gpu.sh --strategy ddp # 8 GPUs (DDP)
+python scripts/train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml # 8 GPUs (Simple)
pytest # Run tests
```
diff --git a/scripts/train_ddp_simple.py b/scripts/train_ddp_simple.py
new file mode 100755
index 0000000..8b9d661
--- /dev/null
+++ b/scripts/train_ddp_simple.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Simple DDP training script without complex launcher
+"""
+
+import os
+import sys
+import subprocess
+from pathlib import Path
+
+def main():
+ if len(sys.argv) < 2:
+        print("Usage: python train_ddp_simple.py <config_file> [num_gpus]")
+ print("Example: python train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml 8")
+ sys.exit(1)
+
+ config_file = sys.argv[1]
+ num_gpus = int(sys.argv[2]) if len(sys.argv) > 2 else 8
+
+ # Set environment variables
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = "12355"
+
+ # Build torchrun command
+ cmd = [
+ "torchrun",
+ "--nproc_per_node", str(num_gpus),
+ "--master_port", "12355",
+ "scripts/train_progressive.py",
+ "--config", config_file,
+ "--distributed"
+ ]
+
+ print(f"Running DDP training with {num_gpus} GPUs")
+ print(f"Command: {' '.join(cmd)}")
+
+ # Execute
+ result = subprocess.run(cmd)
+ sys.exit(result.returncode)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/scripts/train_gemma3_1b_8gpu.sh b/scripts/train_gemma3_1b_8gpu.sh
index fbda9d7..5347274 100755
--- a/scripts/train_gemma3_1b_8gpu.sh
+++ b/scripts/train_gemma3_1b_8gpu.sh
@@ -14,7 +14,7 @@ echo "=================================================="
UV_PREFIX="python"
# Default values
-STRATEGY="deepspeed"
+STRATEGY="ddp"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false
@@ -48,8 +48,8 @@ while [[ $# -gt 0 ]]; do
echo " --dry-run Show command without executing"
echo ""
echo "Examples:"
- echo " # Use DeepSpeed (recommended)"
- echo " $0 --strategy deepspeed"
+ echo " # Use DeepSpeed (alternative)"
+ echo " $0 --strategy deepspeed"
echo ""
echo " # Use DDP"
echo " $0 --strategy ddp"
@@ -91,6 +91,7 @@ if [ -z "$CONFIG" ]; then
;;
*)
echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
+ echo -e "${YELLOW}Note: DDP is recommended for single-node training${NC}"
exit 1
;;
esac
diff --git a/scripts/train_multi_gpu.py b/scripts/train_multi_gpu.py
index 2e40abc..93b606d 100755
--- a/scripts/train_multi_gpu.py
+++ b/scripts/train_multi_gpu.py
@@ -128,16 +128,11 @@ def launch_deepspeed_training(config_path, num_gpus):
setup_environment_for_strategy("deepspeed")
- # Create DeepSpeed hostfile
- hostfile = Path(__file__).parent.parent / "hostfile"
- with open(hostfile, "w") as f:
- f.write(f"localhost slots={num_gpus}\n")
-
python_cmd = ["python", "scripts/train_progressive.py"]
+ # Use --num_gpus without hostfile for single node
cmd = [
"deepspeed",
- "--hostfile", str(hostfile),
"--num_gpus", str(num_gpus),
] + python_cmd + [
"--config", config_path,
@@ -147,9 +142,6 @@ def launch_deepspeed_training(config_path, num_gpus):
print(f"Running command: {' '.join(cmd)}")
result = subprocess.run(cmd, cwd=Path(__file__).parent.parent)
- # Clean up hostfile
- hostfile.unlink(missing_ok=True)
-
return result