diff --git a/README.md b/README.md index 58e2f74..c8f0272 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ pip install -r requirements.txt # Start training python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml -./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed +./scripts/train_gemma3_1b_8gpu.sh --strategy ddp ``` ## Training Stages @@ -28,7 +28,8 @@ python scripts/train_progressive.py --config config/training_config_gemma3_1b.ya ```bash pip install -r requirements.txt # Install dependencies python scripts/train_progressive.py --config config/training_config_gemma3_1b.yaml # Single GPU -./scripts/train_gemma3_1b_8gpu.sh --strategy deepspeed # 8 GPUs +./scripts/train_gemma3_1b_8gpu.sh --strategy ddp # 8 GPUs (DDP) +python scripts/train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml # 8 GPUs (Simple) pytest # Run tests ``` diff --git a/scripts/train_ddp_simple.py b/scripts/train_ddp_simple.py new file mode 100755 index 0000000..8b9d661 --- /dev/null +++ b/scripts/train_ddp_simple.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +Simple DDP training script without complex launcher +""" + +import os +import sys +import subprocess +from pathlib import Path + +def main(): + if len(sys.argv) < 2: + print("Usage: python train_ddp_simple.py <config_file> [num_gpus]") + print("Example: python train_ddp_simple.py config/training_config_gemma3_1b_8gpu_ddp.yaml 8") + sys.exit(1) + + config_file = sys.argv[1] + num_gpus = int(sys.argv[2]) if len(sys.argv) > 2 else 8 + + # Set environment variables + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + # Build torchrun command + cmd = [ + "torchrun", + "--nproc_per_node", str(num_gpus), + "--master_port", "12355", + "scripts/train_progressive.py", + "--config", config_file, + "--distributed" + ] + + print(f"Running DDP training with {num_gpus} GPUs") + print(f"Command: {' '.join(cmd)}") + + # Execute + result = subprocess.run(cmd) + sys.exit(result.returncode) 
+ +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/train_gemma3_1b_8gpu.sh b/scripts/train_gemma3_1b_8gpu.sh index fbda9d7..5347274 100755 --- a/scripts/train_gemma3_1b_8gpu.sh +++ b/scripts/train_gemma3_1b_8gpu.sh @@ -14,7 +14,7 @@ echo "==================================================" UV_PREFIX="python" # Default values -STRATEGY="deepspeed" +STRATEGY="ddp" CONFIG="" NUM_GPUS=8 DRY_RUN=false @@ -48,8 +48,8 @@ while [[ $# -gt 0 ]]; do echo " --dry-run Show command without executing" echo "" echo "Examples:" - echo " # Use DeepSpeed (recommended)" - echo " $0 --strategy deepspeed" + echo " # Use DDP (recommended)" + echo " $0 --strategy ddp" echo "" echo " # Use DDP" echo " $0 --strategy ddp" @@ -91,6 +91,7 @@ if [ -z "$CONFIG" ]; then ;; *) echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}" + echo -e "${YELLOW}Note: DDP is recommended for single-node training${NC}" exit 1 ;; esac diff --git a/scripts/train_multi_gpu.py b/scripts/train_multi_gpu.py index 2e40abc..93b606d 100755 --- a/scripts/train_multi_gpu.py +++ b/scripts/train_multi_gpu.py @@ -128,16 +128,11 @@ def launch_deepspeed_training(config_path, num_gpus): setup_environment_for_strategy("deepspeed") - # Create DeepSpeed hostfile - hostfile = Path(__file__).parent.parent / "hostfile" - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpus}\n") - python_cmd = ["python", "scripts/train_progressive.py"] + # Use --num_gpus without hostfile for single node cmd = [ "deepspeed", - "--hostfile", str(hostfile), "--num_gpus", str(num_gpus), ] + python_cmd + [ "--config", config_path, @@ -147,9 +142,6 @@ def launch_deepspeed_training(config_path, num_gpus): print(f"Running command: {' '.join(cmd)}") result = subprocess.run(cmd, cwd=Path(__file__).parent.parent) - # Clean up hostfile - hostfile.unlink(missing_ok=True) - return result