diff --git a/scripts/train_multi_gpu.py b/scripts/train_multi_gpu.py
index 93b606d..e13ee68 100755
--- a/scripts/train_multi_gpu.py
+++ b/scripts/train_multi_gpu.py
@@ -50,13 +50,11 @@ def launch_ddp_training(config_path, num_gpus):
     setup_environment_for_strategy("ddp")
 
     # Use torchrun for DDP
-    python_cmd = ["python", "scripts/train_progressive.py"]
-
     cmd = [
         "torchrun",
         "--nproc_per_node", str(num_gpus),
         "--master_port", "12355",
-    ] + python_cmd + [
+        "scripts/train_progressive.py",
         "--config", config_path,
         "--distributed"
     ]
@@ -104,12 +102,10 @@ use_cpu: false
     with open(config_file, "w") as f:
         f.write(accelerate_config)
 
-    python_cmd = ["python", "scripts/train_progressive.py"]
-
     cmd = [
         "accelerate", "launch",
         "--config_file", str(config_file),
-    ] + python_cmd + [
+        "scripts/train_progressive.py",
         "--config", config_path
     ]
 
@@ -128,13 +124,11 @@ def launch_deepspeed_training(config_path, num_gpus):
     setup_environment_for_strategy("deepspeed")
 
-    python_cmd = ["python", "scripts/train_progressive.py"]
-
     # Use --num_gpus without hostfile for single node
     cmd = [
         "deepspeed",
         "--num_gpus", str(num_gpus),
-    ] + python_cmd + [
+        "scripts/train_progressive.py",
         "--config", config_path,
         "--deepspeed"
     ]
 