161 lines
No EOL
4.4 KiB
Bash
Executable file
161 lines
No EOL
4.4 KiB
Bash
Executable file
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)
#
# Prints a banner, then (further down) parses options, checks GPUs, resolves
# the training config and launches scripts/train_multi_gpu.py.

# Color codes for output.
# These are literal backslash escapes (single-quoted); they only become
# terminal control sequences when printed through `echo -e` or printf '%b'.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher (uv)${NC}"
echo "======================================================="

# Check if uv is available.
# UV_PREFIX holds the launcher as a space-separated string: "uv run" when the
# uv tool is on PATH, otherwise plain "python".
if command -v uv &> /dev/null; then
  echo -e "${GREEN}Using uv for Python environment management${NC}"
  UV_PREFIX="uv run"
else
  echo -e "${YELLOW}uv not found, using standard python${NC}"
  UV_PREFIX="python"
fi

# Default values
STRATEGY="deepspeed"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false

# Print the help text to stdout.
usage() {
  cat << EOF
Usage: $0 [options]

Options:
 --strategy <ddp|fsdp|deepspeed> Training strategy (default: deepspeed)
 --config <path> Custom config file (optional)
 --num-gpus <n> Number of GPUs to use (default: 8)
 --dry-run Show command without executing

Examples:
 # Use DeepSpeed (recommended)
 $0 --strategy deepspeed

 # Use DDP
 $0 --strategy ddp

 # Use FSDP
 $0 --strategy fsdp

 # Use custom config
 $0 --strategy ddp --config config/my_config.yaml
EOF
}

# Parse command line arguments.
# Globals written: STRATEGY, CONFIG, NUM_GPUS, DRY_RUN
# Arguments:       the script's command-line arguments
# Exits:           0 after printing help; 1 on an unknown option or when an
#                  option that takes a value is given without one
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --strategy | --config | --num-gpus)
        # `shift 2` fails WITHOUT shifting when only one argument is left,
        # which would leave this loop spinning forever; reject that case.
        if [[ $# -lt 2 ]]; then
          printf '%bError: Option %s requires a value%b\n' "$RED" "$1" "$NC" >&2
          exit 1
        fi
        case "$1" in
          --strategy) STRATEGY="$2" ;;
          --config) CONFIG="$2" ;;
          --num-gpus) NUM_GPUS="$2" ;;
        esac
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        # %s keeps backslashes in the offending argument literal.
        printf '%bError: Unknown option %s%b\n' "$RED" "$1" "$NC" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Check GPU availability

# NUM_GPUS comes straight from the command line; a non-numeric value would
# make the numeric comparison below error out and silently skip the check.
if ! [[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
  printf '%bError: --num-gpus must be a positive integer, got '"'"'%s'"'"'%b\n' \
    "$RED" "$NUM_GPUS" "$NC" >&2
  exit 1
fi

# Check nvidia-smi's own exit status: when it is missing or fails, any
# diagnostic text it prints must not be counted as GPU rows.
if ! GPU_NAMES=$(nvidia-smi --query-gpu=name --format=csv,noheader); then
  printf '%bError: nvidia-smi failed; cannot determine available GPUs%b\n' \
    "$RED" "$NC" >&2
  exit 1
fi

# One non-empty output line per GPU.
GPU_COUNT=$(grep -c . <<< "$GPU_NAMES")
printf 'Available GPUs: %b%s%b\n' "$GREEN" "$GPU_COUNT" "$NC"

if ((GPU_COUNT < NUM_GPUS)); then
  printf '%bError: Requested %s GPUs but only %s available%b\n' \
    "$RED" "$NUM_GPUS" "$GPU_COUNT" "$NC" >&2
  exit 1
fi

# Validate the strategy and pick its default config.
# The strategy is checked unconditionally, so an invalid --strategy is rejected
# even when a custom --config is supplied.
case "$STRATEGY" in
  ddp | fsdp | deepspeed)
    DEFAULT_CONFIG="config/training_config_gemma3_1b_8gpu_${STRATEGY}.yaml"
    ;;
  *)
    printf '%bError: Invalid strategy '"'"'%s'"'"'. Choose from: ddp, fsdp, deepspeed%b\n' \
      "$RED" "$STRATEGY" "$NC" >&2
    exit 1
    ;;
esac

# Set default config based on strategy if not provided
CONFIG="${CONFIG:-$DEFAULT_CONFIG}"

# Check if config file exists
if [[ ! -f "$CONFIG" ]]; then
  printf '%bError: Config file not found: %s%b\n' "$RED" "$CONFIG" "$NC" >&2
  exit 1
fi

# Summarize the resolved settings. Colors go through %b (escape sequences
# interpreted); values go through %s so a backslash in e.g. the config path
# is printed literally.
printf 'Strategy: %b%s%b\n' "$YELLOW" "$STRATEGY" "$NC"
printf 'Config: %b%s%b\n' "$YELLOW" "$CONFIG" "$NC"
printf 'GPUs: %b%s%b\n' "$YELLOW" "$NUM_GPUS" "$NC"
echo ""

# Build the command as an array so that arguments containing spaces or glob
# characters (for example a --config path) reach the trainer as single words.
# UV_PREFIX is a space-separated launcher string, so it is split into words.
read -r -a CMD <<< "$UV_PREFIX"
CMD+=(scripts/train_multi_gpu.py --config "$CONFIG" --strategy "$STRATEGY" --num_gpus "$NUM_GPUS")

# Shell-quoted rendering of the command for display; it can be pasted back
# into a shell verbatim.
printf -v CMD_DISPLAY '%q ' "${CMD[@]}"
CMD_DISPLAY="${CMD_DISPLAY% }"

if [[ "$DRY_RUN" == true ]]; then
  printf '%bDry run mode - Command that would be executed:%b\n' "$YELLOW" "$NC"
  printf '%s\n' "$CMD_DISPLAY"
  exit 0
fi

# NUM_GPUS must be a positive integer for the device list built below.
if ! [[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
  printf '%bError: --num-gpus must be a positive integer, got '"'"'%s'"'"'%b\n' \
    "$RED" "$NUM_GPUS" "$NC" >&2
  exit 1
fi

# Print per-GPU memory usage as CSV.
show_gpu_memory() {
  nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
}

# Show GPU memory before training
printf '%bGPU Memory Usage Before Training:%b\n' "$GREEN" "$NC"
show_gpu_memory

echo ""
printf '%bStarting training...%b\n' "$GREEN" "$NC"
printf 'Command: %s\n' "$CMD_DISPLAY"
echo ""

# Set environment variables for optimal performance.
# Expose exactly the first NUM_GPUS devices (0..NUM_GPUS-1) instead of a fixed
# 0-7 list, so --num-gpus is honored; the default of 8 yields 0,1,2,3,4,5,6,7.
VISIBLE_DEVICES=""
for ((i = 0; i < NUM_GPUS; i++)); do
  VISIBLE_DEVICES+="${VISIBLE_DEVICES:+,}${i}"
done
export CUDA_VISIBLE_DEVICES="$VISIBLE_DEVICES"
export NCCL_DEBUG=WARN # Set to INFO for debugging
# NOTE(review): newer PyTorch releases appear to read
# TORCH_NCCL_ASYNC_ERROR_HANDLING instead - confirm against the installed version.
export NCCL_ASYNC_ERROR_HANDLING=1

# For DeepSpeed, set additional optimizations
if [[ "$STRATEGY" == "deepspeed" ]]; then
  export DS_SKIP_CUDA_CHECK=1
  export TOKENIZERS_PARALLELISM=false
fi

# Execute the training command and report the outcome.
if "${CMD[@]}"; then
  echo ""
  printf '%bTraining completed successfully!%b\n' "$GREEN" "$NC"

  # Show GPU memory after training
  echo ""
  printf '%bGPU Memory Usage After Training:%b\n' "$GREEN" "$NC"
  show_gpu_memory
else
  # Keep the trainer's own exit status (still non-zero) so callers can
  # distinguish e.g. an interrupt from an ordinary failure.
  TRAIN_STATUS=$?
  echo ""
  printf '%bTraining failed!%b\n' "$RED" "$NC" >&2
  exit "$TRAIN_STATUS"
fi