progressive-llm/scripts/train_gemma3_1b_8gpu.sh
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)
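#
# Usage examples mirroring the --help text below (paths assume the repository
# root as the working directory):
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy ddp
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy fsdp
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy ddp --config config/my_config.yaml
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy ddp --dry-run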
# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher${NC}"
echo "=================================================="
# Use standard python
UV_PREFIX="python"
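# A minimal sketch of the uv-managed alternative (assumes uv is installed and
# on PATH); left commented out so the plain-python default above stays in effect:
# UV_PREFIX="uv run python"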
# Default values
STRATEGY="ddp"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --strategy)
            STRATEGY="$2"
            shift 2
            ;;
        --config)
            CONFIG="$2"
            shift 2
            ;;
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo ""
            echo "Options:"
            echo "  --strategy <ddp|fsdp|deepspeed>  Training strategy (default: ddp)"
            echo "  --config <path>                  Custom config file (optional)"
            echo "  --num-gpus <n>                   Number of GPUs to use (default: 8)"
            echo "  --dry-run                        Show command without executing"
            echo ""
            echo "Examples:"
            echo "  # Use DDP (recommended)"
            echo "  $0 --strategy ddp"
            echo ""
            echo "  # Use FSDP"
            echo "  $0 --strategy fsdp"
            echo ""
            echo "  # Use DeepSpeed"
            echo "  $0 --strategy deepspeed"
            echo ""
            echo "  # Use custom config"
            echo "  $0 --strategy ddp --config config/my_config.yaml"
            exit 0
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}"
            exit 1
            ;;
    esac
done
# Check GPU availability
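# Added guard (a small sketch assuming only a POSIX shell): fail fast if
# nvidia-smi is not on PATH instead of silently counting zero GPUs.
if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo -e "${RED}Error: nvidia-smi not found - are NVIDIA drivers installed?${NC}"
    exit 1
fi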
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"
if [ "$GPU_COUNT" -lt "$NUM_GPUS" ]; then
    echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
    exit 1
fi
# Set default config based on strategy if not provided
if [ -z "$CONFIG" ]; then
    case $STRATEGY in
        ddp)
            CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
            ;;
        fsdp)
            CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
            ;;
        deepspeed)
            CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
            ;;
        *)
            echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
            echo -e "${YELLOW}Note: DDP is recommended for single-node training${NC}"
            exit 1
            ;;
    esac
fi
# Check if config file exists
if [ ! -f "$CONFIG" ]; then
    echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
    exit 1
fi
echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
echo -e "Config: ${YELLOW}$CONFIG${NC}"
echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
echo ""
# Build the command
CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"
if [ "$DRY_RUN" = true ]; then
echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
echo "$CMD"
exit 0
fi
# Show GPU memory before training
echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
echo ""
echo -e "${GREEN}Starting training...${NC}"
echo "Command: $CMD"
echo ""
# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $((NUM_GPUS - 1)))  # Expose only the requested number of GPUs
export NCCL_DEBUG=WARN # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1
# For DeepSpeed, set additional optimizations
if [ "$STRATEGY" = "deepspeed" ]; then
export DS_SKIP_CUDA_CHECK=1
export TOKENIZERS_PARALLELISM=false
fi
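# Optional tuning sketch (not part of the original flow): cap OpenMP threads per
# rank so $NUM_GPUS worker processes do not oversubscribe the CPU. Assumes GNU
# coreutils' nproc is available; left commented out by default.
# export OMP_NUM_THREADS=$(( $(nproc) / NUM_GPUS ))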
# Execute the training command
$CMD
# Check exit status
if [ $? -eq 0 ]; then
    echo ""
    echo -e "${GREEN}Training completed successfully!${NC}"
    # Show GPU memory after training
    echo ""
    echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
    nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
    echo ""
    echo -e "${RED}Training failed!${NC}"
    exit 1
fi