#!/bin/bash
#
# Training launcher for Gemma3 1B on multiple GPUs (8 by default).
#
# Validates the requested strategy, GPU count and config file, then runs
# scripts/train_multi_gpu.py with the matching options.
#
# Usage:
#   <this script> [--strategy ddp|fsdp|deepspeed] [--config FILE]
#                 [--num-gpus N] [--dry-run] [-h|--help]
#
# Exit status: 0 on success (also after --help and --dry-run), 1 on any error.

# -u catches misspelled variable names; pipefail makes `nvidia-smi | wc -l`
# report a failing nvidia-smi. -e is deliberately not enabled: every command
# whose failure matters is checked explicitly below.
set -uo pipefail

# Color codes for output. $'...' stores the real escape bytes, so the values
# can be printed with plain echo/printf (no `echo -e` needed).
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly NC=$'\033[0m' # No Color

readonly DEFAULT_STRATEGY="ddp"
readonly DEFAULT_NUM_GPUS=8

# Interpreter prefix, kept as an array so a multi-word launcher
# (e.g. `uv run python`) can be substituted without quoting problems.
UV_PREFIX=(python)

# Print a red "Error: ..." message to stderr and exit with status 1.
die() {
  printf '%sError: %s%s\n' "$RED" "$*" "$NC" >&2
  exit 1
}

# Print the help text to stdout.
usage() {
  cat <<EOF
Usage: $0 [options]

Options:
  --strategy    Training strategy: ddp, fsdp or deepspeed (default: ${DEFAULT_STRATEGY})
  --config      Custom config file (optional)
  --num-gpus    Number of GPUs to use (default: ${DEFAULT_NUM_GPUS})
  --dry-run     Show command without executing
  -h, --help    Show this help and exit

Examples:
  # Use DDP (recommended)
  $0 --strategy ddp

  # Use DeepSpeed
  $0 --strategy deepspeed

  # Use FSDP
  $0 --strategy fsdp

  # Use custom config
  $0 --strategy ddp --config config/my_config.yaml
EOF
}

# Abort unless the option named in $1 is followed by a value.
# $2 is the number of arguments still to be parsed (the caller's $#).
# Without this check `shift 2` fails without shifting anything and the
# parsing loop would never terminate.
require_value() {
  if (( $2 < 2 )); then
    die "Option $1 requires a value"
  fi
}

echo "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher${NC}"
echo "=================================================="

# Default values
STRATEGY="$DEFAULT_STRATEGY"
CONFIG=""
NUM_GPUS="$DEFAULT_NUM_GPUS"
DRY_RUN=false

# Parse command line arguments
while (( $# > 0 )); do
  case "$1" in
    --strategy)
      require_value "$1" "$#"
      STRATEGY="$2"
      shift 2
      ;;
    --config)
      require_value "$1" "$#"
      CONFIG="$2"
      shift 2
      ;;
    --num-gpus)
      require_value "$1" "$#"
      NUM_GPUS="$2"
      shift 2
      ;;
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      die "Unknown option $1"
      ;;
  esac
done

# Validate the strategy unconditionally, so a bad value is rejected even when
# a custom --config is supplied.
case "$STRATEGY" in
  ddp|fsdp|deepspeed) ;;
  *)
    printf '%sError: Invalid strategy '\''%s'\''. Choose from: ddp, fsdp, deepspeed%s\n' \
      "$RED" "$STRATEGY" "$NC" >&2
    printf '%sNote: DDP is recommended for single-node training%s\n' \
      "$YELLOW" "$NC" >&2
    exit 1
    ;;
esac

# The GPU count is used in arithmetic below, so insist on a positive decimal
# integer (a leading zero would otherwise be parsed as octal).
if [[ ! "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
  die "--num-gpus must be a positive integer, got '$NUM_GPUS'"
fi

# Check GPU availability
if ! command -v nvidia-smi >/dev/null 2>&1; then
  die "nvidia-smi not found; cannot determine the number of available GPUs"
fi
if ! GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); then
  die "nvidia-smi failed; cannot determine the number of available GPUs"
fi
GPU_COUNT=${GPU_COUNT//[[:space:]]/} # some wc implementations pad the count
echo "Available GPUs: ${GREEN}${GPU_COUNT}${NC}"

if (( GPU_COUNT < NUM_GPUS )); then
  die "Requested $NUM_GPUS GPUs but only $GPU_COUNT available"
fi

# Set default config based on strategy if not provided
if [[ -z "$CONFIG" ]]; then
  CONFIG="config/training_config_gemma3_1b_8gpu_${STRATEGY}.yaml"
fi

# Check if config file exists
if [[ ! -f "$CONFIG" ]]; then
  die "Config file not found: $CONFIG"
fi

printf 'Strategy: %s%s%s\n' "$YELLOW" "$STRATEGY" "$NC"
printf 'Config: %s%s%s\n' "$YELLOW" "$CONFIG" "$NC"
printf 'GPUs: %s%s%s\n' "$YELLOW" "$NUM_GPUS" "$NC"
echo ""

# Build the command as an array so arguments containing spaces or glob
# characters reach the training script intact.
cmd=(
  "${UV_PREFIX[@]}" scripts/train_multi_gpu.py
  --config "$CONFIG"
  --strategy "$STRATEGY"
  --num_gpus "$NUM_GPUS"
)
# Shell-quoted rendering of the command for display.
printf -v cmd_display '%q ' "${cmd[@]}"
cmd_display=${cmd_display% }

if [[ "$DRY_RUN" == true ]]; then
  echo "${YELLOW}Dry run mode - Command that would be executed:${NC}"
  printf '%s\n' "$cmd_display"
  exit 0
fi

# Show GPU memory before training
echo "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
echo ""

echo "${GREEN}Starting training...${NC}"
printf 'Command: %s\n' "$cmd_display"
echo ""

# Expose exactly the requested number of devices (0 .. NUM_GPUS-1). For the
# default of 8 GPUs this yields 0,1,2,3,4,5,6,7.
device_ids=""
for (( i = 0; i < NUM_GPUS; i++ )); do
  device_ids+="${device_ids:+,}${i}"
done

# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES="$device_ids"
export NCCL_DEBUG=WARN # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1

# For DeepSpeed, set additional optimizations
if [[ "$STRATEGY" == "deepspeed" ]]; then
  export DS_SKIP_CUDA_CHECK=1
  export TOKENIZERS_PARALLELISM=false
fi

# Execute the training command and report the outcome
if "${cmd[@]}"; then
  echo ""
  echo "${GREEN}Training completed successfully!${NC}"

  # Show GPU memory after training
  echo ""
  echo "${GREEN}GPU Memory Usage After Training:${NC}"
  nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
  {
    echo ""
    echo "${RED}Training failed!${NC}"
  } >&2
  exit 1
fi