progressive-llm/scripts/train_gemma3_1b_8gpu.sh
#!/bin/bash
# Training launcher script for Gemma3 1B with 8 GPUs (uv compatible)
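#
# Usage examples mirroring the --help text below (paths assume the repository
# root as the working directory):
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy ddp
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy fsdp
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy ddp --config config/my_config.yaml
#   ./scripts/train_gemma3_1b_8gpu.sh --strategy ddp --dry-run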
# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${GREEN}Progressive LLM Training - Gemma3 1B 8GPU Launcher${NC}"
echo "=================================================="
# Use standard python
UV_PREFIX="python"
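# A minimal sketch of the uv-managed alternative (assumes uv is installed and
# on PATH); left commented out so the plain-python default above stays in effect:
# UV_PREFIX="uv run python"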
# Default values
STRATEGY="ddp"
CONFIG=""
NUM_GPUS=8
DRY_RUN=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --strategy)
            STRATEGY="$2"
            shift 2
            ;;
        --config)
            CONFIG="$2"
            shift 2
            ;;
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [options]"
            echo ""
            echo "Options:"
            echo "  --strategy <ddp|fsdp|deepspeed>  Training strategy (default: ddp)"
            echo "  --config <path>                  Custom config file (optional)"
            echo "  --num-gpus <n>                   Number of GPUs to use (default: 8)"
            echo "  --dry-run                        Show command without executing"
            echo ""
            echo "Examples:"
            echo "  # Use DDP (recommended)"
            echo "  $0 --strategy ddp"
            echo ""
            echo "  # Use FSDP"
            echo "  $0 --strategy fsdp"
            echo ""
            echo "  # Use DeepSpeed"
            echo "  $0 --strategy deepspeed"
            echo ""
            echo "  # Use custom config"
            echo "  $0 --strategy ddp --config config/my_config.yaml"
            exit 0
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}"
            exit 1
            ;;
    esac
done
# Check GPU availability
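# Added guard (a small sketch assuming only a POSIX shell): fail fast if
# nvidia-smi is not on PATH instead of silently counting zero GPUs.
if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo -e "${RED}Error: nvidia-smi not found - are NVIDIA drivers installed?${NC}"
    exit 1
fi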
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "Available GPUs: ${GREEN}$GPU_COUNT${NC}"
if [ "$GPU_COUNT" -lt "$NUM_GPUS" ]; then
    echo -e "${RED}Error: Requested $NUM_GPUS GPUs but only $GPU_COUNT available${NC}"
    exit 1
fi
# Set default config based on strategy if not provided
if [ -z "$CONFIG" ]; then
    case $STRATEGY in
        ddp)
            CONFIG="config/training_config_gemma3_1b_8gpu_ddp.yaml"
            ;;
        fsdp)
            CONFIG="config/training_config_gemma3_1b_8gpu_fsdp.yaml"
            ;;
        deepspeed)
            CONFIG="config/training_config_gemma3_1b_8gpu_deepspeed.yaml"
            ;;
        *)
            echo -e "${RED}Error: Invalid strategy '$STRATEGY'. Choose from: ddp, fsdp, deepspeed${NC}"
            echo -e "${YELLOW}Note: DDP is recommended for single-node training${NC}"
            exit 1
            ;;
    esac
fi
# Check if config file exists
if [ ! -f "$CONFIG" ]; then
    echo -e "${RED}Error: Config file not found: $CONFIG${NC}"
    exit 1
fi
echo -e "Strategy: ${YELLOW}$STRATEGY${NC}"
echo -e "Config: ${YELLOW}$CONFIG${NC}"
echo -e "GPUs: ${YELLOW}$NUM_GPUS${NC}"
echo ""
# Build the command
CMD="$UV_PREFIX scripts/train_multi_gpu.py --config $CONFIG --strategy $STRATEGY --num_gpus $NUM_GPUS"
if [ "$DRY_RUN" = true ]; then
echo -e "${YELLOW}Dry run mode - Command that would be executed:${NC}"
echo "$CMD"
exit 0
fi
# Show GPU memory before training
echo -e "${GREEN}GPU Memory Usage Before Training:${NC}"
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
echo ""
echo -e "${GREEN}Starting training...${NC}"
echo "Command: $CMD"
echo ""
# Set environment variables for optimal performance
export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $((NUM_GPUS - 1)))  # Expose only the requested number of GPUs
export NCCL_DEBUG=WARN # Set to INFO for debugging
export NCCL_ASYNC_ERROR_HANDLING=1
# For DeepSpeed, set additional optimizations
if [ "$STRATEGY" = "deepspeed" ]; then
export DS_SKIP_CUDA_CHECK=1
export TOKENIZERS_PARALLELISM=false
fi
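# Optional tuning sketch (not part of the original flow): cap OpenMP threads per
# rank so $NUM_GPUS worker processes do not oversubscribe the CPU. Assumes GNU
# coreutils' nproc is available; left commented out by default.
# export OMP_NUM_THREADS=$(( $(nproc) / NUM_GPUS ))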
# Execute the training command
$CMD
# Check exit status
if [ $? -eq 0 ]; then
    echo ""
    echo -e "${GREEN}Training completed successfully!${NC}"
    # Show GPU memory after training
    echo ""
    echo -e "${GREEN}GPU Memory Usage After Training:${NC}"
    nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv
else
    echo ""
    echo -e "${RED}Training failed!${NC}"
    exit 1
fi