#!/usr/bin/env python3
"""
Check VRAM usage and model memory requirements
"""
import torch
import psutil
import sys
from pathlib import Path
import yaml

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def get_memory_info():
    """Get current memory usage"""
    if torch.cuda.is_available():
        print("=== CUDA Information ===")
        print(f"CUDA available: {torch.cuda.is_available()}")
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA device count: {torch.cuda.device_count()}")

        # Get VRAM info
        vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        vram_reserved = torch.cuda.memory_reserved(0) / 1024**3
        vram_allocated = torch.cuda.memory_allocated(0) / 1024**3
        vram_free = vram_total - vram_allocated

        print("\n=== VRAM Usage ===")
        print(f"Total VRAM: {vram_total:.2f} GB")
        print(f"Allocated VRAM: {vram_allocated:.2f} GB")
        print(f"Reserved VRAM: {vram_reserved:.2f} GB")
        print(f"Free VRAM: {vram_free:.2f} GB")
    else:
        print("CUDA not available!")

    # Get system RAM info
    ram = psutil.virtual_memory()
    print("\n=== System RAM ===")
    print(f"Total RAM: {ram.total / 1024**3:.2f} GB")
    print(f"Available RAM: {ram.available / 1024**3:.2f} GB")
    print(f"Used RAM: {ram.used / 1024**3:.2f} GB ({ram.percent}%)")


def estimate_model_size(model_name: str):
    """Estimate model memory requirements"""
    print("\n=== Model Memory Estimation ===")
    print(f"Model: {model_name}")

    # Common model sizes (in billions of parameters)
    model_sizes = {
        "gemma-2-2b": 2.5,
        "gemma-3-1b": 1.2,
        "llama-3.2-8b": 8,
        "llama-3.2-13b": 13,
        "llama-3.2-70b": 70,
    }

    # Find model size
    model_key = None
    for key in model_sizes:
        if key in model_name.lower():
            model_key = key
            break

    if model_key:
        params_billions = model_sizes[model_key]

        # Memory estimates (rough, weights only)
        fp32_gb = params_billions * 4    # 4 bytes per parameter
        fp16_gb = params_billions * 2    # 2 bytes per parameter
        int8_gb = params_billions * 1    # 1 byte per parameter
        int4_gb = params_billions * 0.5  # 0.5 bytes per parameter

        print(f"Estimated parameters: {params_billions}B")
        print("Memory requirements:")
        print(f"  FP32: ~{fp32_gb:.1f} GB")
        print(f"  FP16/BF16: ~{fp16_gb:.1f} GB")
        print(f"  INT8: ~{int8_gb:.1f} GB")
        print(f"  INT4 (QLoRA): ~{int4_gb:.1f} GB")

        # Add overhead for activations and gradients
        print("\nWith training overhead:")
        print(f"  FP16 + LoRA: ~{fp16_gb * 1.5:.1f} GB")
        print(f"  INT4 + QLoRA: ~{int4_gb * 1.5:.1f} GB")
    else:
        print("Model size not recognized, unable to estimate memory requirements")


def suggest_offloading_strategies():
    """Suggest CPU offloading strategies"""
    print("\n=== CPU Offloading Strategies ===")

    print("\n1. **Device Map Auto with CPU Offload**")
    print("   ```python")
    print("   device_map = {")
    print("       'model.embed_tokens': 'cpu',")
    print("       'model.layers.0': 0,      # GPU")
    print("       'model.layers.1': 0,      # GPU")
    print("       'model.layers.2': 'cpu',  # CPU")
    print("       # ... distribute layers between GPU and CPU")
    print("   }")
    print("   ```")

    print("\n2. **Accelerate's CPU Offload**")
    print("   ```yaml")
    print("   model:")
    print("     device_map: 'auto'")
    print("     max_memory:")
    print("       0: '4GB'      # Limit GPU memory")
    print("       'cpu': '20GB' # Allow CPU memory")
    print("   ```")

    print("\n3. **DeepSpeed ZeRO-Offload**")
    print("   - ZeRO-2: Offload optimizer states to CPU")
    print("   - ZeRO-3: Offload optimizer states and parameters to CPU")
    print("   ```yaml")
    print("   deepspeed:")
    print("     zero_optimization:")
    print("       stage: 2")
    print("       offload_optimizer:")
    print("         device: 'cpu'")
    print("   ```")

    print("\n4. **Gradient Checkpointing + CPU Offload**")
    print("   - Trade compute for memory")
    print("   - Combine with layer-wise CPU offloading")

    print("\n5. **QLoRA with CPU Offload**")
    print("   - 4-bit quantization reduces base model size")
    print("   - Only LoRA parameters on GPU")
    print("   - Base model layers can be on CPU")


def check_config_compatibility(config_path: str):
    """Check if a config is compatible with CPU offloading"""
    print(f"\n=== Config Analysis: {config_path} ===")

    if not Path(config_path).exists():
        print("Config file not found, skipping")
        return

    with open(config_path) as f:
        config = yaml.safe_load(f)

    model_config = config.get("model", {})
    print("Current settings:")
    print(f"  4-bit quantization: {model_config.get('load_in_4bit', False)}")
    print(f"  Gradient checkpointing: {model_config.get('gradient_checkpointing', False)}")
    print(f"  Device map: {model_config.get('device_map', 'None')}")

    if model_config.get("load_in_4bit", False):
        print("✓ Already using 4-bit quantization (good for memory)")
    else:
        print("✗ Consider enabling 4-bit quantization")

    if not model_config.get("gradient_checkpointing", False):
        print("✗ Consider enabling gradient checkpointing")


def main():
    """Main function"""
    print("VRAM and Memory Analysis for Progressive LLM Training")
    print("=" * 60)

    # Get memory info
    get_memory_info()

    # Estimate model sizes
    models = [
        "google/gemma-2-2b-it",
        "google/gemma-3-1b-pt",
        "meta-llama/Llama-3.2-8B",
    ]
    for model in models:
        estimate_model_size(model)

    # Suggest strategies
    suggest_offloading_strategies()

    # Check configs
    configs = [
        "config/training_config_gemma3_1b.yaml",
        "config/training_config_gemma2_small.yaml",
    ]
    for config in configs:
        check_config_compatibility(config)

    print("\n=== Recommendations ===")
    print("1. Start with QLoRA (4-bit) if not already enabled")
    print("2. Use device_map with max_memory limits")
    print("3. Enable gradient checkpointing")
    print("4. Consider DeepSpeed for advanced offloading")
    print("5. Monitor actual usage during training")


if __name__ == "__main__":
    main()