#!/usr/bin/env python3
"""
Check VRAM usage and model memory requirements
"""

import sys
from pathlib import Path

import psutil
import torch
import yaml

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def get_memory_info():
    """Get current memory usage"""
    if torch.cuda.is_available():
        print("=== CUDA Information ===")
        print(f"CUDA available: {torch.cuda.is_available()}")
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA device count: {torch.cuda.device_count()}")

        # Get VRAM info
        vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        vram_reserved = torch.cuda.memory_reserved(0) / 1024**3
        vram_allocated = torch.cuda.memory_allocated(0) / 1024**3
        vram_free = vram_total - vram_allocated

        print("\n=== VRAM Usage ===")
        print(f"Total VRAM: {vram_total:.2f} GB")
        print(f"Allocated VRAM: {vram_allocated:.2f} GB")
        print(f"Reserved VRAM: {vram_reserved:.2f} GB")
        print(f"Free VRAM: {vram_free:.2f} GB")
    else:
        print("CUDA not available!")

    # Get system RAM info
    ram = psutil.virtual_memory()
    print("\n=== System RAM ===")
    print(f"Total RAM: {ram.total / 1024**3:.2f} GB")
    print(f"Available RAM: {ram.available / 1024**3:.2f} GB")
    print(f"Used RAM: {ram.used / 1024**3:.2f} GB ({ram.percent}%)")


def estimate_model_size(model_name: str, quantization: str = None):
    """Estimate model memory requirements"""
    print("\n=== Model Memory Estimation ===")
    print(f"Model: {model_name}")

    # Common model sizes (in billions of parameters)
    model_sizes = {
        "gemma-2-2b": 2.5,
        "gemma-3-1b": 1.2,
        "llama-3.2-8b": 8,
        "llama-3.2-13b": 13,
        "llama-3.2-70b": 70,
    }

    # Find model size
    model_key = None
    for key in model_sizes:
        if key in model_name.lower():
            model_key = key
            break

    if model_key:
        params_billions = model_sizes[model_key]

        # Memory estimates (rough)
        fp32_gb = params_billions * 4     # 4 bytes per parameter
        fp16_gb = params_billions * 2     # 2 bytes per parameter
        int8_gb = params_billions * 1     # 1 byte per parameter
        int4_gb = params_billions * 0.5   # 0.5 bytes per parameter

        print(f"Estimated parameters: {params_billions}B")
        print("Memory requirements:")
        print(f"  FP32: ~{fp32_gb:.1f} GB")
        print(f"  FP16/BF16: ~{fp16_gb:.1f} GB")
        print(f"  INT8: ~{int8_gb:.1f} GB")
        print(f"  INT4 (QLoRA): ~{int4_gb:.1f} GB")

        # Add overhead for activations and gradients
        print("\nWith training overhead:")
        print(f"  FP16 + LoRA: ~{fp16_gb * 1.5:.1f} GB")
        print(f"  INT4 + QLoRA: ~{int4_gb * 1.5:.1f} GB")
    else:
        print("Model size not recognized, unable to estimate memory requirements")


def suggest_offloading_strategies():
    """Suggest CPU offloading strategies"""
    print("\n=== CPU Offloading Strategies ===")

    print("\n1. **Device Map Auto with CPU Offload**")
    print("   ```python")
    print("   device_map = {")
    print("       'model.embed_tokens': 'cpu',")
    print("       'model.layers.0': 0,      # GPU")
    print("       'model.layers.1': 0,      # GPU")
    print("       'model.layers.2': 'cpu',  # CPU")
    print("       # ... distribute layers between GPU and CPU")
    print("   }")
    print("   ```")

    print("\n2. **Accelerate's CPU Offload**")
    print("   ```yaml")
    print("   model:")
    print("     device_map: 'auto'")
    print("     max_memory:")
    print("       0: '4GB'       # Limit GPU memory")
    print("       'cpu': '20GB'  # Allow CPU memory")
    print("   ```")

    print("\n3. **DeepSpeed ZeRO-Offload**")
    print("   - ZeRO-2: Offload optimizer states to CPU")
    print("   - ZeRO-3: Offload optimizer states and parameters to CPU")
    print("   ```yaml")
    print("   deepspeed:")
    print("     zero_optimization:")
    print("       stage: 2")
    print("       offload_optimizer:")
    print("         device: 'cpu'")
    print("   ```")

    print("\n4. **Gradient Checkpointing + CPU Offload**")
    print("   - Trade compute for memory")
    print("   - Combine with layer-wise CPU offloading")

    print("\n5. **QLoRA with CPU Offload**")
    print("   - 4-bit quantization reduces base model size")
    print("   - Only LoRA parameters on GPU")
    print("   - Base model layers can be on CPU")
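
# ---------------------------------------------------------------------------
# Illustrative helper (not called by main()): a minimal sketch of strategies
# 2 and 5 above -- loading a model in 4-bit with a max_memory map so layers
# that exceed the GPU budget are placed on CPU by accelerate. The default
# model name is one of the models listed in main(); the memory limits are
# placeholder assumptions and should be tuned to the actual hardware.
# ---------------------------------------------------------------------------
def load_model_with_cpu_offload(model_name: str = "google/gemma-2-2b-it",
                                gpu_limit: str = "4GiB",
                                cpu_limit: str = "20GiB"):
    """Sketch: load a causal LM in 4-bit with automatic GPU/CPU placement."""
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        # Keep any CPU-offloaded modules in full precision (required by
        # bitsandbytes when parts of a quantized model leave the GPU).
        llm_int8_enable_fp32_cpu_offload=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",                             # let accelerate place layers
        max_memory={0: gpu_limit, "cpu": cpu_limit},   # cap GPU, spill rest to CPU
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer
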
**DeepSpeed ZeRO-Offload**") print(" - ZeRO-2: Offload optimizer states to CPU") print(" - ZeRO-3: Offload optimizer states and parameters to CPU") print(" ```yaml") print(" deepspeed:") print(" zero_optimization:") print(" stage: 2") print(" offload_optimizer:") print(" device: 'cpu'") print(" ```") print("\n4. **Gradient Checkpointing + CPU Offload**") print(" - Trade compute for memory") print(" - Combine with layer-wise CPU offloading") print("\n5. **QLoRA with CPU Offload**") print(" - 4-bit quantization reduces base model size") print(" - Only LoRA parameters on GPU") print(" - Base model layers can be on CPU") def check_config_compatibility(config_path: str): """Check if config is compatible with CPU offloading""" if Path(config_path).exists(): with open(config_path) as f: config = yaml.safe_load(f) print(f"\n=== Config Analysis: {config_path} ===") model_config = config.get("model", {}) print(f"Current settings:") print(f" 4-bit quantization: {model_config.get('load_in_4bit', False)}") print(f" Gradient checkpointing: {model_config.get('gradient_checkpointing', False)}") print(f" Device map: {model_config.get('device_map', 'None')}") if model_config.get('load_in_4bit', False): print("✓ Already using 4-bit quantization (good for memory)") else: print("✗ Consider enabling 4-bit quantization") if not model_config.get('gradient_checkpointing', False): print("✗ Consider enabling gradient checkpointing") def main(): """Main function""" print("VRAM and Memory Analysis for Progressive LLM Training") print("=" * 60) # Get memory info get_memory_info() # Estimate model sizes models = [ "google/gemma-2-2b-it", "google/gemma-3-1b-pt", "meta-llama/Llama-3.2-8B", ] for model in models: estimate_model_size(model) # Suggest strategies suggest_offloading_strategies() # Check configs configs = [ "config/training_config_gemma3_1b.yaml", "config/training_config_gemma2_small.yaml", ] for config in configs: check_config_compatibility(config) print("\n=== Recommendations ===") print("1. Start with QLoRA (4-bit) if not already enabled") print("2. Use device_map with max_memory limits") print("3. Enable gradient checkpointing") print("4. Consider DeepSpeed for advanced offloading") print("5. Monitor actual usage during training") if __name__ == "__main__": main()