#!/usr/bin/env python3
"""
Check VRAM usage and model memory requirements
"""
import torch
import psutil
import sys
from pathlib import Path
import yaml

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def get_memory_info():
    """Get current memory usage"""
    if torch.cuda.is_available():
        print("=== CUDA Information ===")
        print(f"CUDA available: {torch.cuda.is_available()}")
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA device count: {torch.cuda.device_count()}")

        # Get VRAM info
        vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        vram_reserved = torch.cuda.memory_reserved(0) / 1024**3
        vram_allocated = torch.cuda.memory_allocated(0) / 1024**3
        vram_free = vram_total - vram_allocated

        print("\n=== VRAM Usage ===")
        print(f"Total VRAM: {vram_total:.2f} GB")
        print(f"Allocated VRAM: {vram_allocated:.2f} GB")
        print(f"Reserved VRAM: {vram_reserved:.2f} GB")
        print(f"Free VRAM: {vram_free:.2f} GB")
    else:
        print("CUDA not available!")

    # Get system RAM info
    ram = psutil.virtual_memory()
    print("\n=== System RAM ===")
    print(f"Total RAM: {ram.total / 1024**3:.2f} GB")
    print(f"Available RAM: {ram.available / 1024**3:.2f} GB")
    print(f"Used RAM: {ram.used / 1024**3:.2f} GB ({ram.percent}%)")


def estimate_model_size(model_name: str):
    """Estimate model memory requirements"""
    print("\n=== Model Memory Estimation ===")
    print(f"Model: {model_name}")

    # Common model sizes (in billions of parameters)
    model_sizes = {
        "gemma-2-2b": 2.5,
        "gemma-3-1b": 1.2,
        "llama-3.2-8b": 8,
        "llama-3.2-13b": 13,
        "llama-3.2-70b": 70,
    }

    # Find model size
    model_key = None
    for key in model_sizes:
        if key in model_name.lower():
            model_key = key
            break

    if model_key:
        params_billions = model_sizes[model_key]

        # Memory estimates (rough, weights only)
        fp32_gb = params_billions * 4    # 4 bytes per parameter
        fp16_gb = params_billions * 2    # 2 bytes per parameter
        int8_gb = params_billions * 1    # 1 byte per parameter
        int4_gb = params_billions * 0.5  # 0.5 bytes per parameter

        print(f"Estimated parameters: {params_billions}B")
        print("Memory requirements:")
        print(f"  FP32: ~{fp32_gb:.1f} GB")
        print(f"  FP16/BF16: ~{fp16_gb:.1f} GB")
        print(f"  INT8: ~{int8_gb:.1f} GB")
        print(f"  INT4 (QLoRA): ~{int4_gb:.1f} GB")

        # Add overhead for activations and gradients
        print("\nWith training overhead:")
        print(f"  FP16 + LoRA: ~{fp16_gb * 1.5:.1f} GB")
        print(f"  INT4 + QLoRA: ~{int4_gb * 1.5:.1f} GB")
    else:
        print("Model size not recognized, unable to estimate memory requirements")


def suggest_offloading_strategies():
    """Suggest CPU offloading strategies"""
    print("\n=== CPU Offloading Strategies ===")

    print("\n1. **Device Map Auto with CPU Offload**")
    print("   ```python")
    print("   device_map = {")
    print("       'model.embed_tokens': 'cpu',")
    print("       'model.layers.0': 0,      # GPU")
    print("       'model.layers.1': 0,      # GPU")
    print("       'model.layers.2': 'cpu',  # CPU")
    print("       # ... distribute layers between GPU and CPU")
    print("   }")
    print("   ```")

    print("\n2. **Accelerate's CPU Offload**")
    print("   ```yaml")
    print("   model:")
    print("     device_map: 'auto'")
    print("     max_memory:")
    print("       0: '4GB'      # Limit GPU memory")
    print("       'cpu': '20GB' # Allow CPU memory")
    print("   ```")

    print("\n3. **DeepSpeed ZeRO-Offload**")
    print("   - ZeRO-2: Offload optimizer states to CPU")
    print("   - ZeRO-3: Offload optimizer states and parameters to CPU")
    print("   ```yaml")
    print("   deepspeed:")
    print("     zero_optimization:")
    print("       stage: 2")
    print("       offload_optimizer:")
    print("         device: 'cpu'")
    print("   ```")

    print("\n4. **Gradient Checkpointing + CPU Offload**")
    print("   - Trade compute for memory")
    print("   - Combine with layer-wise CPU offloading")

    print("\n5. **QLoRA with CPU Offload**")
    print("   - 4-bit quantization reduces base model size")
    print("   - Only LoRA parameters on GPU")
    print("   - Base model layers can be on CPU")


def check_config_compatibility(config_path: str):
    """Check if a config is compatible with CPU offloading"""
    print(f"\n=== Config Analysis: {config_path} ===")

    if not Path(config_path).exists():
        print("Config file not found, skipping")
        return

    with open(config_path) as f:
        config = yaml.safe_load(f)

    model_config = config.get("model", {})
    print("Current settings:")
    print(f"  4-bit quantization: {model_config.get('load_in_4bit', False)}")
    print(f"  Gradient checkpointing: {model_config.get('gradient_checkpointing', False)}")
    print(f"  Device map: {model_config.get('device_map', 'None')}")

    if model_config.get("load_in_4bit", False):
        print("✓ Already using 4-bit quantization (good for memory)")
    else:
        print("✗ Consider enabling 4-bit quantization")

    if not model_config.get("gradient_checkpointing", False):
        print("✗ Consider enabling gradient checkpointing")


def main():
    """Main function"""
    print("VRAM and Memory Analysis for Progressive LLM Training")
    print("=" * 60)

    # Get memory info
    get_memory_info()

    # Estimate model sizes
    models = [
        "google/gemma-2-2b-it",
        "google/gemma-3-1b-pt",
        "meta-llama/Llama-3.2-8B",
    ]
    for model in models:
        estimate_model_size(model)

    # Suggest strategies
    suggest_offloading_strategies()

    # Check configs
    configs = [
        "config/training_config_gemma3_1b.yaml",
        "config/training_config_gemma2_small.yaml",
    ]
    for config in configs:
        check_config_compatibility(config)

    print("\n=== Recommendations ===")
    print("1. Start with QLoRA (4-bit) if not already enabled")
    print("2. Use device_map with max_memory limits")
    print("3. Enable gradient checkpointing")
    print("4. Consider DeepSpeed for advanced offloading")
    print("5. Monitor actual usage during training")


if __name__ == "__main__":
    main()