#!/usr/bin/env python3
"""
Check VRAM usage and model memory requirements
"""

import sys
from pathlib import Path

import psutil
import torch
import yaml

# Add the repository root to sys.path so local packages (e.g. src) are importable
sys.path.append(str(Path(__file__).parent.parent))

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def get_memory_info():
    """Print current GPU and system memory usage"""
    if torch.cuda.is_available():
        print("=== CUDA Information ===")
        print(f"CUDA available: {torch.cuda.is_available()}")
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA device count: {torch.cuda.device_count()}")

        # VRAM stats from PyTorch's caching allocator (covers this process only)
        vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        vram_reserved = torch.cuda.memory_reserved(0) / 1024**3
        vram_allocated = torch.cuda.memory_allocated(0) / 1024**3
        # Reserved-but-unallocated memory is not available to other processes,
        # so "free" is computed against reserved rather than allocated memory
        vram_free = vram_total - vram_reserved

        print("\n=== VRAM Usage ===")
        print(f"Total VRAM: {vram_total:.2f} GB")
        print(f"Allocated VRAM: {vram_allocated:.2f} GB")
        print(f"Reserved VRAM: {vram_reserved:.2f} GB")
        print(f"Free VRAM: {vram_free:.2f} GB")
    else:
        print("CUDA not available!")

    # Get system RAM info
    ram = psutil.virtual_memory()
    print("\n=== System RAM ===")
    print(f"Total RAM: {ram.total / 1024**3:.2f} GB")
    print(f"Available RAM: {ram.available / 1024**3:.2f} GB")
    print(f"Used RAM: {ram.used / 1024**3:.2f} GB ({ram.percent}%)")


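# Optional cross-check (not wired into main()): the stats above come from PyTorch's
# caching allocator and only cover this process. torch.cuda.mem_get_info() reports
# free/total memory as seen by the CUDA driver, which also reflects other processes
# sharing the GPU.
def get_driver_memory_info():
    """Print driver-level free/total VRAM for device 0, if CUDA is available."""
    if torch.cuda.is_available():
        free_bytes, total_bytes = torch.cuda.mem_get_info(0)
        print(
            f"Driver-reported free VRAM: {free_bytes / 1024**3:.2f} GB "
            f"of {total_bytes / 1024**3:.2f} GB"
        )

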
def estimate_model_size(model_name: str):
    """Estimate model memory requirements from the parameter count"""
    print("\n=== Model Memory Estimation ===")
    print(f"Model: {model_name}")

    # Approximate model sizes (in billions of parameters)
    model_sizes = {
        "gemma-2-2b": 2.5,
        "gemma-3-1b": 1.2,
        "llama-3.2-8b": 8,
        "llama-3.2-13b": 13,
        "llama-3.2-70b": 70,
    }

    # Match the model name against the known sizes
    model_key = None
    for key in model_sizes:
        if key in model_name.lower():
            model_key = key
            break

    if model_key:
        params_billions = model_sizes[model_key]

        # Rough memory estimates for the weights alone
        fp32_gb = params_billions * 4    # 4 bytes per parameter
        fp16_gb = params_billions * 2    # 2 bytes per parameter
        int8_gb = params_billions * 1    # 1 byte per parameter
        int4_gb = params_billions * 0.5  # 0.5 bytes per parameter

        print(f"Estimated parameters: {params_billions}B")
        print("Memory requirements:")
        print(f"  FP32: ~{fp32_gb:.1f} GB")
        print(f"  FP16/BF16: ~{fp16_gb:.1f} GB")
        print(f"  INT8: ~{int8_gb:.1f} GB")
        print(f"  INT4 (QLoRA): ~{int4_gb:.1f} GB")

        # Add overhead for activations and gradients
        print("\nWith training overhead:")
        print(f"  FP16 + LoRA: ~{fp16_gb * 1.5:.1f} GB")
        print(f"  INT4 + QLoRA: ~{int4_gb * 1.5:.1f} GB")
    else:
        print("Model size not recognized; unable to estimate memory requirements")


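# Optional helper for comparing the rough estimates above with a measured footprint.
# It assumes a model instance is already loaded in memory (e.g. the result of
# AutoModelForCausalLM.from_pretrained); it is not called by main().
def measure_model_footprint(model):
    """Print the actual parameter count and weight memory of a loaded model."""
    n_params = sum(p.numel() for p in model.parameters())
    weight_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    print(f"Measured parameters: {n_params / 1e9:.2f}B")
    print(f"Measured weight memory: {weight_bytes / 1024**3:.2f} GB")

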
def suggest_offloading_strategies():
    """Suggest CPU offloading strategies"""
    print("\n=== CPU Offloading Strategies ===")

    print("\n1. **Device Map Auto with CPU Offload**")
    print("   ```python")
    print("   device_map = {")
    print("       'model.embed_tokens': 'cpu',")
    print("       'model.layers.0': 0,      # GPU")
    print("       'model.layers.1': 0,      # GPU")
    print("       'model.layers.2': 'cpu',  # CPU")
    print("       # ... distribute layers between GPU and CPU")
    print("   }")
    print("   ```")

    print("\n2. **Accelerate's CPU Offload**")
    print("   ```yaml")
    print("   model:")
    print("     device_map: 'auto'")
    print("     max_memory:")
    print("       0: '4GB'       # Limit GPU memory")
    print("       'cpu': '20GB'  # Allow CPU memory")
    print("   ```")

    print("\n3. **DeepSpeed ZeRO-Offload**")
    print("   - ZeRO-2: Offload optimizer states to CPU")
    print("   - ZeRO-3: Offload optimizer states and parameters to CPU")
    print("   ```yaml")
    print("   deepspeed:")
    print("     zero_optimization:")
    print("       stage: 2")
    print("       offload_optimizer:")
    print("         device: 'cpu'")
    print("   ```")

    print("\n4. **Gradient Checkpointing + CPU Offload**")
    print("   - Trade compute for memory")
    print("   - Combine with layer-wise CPU offloading")

    print("\n5. **QLoRA with CPU Offload**")
    print("   - 4-bit quantization reduces base model size")
    print("   - Only LoRA parameters on GPU")
    print("   - Base model layers can be on CPU")


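# Illustrative sketch of strategies 2 and 5 above using the transformers API directly.
# It is not called by main(); the model name and memory limits are placeholders taken
# from the examples above and should be adjusted to the actual hardware and model.
def load_model_with_cpu_offload(model_name: str = "google/gemma-2-2b-it"):
    """Load a model in 4-bit with a GPU memory cap, spilling remaining layers to CPU."""
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",                      # let accelerate place layers on GPU/CPU
        max_memory={0: "4GB", "cpu": "20GB"},   # cap GPU usage, allow CPU spill
    )
    return model

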
def check_config_compatibility(config_path: str):
    """Check if a training config is compatible with CPU offloading"""
    if not Path(config_path).exists():
        print(f"\nConfig not found, skipping: {config_path}")
        return

    with open(config_path) as f:
        config = yaml.safe_load(f)

    print(f"\n=== Config Analysis: {config_path} ===")
    model_config = config.get("model", {})

    print("Current settings:")
    print(f"  4-bit quantization: {model_config.get('load_in_4bit', False)}")
    print(f"  Gradient checkpointing: {model_config.get('gradient_checkpointing', False)}")
    print(f"  Device map: {model_config.get('device_map', 'None')}")

    if model_config.get("load_in_4bit", False):
        print("✓ Already using 4-bit quantization (good for memory)")
    else:
        print("✗ Consider enabling 4-bit quantization")

    if not model_config.get("gradient_checkpointing", False):
        print("✗ Consider enabling gradient checkpointing")


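# For reference, a config "model" section that passes all of the checks above might
# look like the dict below (key names follow what check_config_compatibility reads;
# the full schema of this repository's configs may contain additional fields).
EXAMPLE_OFFLOAD_FRIENDLY_MODEL_CONFIG = {
    "model": {
        "load_in_4bit": True,
        "gradient_checkpointing": True,
        "device_map": "auto",
    }
}

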
def main():
    """Main function"""
    print("VRAM and Memory Analysis for Progressive LLM Training")
    print("=" * 60)

    # Get memory info
    get_memory_info()

    # Estimate model sizes
    models = [
        "google/gemma-2-2b-it",
        "google/gemma-3-1b-pt",
        "meta-llama/Llama-3.2-8B",
    ]
    for model in models:
        estimate_model_size(model)

    # Suggest strategies
    suggest_offloading_strategies()

    # Check configs
    configs = [
        "config/training_config_gemma3_1b.yaml",
        "config/training_config_gemma2_small.yaml",
    ]
    for config in configs:
        check_config_compatibility(config)

    print("\n=== Recommendations ===")
    print("1. Start with QLoRA (4-bit) if not already enabled")
    print("2. Use device_map with max_memory limits")
    print("3. Enable gradient checkpointing")
    print("4. Consider DeepSpeed for advanced offloading")
    print("5. Monitor actual usage during training")


if __name__ == "__main__":
    main()