#!/usr/bin/env python3
"""
Main training script for the progressive reasoning model.
"""

import argparse
import os
import sys
from pathlib import Path

import yaml

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))

from src.progressive_model import ProgressiveReasoningModel
from src.training import ProgressiveTrainer
from src.data_utils import prepare_sample_datasets


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(
        description="Progressive LLM Training for 松尾研LLMコンペ2025",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Use default config
  python scripts/train_progressive.py

  # Use specific config file
  python scripts/train_progressive.py --config config/training_config_large.yaml

  # Use config with custom path
  python scripts/train_progressive.py --config /path/to/my_config.yaml

  # Prepare sample datasets
  python scripts/train_progressive.py --prepare-data
"""
    )

    parser.add_argument(
        "--config", "-c",
        type=str,
        default="config/training_config.yaml",
        help="Path to the training configuration file (default: config/training_config.yaml)"
    )

    parser.add_argument(
        "--prepare-data",
        action="store_true",
        help="Prepare sample datasets before training"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Load config and model but skip training (for testing)"
    )

    parser.add_argument(
        "--distributed",
        action="store_true",
        help="Enable distributed training"
    )

    parser.add_argument(
        "--deepspeed",
        action="store_true",
        help="Enable DeepSpeed training"
    )

    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training"
    )

    return parser.parse_args()


def load_config(config_path: str) -> dict:
    """Load configuration from file"""
    config_path = Path(config_path)

    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    print(f"Loading configuration from: {config_path}")

    with open(config_path) as f:
        config = yaml.safe_load(f)

    return config


def setup_distributed_training(local_rank=-1):
    """Setup distributed training environment"""
    import torch

    # Check if we're in a distributed environment
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        import torch.distributed as dist

        # Initialize distributed training
        if not dist.is_initialized():
            dist.init_process_group(backend="nccl")

        # Use local_rank from args or environment
        if local_rank >= 0:
            torch.cuda.set_device(local_rank)
        else:
            local_rank = int(os.environ.get("LOCAL_RANK", 0))
            torch.cuda.set_device(local_rank)

        print(f"Distributed training initialized: rank {dist.get_rank()}/{dist.get_world_size()}, local_rank {local_rank}")
        return True

    # For DeepSpeed, local_rank might be set even without RANK/WORLD_SIZE initially
    elif local_rank >= 0:
        torch.cuda.set_device(local_rank)
        print(f"Set CUDA device to local_rank {local_rank}")
        return True

    return False
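
# Illustrative launch commands (assumed launchers; not prescribed by this repo):
#   single process : python scripts/train_progressive.py --config config/training_config.yaml
#   torchrun       : torchrun --nproc_per_node=<num_gpus> scripts/train_progressive.py --distributed
#                    (torchrun exports RANK/WORLD_SIZE/LOCAL_RANK, which the check above relies on)
#   deepspeed      : deepspeed scripts/train_progressive.py --deepspeed
#                    (the DeepSpeed launcher typically supplies --local_rank per process)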


def main():
    args = parse_args()

    # Setup distributed training if requested
    is_distributed = False
    if args.distributed or args.deepspeed:
        is_distributed = setup_distributed_training(args.local_rank)

    print("Progressive LLM Training for 松尾研LLMコンペ2025")
    print("=" * 50)

    # Load configuration
    try:
        config = load_config(args.config)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Available config files:")
        config_dir = Path("config")
        if config_dir.exists():
            for config_file in config_dir.glob("*.yaml"):
                print(f"  {config_file}")
        sys.exit(1)
    except Exception as e:
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Add distributed/deepspeed flags to config
    config["training_args"] = config.get("training_args", {})
    if args.distributed:
        config["training_args"]["distributed"] = True
    if args.deepspeed:
        config["training_args"]["deepspeed"] = True
        # Add DeepSpeed config from main config
        if "deepspeed" in config:
            config["training_args"]["deepspeed_config"] = config["deepspeed"]

    # Print configuration info
    print(f"Experiment: {config['experiment']['name']}")
    print(f"Base model: {config['experiment']['base_model']}")
    print(f"Output directory: {config['experiment']['output_dir']}")
    print(f"Stages: {len(config['progressive_stages'])}")
    if is_distributed:
        print("Mode: Distributed Training")
        if args.deepspeed:
            print("Backend: DeepSpeed")
    print("=" * 50)

    # Prepare sample datasets if requested
    if args.prepare_data:
        print("\nPreparing sample datasets...")
        prepare_sample_datasets()
        print("Sample datasets prepared.")

    # Initialize model wrapper
    print("\nInitializing model...")
    model_wrapper = ProgressiveReasoningModel(config)
    model_wrapper.setup_base_model()

    if args.dry_run:
        print("\nDry run completed. Model loaded successfully.")
        return

    # Initialize trainer
    print("\nInitializing trainer...")
    trainer = ProgressiveTrainer(model_wrapper, config)

    # Run progressive training
    print("\nStarting progressive training...")
    trainer.run_progressive_training()

    print("\nTraining completed successfully!")


if __name__ == "__main__":
    main()