#!/usr/bin/env python3
"""
Main training script for progressive reasoning model
"""

import sys
import yaml
import argparse
import os
from pathlib import Path

# Add src to path
sys.path.append(str(Path(__file__).parent.parent))

from src.progressive_model import ProgressiveReasoningModel
from src.training import ProgressiveTrainer
from src.data_utils import prepare_sample_datasets


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(
        description="Progressive LLM Training for 松尾研LLMコンペ2025",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Use default config
  python scripts/train_progressive.py

  # Use specific config file
  python scripts/train_progressive.py --config config/training_config_large.yaml

  # Use config with custom path
  python scripts/train_progressive.py --config /path/to/my_config.yaml

  # Prepare sample datasets
  python scripts/train_progressive.py --prepare-data
"""
    )

    parser.add_argument(
        "--config", "-c",
        type=str,
        default="config/training_config.yaml",
        help="Path to the training configuration file (default: config/training_config.yaml)"
    )

    parser.add_argument(
        "--prepare-data",
        action="store_true",
        help="Prepare sample datasets before training"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Load config and model but skip training (for testing)"
    )

    parser.add_argument(
        "--distributed",
        action="store_true",
        help="Enable distributed training"
    )

    parser.add_argument(
        "--deepspeed",
        action="store_true",
        help="Enable DeepSpeed training"
    )

    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training"
    )

    return parser.parse_args()


def load_config(config_path: str) -> dict:
    """Load configuration from file"""
    config_path = Path(config_path)

    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    print(f"Loading configuration from: {config_path}")
    with open(config_path) as f:
        config = yaml.safe_load(f)

    return config


def setup_distributed_training(local_rank=-1):
    """Setup distributed training environment"""
    import torch

    # Check if we're in a distributed environment
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        import torch.distributed as dist

        # Use local_rank from args or environment
        if local_rank >= 0:
            torch.cuda.set_device(local_rank)
        else:
            local_rank = int(os.environ.get("LOCAL_RANK", 0))
            torch.cuda.set_device(local_rank)

        # Initialize distributed training
        if not dist.is_initialized():
            dist.init_process_group(backend="nccl")
            print(
                f"Distributed training initialized: "
                f"rank {dist.get_rank()}/{dist.get_world_size()}, local_rank {local_rank}"
            )
        return True

    # For DeepSpeed, local_rank might be set even without RANK/WORLD_SIZE initially
    elif local_rank >= 0:
        torch.cuda.set_device(local_rank)
        print(f"Set CUDA device to local_rank {local_rank}")
        return True

    return False


def main():
    args = parse_args()

    # Setup distributed training if requested
    is_distributed = False
    if args.distributed or args.deepspeed:
        is_distributed = setup_distributed_training(args.local_rank)

    print("Progressive LLM Training for 松尾研LLMコンペ2025")
    print("=" * 50)

    # Load configuration
    try:
        config = load_config(args.config)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Available config files:")
        config_dir = Path("config")
        if config_dir.exists():
            for config_file in config_dir.glob("*.yaml"):
                print(f"  {config_file}")
        sys.exit(1)
    except Exception as e:
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Add distributed/deepspeed flags to config
config["training_args"] = config.get("training_args", {}) if args.distributed: config["training_args"]["distributed"] = True if args.deepspeed: config["training_args"]["deepspeed"] = True # Add DeepSpeed config from main config if "deepspeed" in config: config["training_args"]["deepspeed_config"] = config["deepspeed"] # Print configuration info print(f"Experiment: {config['experiment']['name']}") print(f"Base model: {config['experiment']['base_model']}") print(f"Output directory: {config['experiment']['output_dir']}") print(f"Stages: {len(config['progressive_stages'])}") if is_distributed: print("Mode: Distributed Training") if args.deepspeed: print("Backend: DeepSpeed") print("=" * 50) # Prepare sample datasets if requested if args.prepare_data: print("\nPreparing sample datasets...") prepare_sample_datasets() print("Sample datasets prepared.") # Initialize model wrapper print("\nInitializing model...") model_wrapper = ProgressiveReasoningModel(config) model_wrapper.setup_base_model() if args.dry_run: print("\nDry run completed. Model loaded successfully.") return # Initialize trainer print("\nInitializing trainer...") trainer = ProgressiveTrainer(model_wrapper, config) # Run progressive training print("\nStarting progressive training...") trainer.run_progressive_training() print("\nTraining completed successfully!") if __name__ == "__main__": main()