diff --git a/config/training_config_gemma3_1b_8gpu_ddp.yaml b/config/training_config_gemma3_1b_8gpu_ddp.yaml
index 847acfb..527c9a8 100644
--- a/config/training_config_gemma3_1b_8gpu_ddp.yaml
+++ b/config/training_config_gemma3_1b_8gpu_ddp.yaml
@@ -32,7 +32,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 2
-      per_device_batch_size: 32 # 32 * 8 = 256 total batch size
+      per_device_batch_size: 8 # 8 * 8 = 64 total batch size (reduced for memory)
       gradient_accumulation_steps: 1
       learning_rate: 5e-4
       warmup_steps: 100
@@ -43,7 +43,7 @@
       save_steps: 50
       logging_steps: 10
       dataloader_num_workers: 4
-      dataloader_pin_memory: true
+      dataloader_pin_memory: false
 
   - name: "math_reasoning"
     description: "Mathematical reasoning with OpenR1-Math-220k dataset"
@@ -57,7 +57,7 @@
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 16 # 16 * 8 = 128 total batch size
+      per_device_batch_size: 4 # 4 * 8 = 32 total batch size (reduced for memory)
       gradient_accumulation_steps: 2
       learning_rate: 3e-4
       warmup_steps: 200
@@ -85,7 +85,7 @@
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 8 # 8 * 8 = 64 total batch size
+      per_device_batch_size: 2 # 2 * 8 = 16 total batch size (reduced for memory)
       gradient_accumulation_steps: 4
       learning_rate: 2e-4
       warmup_steps: 300
diff --git a/src/progressive_model.py b/src/progressive_model.py
index 99d3876..b6995ed 100644
--- a/src/progressive_model.py
+++ b/src/progressive_model.py
@@ -353,6 +353,11 @@ class ProgressiveReasoningModel:
         # Ensure model is in training mode
         self.model.train()
 
+        # Explicitly enable gradients for LoRA parameters
+        for name, param in self.model.named_parameters():
+            if "lora_" in name:
+                param.requires_grad_(True)
+
         # Print trainable parameters
         self.model.print_trainable_parameters()
 
@@ -368,6 +373,11 @@
             print(f"First few: {grad_params[:5]}")
         else:
             print("WARNING: No parameters require gradients!")
+            # Force enable gradients for all LoRA parameters
+            for name, param in self.model.named_parameters():
+                if "lora_" in name:
+                    param.requires_grad_(True)
+                    print(f"Enabled gradient for: {name}")
 
         # Save adapter path
         adapter_path = self.output_dir / "adapters" / stage_name
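
For reference, below is a minimal, standalone sketch of the gradient re-enable check added in the patch. It is illustrative only: `ensure_lora_grads` and `ToyLoraLinear` are hypothetical names, and the toy module merely mimics PEFT's convention of naming LoRA parameters with a "lora_" prefix, which is what the substring check relies on.

```python
import torch
import torch.nn as nn


def ensure_lora_grads(model: nn.Module) -> list:
    """Turn requires_grad back on for every LoRA parameter and return their names."""
    enabled = []
    for name, param in model.named_parameters():
        if "lora_" in name and not param.requires_grad:
            param.requires_grad_(True)
            enabled.append(name)
    return enabled


class ToyLoraLinear(nn.Module):
    """Toy stand-in for a PEFT-wrapped layer (lora_A / lora_B naming)."""

    def __init__(self, dim: int = 8, rank: int = 2):
        super().__init__()
        self.base = nn.Linear(dim, dim)
        # Frozen on purpose, to simulate the "no parameters require gradients" case.
        self.lora_A = nn.Parameter(torch.zeros(rank, dim), requires_grad=False)
        self.lora_B = nn.Parameter(torch.zeros(dim, rank), requires_grad=False)


model = ToyLoraLinear()
print(ensure_lora_grads(model))  # ['lora_A', 'lora_B']
```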