grad-repair

Soma Nakamura 2025-07-10 23:05:39 +09:00
parent 7cbf916d2b
commit 59c3bcfc7d
2 changed files with 14 additions and 4 deletions

View file

@@ -32,7 +32,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 2
-      per_device_batch_size: 32  # 32 * 8 = 256 total batch size
+      per_device_batch_size: 8  # 8 * 8 = 64 total batch size (reduced for memory)
       gradient_accumulation_steps: 1
       learning_rate: 5e-4
       warmup_steps: 100
@@ -43,7 +43,7 @@ progressive_stages:
       save_steps: 50
       logging_steps: 10
       dataloader_num_workers: 4
-      dataloader_pin_memory: true
+      dataloader_pin_memory: false

   - name: "math_reasoning"
     description: "Mathematical reasoning with OpenR1-Math-220k dataset"
@@ -57,7 +57,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 16  # 16 * 8 = 128 total batch size
+      per_device_batch_size: 4  # 4 * 8 = 32 total batch size (reduced for memory)
       gradient_accumulation_steps: 2
       learning_rate: 3e-4
       warmup_steps: 200
@@ -85,7 +85,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 8  # 8 * 8 = 64 total batch size
+      per_device_batch_size: 2  # 2 * 8 = 16 total batch size (reduced for memory)
       gradient_accumulation_steps: 4
       learning_rate: 2e-4
       warmup_steps: 300
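Note: the inline comments count only per-device batch times the 8 devices; the effective optimizer batch also multiplies by gradient_accumulation_steps, so after this change all three stages land on the same effective batch of 64 per optimizer step (down from 256). A quick check, assuming 8 devices as in the comments:

NUM_DEVICES = 8

def effective_batch_size(per_device: int, accum_steps: int) -> int:
    # per-device batch * number of devices * gradient accumulation steps
    return per_device * NUM_DEVICES * accum_steps

# (per_device_batch_size, gradient_accumulation_steps) from the updated stages
for per_device, accum in [(8, 1), (4, 2), (2, 4)]:
    print(per_device, accum, effective_batch_size(per_device, accum))  # 64 in each case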

View file

@@ -353,6 +353,11 @@ class ProgressiveReasoningModel:
         # Ensure model is in training mode
         self.model.train()
+
+        # Explicitly enable gradients for LoRA parameters
+        for name, param in self.model.named_parameters():
+            if "lora_" in name:
+                param.requires_grad_(True)

         # Print trainable parameters
         self.model.print_trainable_parameters()
@@ -368,6 +373,11 @@ class ProgressiveReasoningModel:
             print(f"First few: {grad_params[:5]}")
         else:
             print("WARNING: No parameters require gradients!")
+            # Force enable gradients for all LoRA parameters
+            for name, param in self.model.named_parameters():
+                if "lora_" in name:
+                    param.requires_grad_(True)
+                    print(f"Enabled gradient for: {name}")

         # Save adapter path
         adapter_path = self.output_dir / "adapters" / stage_name
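For reference, the pattern this commit applies in both places can be factored into a single helper; a minimal sketch (the helper name is illustrative, and it assumes a peft/LoRA-wrapped torch model whose adapter tensors carry "lora_" in their names):

def ensure_lora_gradients(model) -> int:
    """Re-enable requires_grad on every LoRA adapter tensor and return the count."""
    enabled = 0
    for name, param in model.named_parameters():
        if "lora_" in name:
            param.requires_grad_(True)
            enabled += 1
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"LoRA tensors enabled: {enabled}, trainable params: {trainable:,}")
    return enabled

# usage sketch: call before constructing the Trainer, e.g. ensure_lora_gradients(self.model)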