grad-repair

parent 7cbf916d2b
commit 59c3bcfc7d

2 changed files with 14 additions and 4 deletions
@@ -32,7 +32,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 2
-      per_device_batch_size: 32  # 32 * 8 = 256 total batch size
+      per_device_batch_size: 8  # 8 * 8 = 64 total batch size (reduced for memory)
       gradient_accumulation_steps: 1
       learning_rate: 5e-4
       warmup_steps: 100
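Note: the totals in the comments count one forward pass across what is evidently an 8-device setup; with gradient_accumulation_steps: 1 this is also the effective optimizer-step batch. A minimal sketch of the arithmetic, assuming the 8 GPUs implied by the comments:

    # Effective batch size per optimizer step:
    #   per_device_batch_size * num_devices * gradient_accumulation_steps
    def effective_batch_size(per_device: int, num_devices: int = 8, accum_steps: int = 1) -> int:
        return per_device * num_devices * accum_steps

    assert effective_batch_size(32) == 256  # before this hunk
    assert effective_batch_size(8) == 64    # after (reduced for memory)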
@@ -43,7 +43,7 @@ progressive_stages:
       save_steps: 50
       logging_steps: 10
       dataloader_num_workers: 4
-      dataloader_pin_memory: true
+      dataloader_pin_memory: false

   - name: "math_reasoning"
     description: "Mathematical reasoning with OpenR1-Math-220k dataset"
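Note: dataloader_pin_memory maps onto the pin_memory flag of torch.utils.data.DataLoader. Pinned (page-locked) host buffers speed up host-to-GPU copies at the cost of extra host RAM, which fits the memory-driven theme of this commit. A minimal sketch, assuming a generic tensor dataset:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dataset = TensorDataset(torch.randn(1024, 16))

    # pin_memory=False: somewhat slower host-to-device transfers,
    # lower host memory pressure (matches the config change above)
    loader = DataLoader(dataset, batch_size=8, num_workers=4, pin_memory=False)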
@@ -57,7 +57,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 16  # 16 * 8 = 128 total batch size
+      per_device_batch_size: 4  # 4 * 8 = 32 total batch size (reduced for memory)
       gradient_accumulation_steps: 2
       learning_rate: 3e-4
       warmup_steps: 200
@@ -85,7 +85,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 8  # 8 * 8 = 64 total batch size
+      per_device_batch_size: 2  # 2 * 8 = 16 total batch size (reduced for memory)
       gradient_accumulation_steps: 4
       learning_rate: 2e-4
       warmup_steps: 300
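Note: across the three stage hunks, the unchanged gradient_accumulation_steps (1, 2, 4) keep the effective optimizer-step batch uniform: each stage drops from 256 to 64, again assuming the 8-device setup implied by the comments. Reusing the hypothetical effective_batch_size helper from above:

    assert effective_batch_size(8, accum_steps=1) == 64   # stage 1
    assert effective_batch_size(4, accum_steps=2) == 64   # stage 2
    assert effective_batch_size(2, accum_steps=4) == 64   # stage 3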
@@ -353,6 +353,11 @@ class ProgressiveReasoningModel:
         # Ensure model is in training mode
         self.model.train()

+        # Explicitly enable gradients for LoRA parameters
+        for name, param in self.model.named_parameters():
+            if "lora_" in name:
+                param.requires_grad_(True)
+
         # Print trainable parameters
         self.model.print_trainable_parameters()

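Note: the "lora_" substring match follows the PEFT naming convention for adapter weights (lora_A / lora_B). A standalone sketch of the same repair, assuming a peft-wrapped causal LM; the gpt2 base model is only a placeholder:

    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM

    base = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder base model
    model = get_peft_model(base, LoraConfig(r=8, lora_alpha=16))

    model.train()
    for name, param in model.named_parameters():
        if "lora_" in name:             # peft names adapter weights lora_A / lora_B
            param.requires_grad_(True)  # re-enable in case something froze them

    model.print_trainable_parameters()  # peft API: prints trainable vs. total counts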
@@ -368,6 +373,11 @@ class ProgressiveReasoningModel:
             print(f"First few: {grad_params[:5]}")
         else:
             print("WARNING: No parameters require gradients!")
+            # Force enable gradients for all LoRA parameters
+            for name, param in self.model.named_parameters():
+                if "lora_" in name:
+                    param.requires_grad_(True)
+                    print(f"Enabled gradient for: {name}")

         # Save adapter path
         adapter_path = self.output_dir / "adapters" / stage_name
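Note: the trailing context shows adapters saved under output_dir / "adapters" / <stage_name>. A minimal sketch of persisting and reloading a peft adapter at such a path, reusing model and base from the sketch above; the output directory and stage name are hypothetical:

    from pathlib import Path
    from peft import PeftModel

    output_dir = Path("outputs/run1")  # hypothetical output directory
    adapter_path = output_dir / "adapters" / "math_reasoning"

    model.save_pretrained(str(adapter_path))  # peft: writes adapter weights + config

    # Later, attach the saved adapter to a fresh base model:
    # restored = PeftModel.from_pretrained(base, str(adapter_path))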