grad-repair

parent 7cbf916d2b
commit 59c3bcfc7d

2 changed files with 14 additions and 4 deletions
@@ -32,7 +32,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 2
-      per_device_batch_size: 32 # 32 * 8 = 256 total batch size
+      per_device_batch_size: 8 # 8 * 8 = 64 total batch size (reduced for memory)
       gradient_accumulation_steps: 1
       learning_rate: 5e-4
       warmup_steps: 100
@@ -43,7 +43,7 @@ progressive_stages:
       save_steps: 50
       logging_steps: 10
       dataloader_num_workers: 4
-      dataloader_pin_memory: true
+      dataloader_pin_memory: false
 
   - name: "math_reasoning"
     description: "Mathematical reasoning with OpenR1-Math-220k dataset"
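The `dataloader_pin_memory: true -> false` change fits the same memory-motivated theme as the batch-size reductions: pinned (page-locked) host RAM speeds up host-to-GPU copies but cannot be paged out, so disabling it relieves host memory pressure at some copy-speed cost. A minimal sketch of how these `dataloader_*` keys plausibly map onto a PyTorch `DataLoader` (the dataset here is a stand-in, not the repo's):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-in dataset; the real pipeline loads reasoning corpora instead.
dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))

loader = DataLoader(
    dataset,
    batch_size=8,      # per_device_batch_size
    num_workers=4,     # dataloader_num_workers
    pin_memory=False,  # dataloader_pin_memory: pinned host RAM accelerates
                       # H2D transfers but is non-pageable; off saves memory
)

if __name__ == "__main__":  # guard for the worker processes num_workers spawns
    xb, yb = next(iter(loader))
    print(xb.shape, yb.shape)  # torch.Size([8, 8]) torch.Size([8])
```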
@@ -57,7 +57,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 16 # 16 * 8 = 128 total batch size
+      per_device_batch_size: 4 # 4 * 8 = 32 total batch size (reduced for memory)
       gradient_accumulation_steps: 2
       learning_rate: 3e-4
       warmup_steps: 200
@@ -85,7 +85,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 8 # 8 * 8 = 64 total batch size
+      per_device_batch_size: 2 # 2 * 8 = 16 total batch size (reduced for memory)
       gradient_accumulation_steps: 4
       learning_rate: 2e-4
       warmup_steps: 300
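A note on the three batch-size changes above: the inline comments count only the per-forward-pass total (per-device batch × 8 devices) and leave out gradient accumulation. Once accumulation is included, all three stages land on the same effective optimizer-step batch of 64 (down from 256). A quick check, assuming the 8 GPUs the "* 8" comments imply; stage names other than "math_reasoning" are placeholders, since the diff does not show them:

```python
# Effective batch = per_device_batch_size * world_size * grad_accum.
# world_size = 8 is an assumption taken from the "* 8" YAML comments.
stages = {
    "stage_1": {"per_device": 8, "grad_accum": 1},        # placeholder name
    "math_reasoning": {"per_device": 4, "grad_accum": 2},
    "stage_3": {"per_device": 2, "grad_accum": 4},        # placeholder name
}
world_size = 8
for name, cfg in stages.items():
    per_step = cfg["per_device"] * world_size
    effective = per_step * cfg["grad_accum"]
    print(f"{name}: {per_step} per forward pass, {effective} per optimizer step")
# Every stage prints an effective batch of 64, so the memory fix
# shrinks activations without changing the optimizer-step batch ratio.
```

The remaining two hunks are in the Python training module.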
@@ -353,6 +353,11 @@ class ProgressiveReasoningModel:
         # Ensure model is in training mode
         self.model.train()
 
+        # Explicitly enable gradients for LoRA parameters
+        for name, param in self.model.named_parameters():
+            if "lora_" in name:
+                param.requires_grad_(True)
+
         # Print trainable parameters
         self.model.print_trainable_parameters()
 
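This hunk is the core of the repair: after `model.train()`, it walks `named_parameters()` and re-enables gradients on anything whose name contains `lora_`, guarding against the adapter weights coming back frozen (for example after loading a checkpoint). A self-contained toy version of the same pattern, using a hand-rolled LoRA layer rather than the repo's PEFT model:

```python
import torch
from torch import nn

class ToyLoRALinear(nn.Module):
    """Frozen base weight plus low-rank lora_A/lora_B factors."""
    def __init__(self, dim: int = 16, rank: int = 4):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(dim, dim), requires_grad=False)
        self.lora_A = nn.Parameter(torch.zeros(rank, dim))
        self.lora_B = nn.Parameter(torch.zeros(dim, rank))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ (self.weight + self.lora_B @ self.lora_A).T

model = ToyLoRALinear()
for p in model.parameters():    # simulate the failure mode being repaired:
    p.requires_grad_(False)     # everything frozen, adapters included

for name, param in model.named_parameters():  # the commit's fix
    if "lora_" in name:
        param.requires_grad_(True)

print([n for n, p in model.named_parameters() if p.requires_grad])
# ['lora_A', 'lora_B'] -- the base weight stays frozen, adapters train again
```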
@@ -368,6 +373,11 @@ class ProgressiveReasoningModel:
             print(f"First few: {grad_params[:5]}")
         else:
             print("WARNING: No parameters require gradients!")
+            # Force enable gradients for all LoRA parameters
+            for name, param in self.model.named_parameters():
+                if "lora_" in name:
+                    param.requires_grad_(True)
+                    print(f"Enabled gradient for: {name}")
 
         # Save adapter path
         adapter_path = self.output_dir / "adapters" / stage_name
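The second hunk extends the sanity check that runs before the adapter is saved: when the list of gradient-requiring parameters comes back empty, it now force-enables the LoRA weights instead of only warning. The combined check-and-repair reads roughly like the helper below; the function name is made up, and the first `print` of the count is inferred from context above the visible hunk, while the rest mirrors the diff:

```python
from torch import nn

def ensure_lora_trainable(model: nn.Module) -> list[str]:
    """Report trainable parameters; if none, re-enable LoRA weights."""
    grad_params = [n for n, p in model.named_parameters() if p.requires_grad]
    if grad_params:
        print(f"{len(grad_params)} parameters require gradients")
        print(f"First few: {grad_params[:5]}")
    else:
        print("WARNING: No parameters require gradients!")
        # Force enable gradients for all LoRA parameters
        for name, param in model.named_parameters():
            if "lora_" in name:
                param.requires_grad_(True)
                print(f"Enabled gradient for: {name}")
    return grad_params
```

This deliberately duplicates the earlier hunk's loop: the first pass enables the gradients right after `model.train()`, and this one verifies and retries just before training proceeds.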