grad-repair

parent 7cbf916d2b
commit 59c3bcfc7d

2 changed files with 14 additions and 4 deletions
@@ -32,7 +32,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 2
-      per_device_batch_size: 32 # 32 * 8 = 256 total batch size
+      per_device_batch_size: 8 # 8 * 8 = 64 total batch size (reduced for memory)
       gradient_accumulation_steps: 1
       learning_rate: 5e-4
       warmup_steps: 100
@@ -43,7 +43,7 @@ progressive_stages:
       save_steps: 50
       logging_steps: 10
       dataloader_num_workers: 4
-      dataloader_pin_memory: true
+      dataloader_pin_memory: false
 
   - name: "math_reasoning"
     description: "Mathematical reasoning with OpenR1-Math-220k dataset"
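The `dataloader_pin_memory: true -> false` change fits the same memory-motivated theme as the batch-size reductions: pinned (page-locked) host RAM speeds up host-to-GPU copies but cannot be paged out, so disabling it relieves host memory pressure at some copy-speed cost. A minimal sketch of how these `dataloader_*` keys plausibly map onto a PyTorch `DataLoader` (the dataset here is a stand-in, not the repo's):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-in dataset; the real pipeline loads reasoning corpora instead.
dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))

loader = DataLoader(
    dataset,
    batch_size=8,      # per_device_batch_size
    num_workers=4,     # dataloader_num_workers
    pin_memory=False,  # dataloader_pin_memory: pinned host RAM accelerates
                       # H2D transfers but is non-pageable; off saves memory
)

if __name__ == "__main__":  # guard for the worker processes num_workers spawns
    xb, yb = next(iter(loader))
    print(xb.shape, yb.shape)  # torch.Size([8, 8]) torch.Size([8])
```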
@@ -57,7 +57,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 16 # 16 * 8 = 128 total batch size
+      per_device_batch_size: 4 # 4 * 8 = 32 total batch size (reduced for memory)
       gradient_accumulation_steps: 2
       learning_rate: 3e-4
       warmup_steps: 200
@@ -85,7 +85,7 @@ progressive_stages:
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 8 # 8 * 8 = 64 total batch size
+      per_device_batch_size: 2 # 2 * 8 = 16 total batch size (reduced for memory)
       gradient_accumulation_steps: 4
       learning_rate: 2e-4
       warmup_steps: 300
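A note on the three batch-size changes above: the inline comments count only the per-forward-pass total (per-device batch × 8 devices) and leave out gradient accumulation. Once accumulation is included, all three stages land on the same effective optimizer-step batch of 64 (down from 256). A quick check, assuming the 8 GPUs the "* 8" comments imply; stage names other than "math_reasoning" are placeholders, since the diff does not show them:

```python
# Effective batch = per_device_batch_size * world_size * grad_accum.
# world_size = 8 is an assumption taken from the "* 8" YAML comments.
stages = {
    "stage_1": {"per_device": 8, "grad_accum": 1},        # placeholder name
    "math_reasoning": {"per_device": 4, "grad_accum": 2},
    "stage_3": {"per_device": 2, "grad_accum": 4},        # placeholder name
}
world_size = 8
for name, cfg in stages.items():
    per_step = cfg["per_device"] * world_size
    effective = per_step * cfg["grad_accum"]
    print(f"{name}: {per_step} per forward pass, {effective} per optimizer step")
# Every stage prints an effective batch of 64, so the memory fix
# shrinks activations without changing the optimizer-step batch ratio.
```

The remaining two hunks are in the Python training module.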
@@ -353,6 +353,11 @@ class ProgressiveReasoningModel:
         # Ensure model is in training mode
         self.model.train()
 
+        # Explicitly enable gradients for LoRA parameters
+        for name, param in self.model.named_parameters():
+            if "lora_" in name:
+                param.requires_grad_(True)
+
         # Print trainable parameters
         self.model.print_trainable_parameters()
 
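This hunk is the core of the repair: after `model.train()`, it walks `named_parameters()` and re-enables gradients on anything whose name contains `lora_`, guarding against the adapter weights coming back frozen (for example after loading a checkpoint). A self-contained toy version of the same pattern, using a hand-rolled LoRA layer rather than the repo's PEFT model:

```python
import torch
from torch import nn

class ToyLoRALinear(nn.Module):
    """Frozen base weight plus low-rank lora_A/lora_B factors."""
    def __init__(self, dim: int = 16, rank: int = 4):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(dim, dim), requires_grad=False)
        self.lora_A = nn.Parameter(torch.zeros(rank, dim))
        self.lora_B = nn.Parameter(torch.zeros(dim, rank))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ (self.weight + self.lora_B @ self.lora_A).T

model = ToyLoRALinear()
for p in model.parameters():    # simulate the failure mode being repaired:
    p.requires_grad_(False)     # everything frozen, adapters included

for name, param in model.named_parameters():  # the commit's fix
    if "lora_" in name:
        param.requires_grad_(True)

print([n for n, p in model.named_parameters() if p.requires_grad])
# ['lora_A', 'lora_B'] -- the base weight stays frozen, adapters train again
```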
@@ -368,6 +373,11 @@ class ProgressiveReasoningModel:
             print(f"First few: {grad_params[:5]}")
         else:
             print("WARNING: No parameters require gradients!")
+            # Force enable gradients for all LoRA parameters
+            for name, param in self.model.named_parameters():
+                if "lora_" in name:
+                    param.requires_grad_(True)
+                    print(f"Enabled gradient for: {name}")
 
         # Save adapter path
         adapter_path = self.output_dir / "adapters" / stage_name
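The second hunk extends the sanity check that runs before the adapter is saved: when the list of gradient-requiring parameters comes back empty, it now force-enables the LoRA weights instead of only warning. The combined check-and-repair reads roughly like the helper below; the function name is made up, and the first `print` of the count is inferred from context above the visible hunk, while the rest mirrors the diff:

```python
from torch import nn

def ensure_lora_trainable(model: nn.Module) -> list[str]:
    """Report trainable parameters; if none, re-enable LoRA weights."""
    grad_params = [n for n, p in model.named_parameters() if p.requires_grad]
    if grad_params:
        print(f"{len(grad_params)} parameters require gradients")
        print(f"First few: {grad_params[:5]}")
    else:
        print("WARNING: No parameters require gradients!")
        # Force enable gradients for all LoRA parameters
        for name, param in model.named_parameters():
            if "lora_" in name:
                param.requires_grad_(True)
                print(f"Enabled gradient for: {name}")
    return grad_params
```

This deliberately duplicates the earlier hunk's loop: the first pass enables the gradients right after `model.train()`, and this one verifies and retries just before training proceeds.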