こんにちは

2025-07-10 23:01:33 +09:00 · 2025-07-10 23:01:33 +09:00 · 7cbf916d2b
commit 7cbf916d2b
parent 4430e0b363
1 changed file with 19 additions and 19 deletions
--- a/config/training_config_gemma3_1b_8gpu_deepspeed.yaml
+++ b/config/training_config_gemma3_1b_8gpu_deepspeed.yaml
@@ -22,7 +22,7 @@ deepspeed:
    reduce_scatter: true
    reduce_bucket_size: 500000000
    contiguous_gradients: true
-    cpu_offload: false  # Keep on GPU for speed with small model
+    cpu_offload: true  # Enable CPU offload for memory efficiency
  
  optimizer:
    type: "AdamW"
@@ -47,23 +47,23 @@ deepspeed:
  
  gradient_clipping: 1.0
  
-  train_batch_size: 512  # Total batch size across all GPUs
-  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size
+  train_batch_size: 64  # Total batch size across all GPUs
+  train_micro_batch_size_per_gpu: 8  # Per-GPU batch size

 progressive_stages:
  - name: "basic_cot"
    description: "Basic Chain-of-Thought reasoning"
    dataset_path: "./data/basic_cot/"
    adapter_config:
-      r: 32  # Larger rank with 8 GPUs
-      lora_alpha: 64
+      r: 16  # Reduced rank for memory
+      lora_alpha: 32
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
      init_lora_weights: true
    training:
      num_epochs: 2
-      per_device_batch_size: 64  # Large batch with DeepSpeed
-      gradient_accumulation_steps: 1  # No accumulation needed
+      per_device_batch_size: 8  # Reduced for memory
+      gradient_accumulation_steps: 1
      learning_rate: 5e-4
      warmup_steps: 100
      max_length: 1024
@@ -72,23 +72,23 @@ progressive_stages:
      weight_decay: 0.001
      save_steps: 50
      logging_steps: 10
-      dataloader_num_workers: 8
-      dataloader_pin_memory: true
+      dataloader_num_workers: 4
+      dataloader_pin_memory: false

  - name: "math_reasoning"
    description: "Mathematical reasoning with OpenR1-Math-220k dataset"
    dataset_path: "open-r1/OpenR1-Math-220k"
    inherit_from: "basic_cot"
    adapter_config:
-      r: 64  # Larger rank for math reasoning
-      lora_alpha: 128
+      r: 32  # Reduced rank for memory
+      lora_alpha: 64
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
-      per_device_batch_size: 32  # Reduce for longer sequences
-      gradient_accumulation_steps: 1
+      per_device_batch_size: 4  # Further reduce for longer sequences
+      gradient_accumulation_steps: 2
      learning_rate: 3e-4
      warmup_steps: 200
      max_length: 2048
@@ -97,7 +97,7 @@ progressive_stages:
      weight_decay: 0.001
      save_steps: 100
      logging_steps: 20
-      dataloader_num_workers: 8
+      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 500000  # Process more data with 8 GPUs
@@ -108,15 +108,15 @@ progressive_stages:
    dataset_path: "open-r1/Mixture-of-Thoughts"
    inherit_from: "math_reasoning"
    adapter_config:
-      r: 128  # Maximum rank for complex reasoning
-      lora_alpha: 256
+      r: 64  # Reduced rank for memory
+      lora_alpha: 128
      lora_dropout: 0.1
      target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
      init_lora_weights: true
    training:
      num_epochs: 1
-      per_device_batch_size: 16  # Reduce for very long sequences
-      gradient_accumulation_steps: 2
+      per_device_batch_size: 2  # Very small for long sequences
+      gradient_accumulation_steps: 4
      learning_rate: 2e-4
      warmup_steps: 300
      max_length: 4096
@@ -125,7 +125,7 @@ progressive_stages:
      weight_decay: 0.001
      save_steps: 200
      logging_steps: 50
-      dataloader_num_workers: 8
+      dataloader_num_workers: 4
    dataset_config:
      streaming: true
      max_samples: 800000  # Process even more data