From 7cbf916d2b163009566c8d27594f39d9e0034f84 Mon Sep 17 00:00:00 2001
From: Soma Nakamura
Date: Thu, 10 Jul 2025 23:01:33 +0900
Subject: [PATCH] Reduce memory usage: enable CPU offload, smaller batch sizes and LoRA ranks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...ining_config_gemma3_1b_8gpu_deepspeed.yaml | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/config/training_config_gemma3_1b_8gpu_deepspeed.yaml b/config/training_config_gemma3_1b_8gpu_deepspeed.yaml
index 5bff584..f56e16d 100644
--- a/config/training_config_gemma3_1b_8gpu_deepspeed.yaml
+++ b/config/training_config_gemma3_1b_8gpu_deepspeed.yaml
@@ -22,7 +22,7 @@ deepspeed:
     reduce_scatter: true
    reduce_bucket_size: 500000000
     contiguous_gradients: true
-    cpu_offload: false  # Keep on GPU for speed with small model
+    cpu_offload: true  # Enable CPU offload for memory efficiency
 
   optimizer:
     type: "AdamW"
@@ -47,23 +47,23 @@ deepspeed:
 
   gradient_clipping: 1.0
 
-  train_batch_size: 512  # Total batch size across all GPUs
-  train_micro_batch_size_per_gpu: 64  # Per-GPU batch size
+  train_batch_size: 64  # Total batch size across all GPUs
+  train_micro_batch_size_per_gpu: 8  # Per-GPU batch size
 
 progressive_stages:
   - name: "basic_cot"
     description: "Basic Chain-of-Thought reasoning"
     dataset_path: "./data/basic_cot/"
     adapter_config:
-      r: 32  # Larger rank with 8 GPUs
-      lora_alpha: 64
+      r: 16  # Reduced rank for memory
+      lora_alpha: 32
       lora_dropout: 0.1
       target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
       init_lora_weights: true
     training:
       num_epochs: 2
-      per_device_batch_size: 64  # Large batch with DeepSpeed
-      gradient_accumulation_steps: 1  # No accumulation needed
+      per_device_batch_size: 8  # Reduced for memory
+      gradient_accumulation_steps: 1
       learning_rate: 5e-4
       warmup_steps: 100
       max_length: 1024
@@ -72,23 +72,23 @@ progressive_stages:
       weight_decay: 0.001
       save_steps: 50
       logging_steps: 10
-      dataloader_num_workers: 8
-      dataloader_pin_memory: true
+      dataloader_num_workers: 4
+      dataloader_pin_memory: false
 
   - name: "math_reasoning"
     description: "Mathematical reasoning with OpenR1-Math-220k dataset"
     dataset_path: "open-r1/OpenR1-Math-220k"
     inherit_from: "basic_cot"
     adapter_config:
-      r: 64  # Larger rank for math reasoning
-      lora_alpha: 128
+      r: 32  # Reduced rank for memory
+      lora_alpha: 64
       lora_dropout: 0.1
       target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 32  # Reduce for longer sequences
-      gradient_accumulation_steps: 1
+      per_device_batch_size: 4  # Further reduce for longer sequences
+      gradient_accumulation_steps: 2
       learning_rate: 3e-4
       warmup_steps: 200
       max_length: 2048
@@ -97,7 +97,7 @@ progressive_stages:
       weight_decay: 0.001
       save_steps: 100
       logging_steps: 20
-      dataloader_num_workers: 8
+      dataloader_num_workers: 4
     dataset_config:
       streaming: true
       max_samples: 500000  # Process more data with 8 GPUs
@@ -108,15 +108,15 @@ progressive_stages:
     dataset_path: "open-r1/Mixture-of-Thoughts"
     inherit_from: "math_reasoning"
     adapter_config:
-      r: 128  # Maximum rank for complex reasoning
-      lora_alpha: 256
+      r: 64  # Reduced rank for memory
+      lora_alpha: 128
       lora_dropout: 0.1
       target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
       init_lora_weights: true
     training:
       num_epochs: 1
-      per_device_batch_size: 16  # Reduce for very long sequences
-      gradient_accumulation_steps: 2
+      per_device_batch_size: 2  # Very small for long sequences
+      gradient_accumulation_steps: 4
       learning_rate: 2e-4
       warmup_steps: 300
       max_length: 4096
@@ -125,7 +125,7 @@ progressive_stages:
       weight_decay: 0.001
       save_steps: 200
       logging_steps: 50
-      dataloader_num_workers: 8
+      dataloader_num_workers: 4
     dataset_config:
       streaming: true
       max_samples: 800000  # Process even more data
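
Note on the new numbers: DeepSpeed requires train_batch_size to equal
train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size, and the
per-stage settings in this patch stay consistent with that: 8*1*8, 4*2*8 and 2*4*8 all
give the new train_batch_size of 64. The Python sketch below is only a check of that
arithmetic, not part of the patch; the values are transcribed from the hunks above, the
world size of 8 is assumed from the config filename, and the key for the third stage is
a stand-in because its name: line falls outside the diff context.

    # Sketch only: verifies the batch-size arithmetic implied by this patch.
    WORLD_SIZE = 8          # assumed: one process per GPU on a single 8-GPU node
    TRAIN_BATCH_SIZE = 64   # deepspeed.train_batch_size after this patch

    stages = {
        "basic_cot":       {"per_device_batch_size": 8, "gradient_accumulation_steps": 1},
        "math_reasoning":  {"per_device_batch_size": 4, "gradient_accumulation_steps": 2},
        "mixture_stage":   {"per_device_batch_size": 2, "gradient_accumulation_steps": 4},  # stand-in key
    }

    for name, cfg in stages.items():
        effective = (cfg["per_device_batch_size"]
                     * cfg["gradient_accumulation_steps"]
                     * WORLD_SIZE)
        status = "OK" if effective == TRAIN_BATCH_SIZE else "MISMATCH"
        print(f"{name}: effective batch {effective} vs train_batch_size {TRAIN_BATCH_SIZE} -> {status}")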