Soma Nakamura 2025-07-10 18:09:14 +09:00
commit 2c30e06f20
49 changed files with 3628 additions and 0 deletions

163
.devenv.flake.nix Normal file

@@ -0,0 +1,163 @@
{
inputs =
let
version = "1.6.1";
system = "x86_64-linux";
devenv_root = "/home/centra/dev/pnn/progressive-llm-training";
devenv_dotfile = ./.devenv;
devenv_dotfile_string = ".devenv";
container_name = null;
devenv_tmpdir = "/run/user/1000";
devenv_runtime = "/run/user/1000/devenv-adeda32";
devenv_istesting = false;
devenv_direnvrc_latest_version = 1;
in {
git-hooks.url = "github:cachix/git-hooks.nix";
git-hooks.inputs.nixpkgs.follows = "nixpkgs";
pre-commit-hooks.follows = "git-hooks";
nixpkgs.url = "github:cachix/devenv-nixpkgs/rolling";
devenv.url = "github:cachix/devenv?dir=src/modules";
} // (if builtins.pathExists (devenv_dotfile + "/flake.json")
then builtins.fromJSON (builtins.readFile (devenv_dotfile + "/flake.json"))
else { });
outputs = { nixpkgs, ... }@inputs:
let
version = "1.6.1";
system = "x86_64-linux";
devenv_root = "/home/centra/dev/pnn/progressive-llm-training";
devenv_dotfile = ./.devenv;
devenv_dotfile_string = ".devenv";
container_name = null;
devenv_tmpdir = "/run/user/1000";
devenv_runtime = "/run/user/1000/devenv-adeda32";
devenv_istesting = false;
devenv_direnvrc_latest_version = 1;
devenv =
if builtins.pathExists (devenv_dotfile + "/devenv.json")
then builtins.fromJSON (builtins.readFile (devenv_dotfile + "/devenv.json"))
else { };
getOverlays = inputName: inputAttrs:
map
(overlay:
let
input = inputs.${inputName} or (throw "No such input `${inputName}` while trying to configure overlays.");
in
input.overlays.${overlay} or (throw "Input `${inputName}` has no overlay called `${overlay}`. Supported overlays: ${nixpkgs.lib.concatStringsSep ", " (builtins.attrNames input.overlays)}"))
inputAttrs.overlays or [ ];
overlays = nixpkgs.lib.flatten (nixpkgs.lib.mapAttrsToList getOverlays (devenv.inputs or { }));
pkgs = import nixpkgs {
inherit system;
config = {
allowUnfree = devenv.allowUnfree or false;
allowBroken = devenv.allowBroken or false;
permittedInsecurePackages = devenv.permittedInsecurePackages or [ ];
};
inherit overlays;
};
lib = pkgs.lib;
importModule = path:
if lib.hasPrefix "./" path
then if lib.hasSuffix ".nix" path
then ./. + (builtins.substring 1 255 path)
else ./. + (builtins.substring 1 255 path) + "/devenv.nix"
else if lib.hasPrefix "../" path
then throw "devenv: ../ is not supported for imports"
else
let
paths = lib.splitString "/" path;
name = builtins.head paths;
input = inputs.${name} or (throw "Unknown input ${name}");
subpath = "/${lib.concatStringsSep "/" (builtins.tail paths)}";
devenvpath = "${input}" + subpath;
devenvdefaultpath = devenvpath + "/devenv.nix";
in
if lib.hasSuffix ".nix" devenvpath
then devenvpath
else if builtins.pathExists devenvdefaultpath
then devenvdefaultpath
else throw (devenvdefaultpath + " file does not exist for input ${name}.");
project = pkgs.lib.evalModules {
specialArgs = inputs // { inherit inputs; };
modules = [
({ config, ... }: {
_module.args.pkgs = pkgs.appendOverlays (config.overlays or [ ]);
})
(inputs.devenv.modules + /top-level.nix)
{
devenv.cliVersion = version;
devenv.root = devenv_root;
devenv.dotfile = devenv_root + "/" + devenv_dotfile_string;
}
(pkgs.lib.optionalAttrs (inputs.devenv.isTmpDir or false) {
devenv.tmpdir = devenv_tmpdir;
devenv.runtime = devenv_runtime;
})
(pkgs.lib.optionalAttrs (inputs.devenv.hasIsTesting or false) {
devenv.isTesting = devenv_istesting;
})
(pkgs.lib.optionalAttrs (container_name != null) {
container.isBuilding = pkgs.lib.mkForce true;
containers.${container_name}.isBuilding = true;
})
({ options, ... }: {
config.devenv = pkgs.lib.optionalAttrs (builtins.hasAttr "direnvrcLatestVersion" options.devenv) {
direnvrcLatestVersion = devenv_direnvrc_latest_version;
};
})
] ++ (map importModule (devenv.imports or [ ])) ++ [
(if builtins.pathExists ./devenv.nix then ./devenv.nix else { })
(devenv.devenv or { })
(if builtins.pathExists ./devenv.local.nix then ./devenv.local.nix else { })
(if builtins.pathExists (devenv_dotfile + "/cli-options.nix") then import (devenv_dotfile + "/cli-options.nix") else { })
];
};
config = project.config;
options = pkgs.nixosOptionsDoc {
options = builtins.removeAttrs project.options [ "_module" ];
warningsAreErrors = false;
# Unpack Nix types, e.g. literalExpression, mDoc.
transformOptions =
let isDocType = v: builtins.elem v [ "literalDocBook" "literalExpression" "literalMD" "mdDoc" ];
in lib.attrsets.mapAttrs (_: v:
if v ? _type && isDocType v._type then
v.text
else if v ? _type && v._type == "derivation" then
v.name
else
v
);
};
build = options: config:
lib.concatMapAttrs
(name: option:
if builtins.hasAttr "type" option then
if option.type.name == "output" || option.type.name == "outputOf" then {
${name} = config.${name};
} else { }
else
let v = build option config.${name};
in if v != { } then {
${name} = v;
} else { }
)
options;
systems = [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ];
in
{
devShell = lib.genAttrs systems (system: config.shell);
packages = lib.genAttrs systems (system: {
optionsJSON = options.optionsJSON;
# deprecated
inherit (config) info procfileScript procfileEnv procfile;
ci = config.ciDerivation;
});
devenv = config;
build = build project.options project.config;
};
}

1
.devenv/bash Symbolic link

@@ -0,0 +1 @@
/nix/store/94lg0shvsfc845zy8gnflvpqxxiyijbz-bash-interactive-5.2p37

1
.devenv/devenv.json Normal file

@@ -0,0 +1 @@
{"inputs":{"nixpkgs":{"url":"github:NixOS/nixpkgs/nixos-unstable"},"nixpkgs-python":{"url":"github:cachix/nixpkgs-python","inputs":{"nixpkgs":{"follows":"nixpkgs"}}}},"allowUnfree":true}

1
.devenv/flake.json Normal file

@@ -0,0 +1 @@
{"nixpkgs":{"url":"github:NixOS/nixpkgs/nixos-unstable"},"nixpkgs-python":{"url":"github:cachix/nixpkgs-python","inputs":{"nixpkgs":{"follows":"nixpkgs"}}}}

1
.devenv/gc/shell Symbolic link

@@ -0,0 +1 @@
shell-1-link

1
.devenv/gc/shell-1-link Symbolic link

@@ -0,0 +1 @@
/nix/store/7fimdw1in7f1g0wxw5cr9pg26rs4rp5g-devenv-shell-env

0
.devenv/imports.txt Normal file

11
.devenv/input-paths.txt Normal file

@@ -0,0 +1,11 @@
/home/centra/.config/nixpkgs/config.nix
/home/centra/.config/nixpkgs/overlays
/home/centra/.config/nixpkgs/overlays.nix
/home/centra/.nixpkgs/config.nix
/home/centra/dev/pnn/progressive-llm-training/.devenv/flake.json
/home/centra/dev/pnn/progressive-llm-training/.devenv.flake.nix
/home/centra/dev/pnn/progressive-llm-training/.env
/home/centra/dev/pnn/progressive-llm-training/devenv.local.nix
/home/centra/dev/pnn/progressive-llm-training/devenv.lock
/home/centra/dev/pnn/progressive-llm-training/devenv.nix
/home/centra/dev/pnn/progressive-llm-training/devenv.yaml

3
.devenv/load-exports Executable file

@@ -0,0 +1,3 @@
export PATH='/home/centra/dev/pnn/progressive-llm-training/.devenv/state/venv/bin:/nix/store/bdqwd2frn9m7n3hj2436s0vlnv7mawpc-python3-3.11.13-env/bin:/nix/store/9w80x8njl1hcp8vlk1f3x17q4hcd2cqp-evaluate/bin:/nix/store/8df6wqahd2fqzl04kcs3xs32yqqimcxb-install-packages/bin:/nix/store/v5rz1h6ci23icfp6y228r2m0fqrdf408-install-packages-cpu/bin:/nix/store/69142b4sjmb4jffmyjb8nar6qzlgxnpg-prepare-data/bin:/nix/store/bhb6l6yfqknnwc7y5j5xc9k866hajv7b-train/bin:/nix/store/pbqah1qk4b5y14fqinr1h8zvhqy71v81-gcc-wrapper-14.3.0/bin:/nix/store/sa7j7cddyblhcb3ch3ds10w7nw75yjj1-gcc-14.3.0/bin:/nix/store/mdmsnfcvxyk5ynz7nx8nhss1wig0gljx-glibc-2.40-66-bin/bin:/nix/store/psy9v2asypgl9ylg8cnzkixc7fv0snj0-coreutils-9.7/bin:/nix/store/cadx5p7c0i06gf6h84iw9mrhx56imbv0-binutils-wrapper-2.44/bin:/nix/store/z3za8hfc24wb117s50p8b10agjkgm039-binutils-2.44/bin:/nix/store/dx4bdrs7mq3jfviqhszrc7l35ps9kg64-cmake-3.31.7/bin:/nix/store/1492q00cm64n0hs5966s8cqj6j0x5nxg-ninja-1.12.1/bin:/nix/store/h5khrpnjj3fb182sc32fx1z75w0lhksy-pkg-config-wrapper-0.29.2/bin:/nix/store/rzqvhv48m3nh8g3j4k6jmz6yqy8apr95-git-2.49.0/bin:/nix/store/nygfbkv0j6fvwwa82mdwxm4qfiq3p4q2-git-lfs-3.6.1/bin:/nix/store/fir4g1m8dvg46mh8silh3wnmm9mc0jix-htop-3.4.1/bin:/nix/store/9mc2m4sacbk4l7sc4w7m08m1x9bf5dgn-tmux-3.5a/bin:/nix/store/cxy72qdk41k3zjs5fw1nw1whv6wf7hv2-vim-9.1.1401/bin:/nix/store/74k8qwbfa6lm8psm2vjh2vj04fpr6c5g-openssl-3.4.1-bin/bin:/nix/store/m9k83ip1yx29xs94sa5x8j70s2vfgj6i-glib-2.84.2-dev/bin:/nix/store/zs5crhr67zp8cxn7dh4mwq08zw3sb31m-gettext-0.22.5/bin:/nix/store/rklrz4rwi03hxvz0kwh75vz55wb9b1qz-glib-2.84.2-bin/bin:/nix/store/xbpwk3xzanxj12157byj6wjagm2wfb3c-cuda-merged-12.8/bin:/nix/store/v0zrnzl3anb71ma5c2kx71dl8kyh0rf6-cuda_cuobjdump-12.8.90-bin/bin:/nix/store/v4mm21f67qki6ss6mqp3anlmaiw0r1zd-pre-commit-bin/bin:/nix/store/mq2i9br9h890bnahlds9jnff1jf6xjpb-python3.13-black-25.1.0/bin:/nix/store/sd81bvmch7njdpwx3lkjslixcbj5mivz-python3-3.13.4/bin:/nix/store/mdzm1l0rnpwp8ha0mbxll0db4r2p0xj3-python3.13-flake8-7.2.0/bin:/nix/store/xs72vlx7i6snrrrqx2zn529fbbqrwlwq-python3.13-pycodestyle-2.13.0/bin:/nix/store/5a8m3p0svp6myq1cz4ww431fsbh3xrg5-python3.13-pyflakes-3.3.2/bin:/nix/store/p6bch581drrxv3dm7vwxqazpbssjz4nv-python3.13-mypy-1.15.0/bin:/nix/store/1c8sm86wj45vwkb3ww2b870h9i9wna6r-patchelf-0.15.0/bin:/nix/store/psy9v2asypgl9ylg8cnzkixc7fv0snj0-coreutils-9.7/bin:/nix/store/c14zwgl8hf1wm0izij2i16xvk8ak70cy-findutils-4.10.0/bin:/nix/store/ibx4jfwlhjg4g0s6rrxrpaxa3ka8ns4m-diffutils-3.12/bin:/nix/store/pr318zsl44jdwpk9wk0sdrn19b6in7ah-gnused-4.9/bin:/nix/store/bc6zxzjnkjp4r9nhz5imy3cypvdh6r4n-gnugrep-3.12/bin:/nix/store/nv3y7zb1cwz1h9qy7nwz0s54j8dl1kqj-gawk-5.3.2/bin:/nix/store/lp82dcnrzljyix6yigwzrlpr1smvpmb0-gnutar-1.35/bin:/nix/store/6ag5dhk7sma61p6vl0kazfmpbrq08nqh-gzip-1.14/bin:/nix/store/ykdv4id6893gmkqwdmbimq237c1xqvq7-bzip2-1.0.8-bin/bin:/nix/store/6bwp1y45zlyvpr4ja2sk1yi9v5mrs94x-gnumake-4.4.1/bin:/nix/store/00zrahbb32nzawrmv9sjxn36h7qk9vrs-bash-5.2p37/bin:/nix/store/c9xmgszbf6i4dfq9r953khk9d7fdqigw-patch-2.8/bin:/nix/store/ikfwx7kbwz9zr7fziiac7f57jgbh3bnv-xz-5.8.1-bin/bin:/nix/store/3pdmbqy86wsbjdazxv1n3vrmj60vn0ri-file-5.45/bin:/run/wrappers/bin:/home/centra/.local/share/flatpak/exports/bin:/var/lib/flatpak/exports/bin:/home/centra/.nix-profile/bin:/nix/profile/bin:/home/centra/.local/state/nix/profile/bin:/etc/profiles/per-user/centra/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin'
export VIRTUAL_ENV=/home/centra/dev/pnn/progressive-llm-training/.devenv/state/venv

BIN
.devenv/nix-eval-cache.db Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

1
.devenv/profile Symbolic link

@@ -0,0 +1 @@
/nix/store/y2vscmx3lckyzyag6xg8b02pkdsk326d-devenv-profile

1
.devenv/run Symbolic link

@@ -0,0 +1 @@
/run/user/1000/devenv-adeda32


@@ -0,0 +1 @@
{configPath:.pre-commit-config.yaml}

BIN
.devenv/tasks.db Normal file

Binary file not shown.

BIN
.devenv/tasks.db-shm Normal file

Binary file not shown.

BIN
.devenv/tasks.db-wal Normal file

Binary file not shown.

32
.gitignore vendored Normal file

@@ -0,0 +1,32 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
venv/
ENV/
env/
.venv/
# Nix
result
result-*
# Project specific
outputs/
data/
*.log
wandb/
.ipynb_checkpoints/
*.pt
*.pth
*.bin
*.safetensors
# IDE
.vscode/
.idea/
*.swp
*.swo
*~

33
=2.5.0 Normal file

@@ -0,0 +1,33 @@
Collecting flash-attn
Using cached flash_attn-2.8.0.post2-cp311-cp311-linux_x86_64.whl
Requirement already satisfied: torch in ./.devenv/state/venv/lib/python3.11/site-packages (from flash-attn) (2.7.1+cu128)
Collecting einops (from flash-attn)
Using cached einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Requirement already satisfied: filelock in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (3.13.1)
Requirement already satisfied: typing-extensions>=4.10.0 in /nix/store/x74hdbjsz4ck98w8lyxv8kkwxs1wm2il-python3.13-typing-extensions-4.13.2/lib/python3.13/site-packages (from torch->flash-attn) (4.13.2)
Requirement already satisfied: sympy>=1.13.3 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (1.13.3)
Requirement already satisfied: networkx in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (3.3)
Requirement already satisfied: jinja2 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (3.1.4)
Requirement already satisfied: fsspec in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (2024.6.1)
Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.8.61 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (12.8.61)
Requirement already satisfied: nvidia-cuda-runtime-cu12==12.8.57 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (12.8.57)
Requirement already satisfied: nvidia-cuda-cupti-cu12==12.8.57 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (12.8.57)
Requirement already satisfied: nvidia-cudnn-cu12==9.7.1.26 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (9.7.1.26)
Requirement already satisfied: nvidia-cublas-cu12==12.8.3.14 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (12.8.3.14)
Requirement already satisfied: nvidia-cufft-cu12==11.3.3.41 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (11.3.3.41)
Requirement already satisfied: nvidia-curand-cu12==10.3.9.55 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (10.3.9.55)
Requirement already satisfied: nvidia-cusolver-cu12==11.7.2.55 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (11.7.2.55)
Requirement already satisfied: nvidia-cusparse-cu12==12.5.7.53 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (12.5.7.53)
Requirement already satisfied: nvidia-cusparselt-cu12==0.6.3 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (0.6.3)
Requirement already satisfied: nvidia-nccl-cu12==2.26.2 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (2.26.2)
Requirement already satisfied: nvidia-nvtx-cu12==12.8.55 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (12.8.55)
Requirement already satisfied: nvidia-nvjitlink-cu12==12.8.61 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (12.8.61)
Requirement already satisfied: nvidia-cufile-cu12==1.13.0.11 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (1.13.0.11)
Requirement already satisfied: triton==3.3.1 in ./.devenv/state/venv/lib/python3.11/site-packages (from torch->flash-attn) (3.3.1)
Requirement already satisfied: setuptools>=40.8.0 in ./.devenv/state/venv/lib/python3.11/site-packages (from triton==3.3.1->torch->flash-attn) (80.9.0)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./.devenv/state/venv/lib/python3.11/site-packages (from sympy>=1.13.3->torch->flash-attn) (1.3.0)
Requirement already satisfied: MarkupSafe>=2.0 in ./.devenv/state/venv/lib/python3.11/site-packages (from jinja2->torch->flash-attn) (2.1.5)
Using cached einops-0.8.1-py3-none-any.whl (64 kB)
Installing collected packages: einops, flash-attn
Successfully installed einops-0.8.1 flash-attn-2.8.0.post2

124
LORA_TARGET_MODULES.md Normal file

@@ -0,0 +1,124 @@
# LoRA Target Modules Reference
This document provides the correct target module names for different model architectures when using LoRA (Low-Rank Adaptation).
## Model Architecture Detection
Use the inspection script to find correct target modules:
```bash
# In the nix development environment
python /home/centra/dev/pnn/inspect_conv1d_model.py [model_name]
```
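If the helper script is not available, the same information can be read straight off the model. The following is a minimal sketch (an illustration, assuming only `transformers` is installed; the model name is just an example):
```python
from transformers import AutoModelForCausalLM

# Collect the class of every leaf module name. The leaf names
# (e.g. "c_attn", "q_proj") are what LoRA's target_modules expects.
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

leaf_types = {}
for name, module in model.named_modules():
    if name:  # skip the unnamed root module
        leaf_types.setdefault(name.split(".")[-1], type(module).__name__)

for leaf, cls in sorted(leaf_types.items()):
    print(f"{leaf}: {cls}")
```
For GPT-2-based models such as DialoGPT this should list `c_attn` and `c_proj` as `Conv1D`, matching the configurations below.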
## Common Model Architectures
### GPT-2 / DialoGPT Models
- **Model Type**: GPT2LMHeadModel
- **Layer Type**: Conv1D (not Linear!)
- **Base Model**: microsoft/DialoGPT-small, gpt2, gpt2-medium, gpt2-large, gpt2-xl
#### Attention Modules
- `c_attn` - Combined query, key, value projection (nf=3*hidden_size)
- `c_proj` - Output projection
#### MLP Modules
- `mlp.c_fc` - Feed-forward up projection
- `mlp.c_proj` - Feed-forward down projection
#### Recommended Configurations
```yaml
# Basic stage (attention only)
target_modules: ["c_attn", "c_proj"]
# Advanced stage (attention + MLP)
target_modules: ["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"]
```
### LLaMA Models
- **Model Type**: LlamaForCausalLM
- **Layer Type**: Linear
- **Base Model**: meta-llama/Llama-2-7b-hf, meta-llama/Llama-3.2-8B
#### Attention Modules
- `q_proj` - Query projection
- `k_proj` - Key projection
- `v_proj` - Value projection
- `o_proj` - Output projection
#### MLP Modules
- `gate_proj` - Gate projection
- `up_proj` - Up projection
- `down_proj` - Down projection
#### Recommended Configurations
```yaml
# Basic stage (attention only)
target_modules: ["q_proj", "v_proj"]
# Advanced stage (attention + MLP)
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
```
### Mistral Models
- **Model Type**: MistralForCausalLM
- **Layer Type**: Linear
- **Base Model**: mistralai/Mistral-7B-v0.1
#### Target Modules (same as LLaMA)
```yaml
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
```
### Qwen Models
- **Model Type**: QWenLMHeadModel
- **Layer Type**: Linear
- **Base Model**: Qwen/Qwen-7B
#### Target Modules
```yaml
target_modules: ["c_attn", "c_proj", "w1", "w2"]
```
## Important Notes
1. **Conv1D vs Linear**: GPT-2-based models use `Conv1D` layers, not `Linear` layers (see the check after this list)
2. **Module Patterns**: Use simple patterns like `"c_attn"` rather than full paths like `"transformer.h.0.attn.c_attn"`
3. **Testing**: Always test your configuration before training by creating a PEFT model
4. **Architecture Variations**: Different model families use different naming conventions
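As a quick check of point 1, the layer type can be verified directly (a sketch; it assumes a recent `transformers` release where `Conv1D` is exported from `transformers.pytorch_utils`):
```python
import torch.nn as nn
from transformers import AutoModelForCausalLM
from transformers.pytorch_utils import Conv1D  # GPT-2 style layer

model = AutoModelForCausalLM.from_pretrained("gpt2")

# Report whether the first attention projection is Conv1D or Linear.
for name, module in model.named_modules():
    if name.endswith("attn.c_attn"):
        if isinstance(module, Conv1D):
            print(f"{name}: Conv1D (GPT-2 style)")
        elif isinstance(module, nn.Linear):
            print(f"{name}: Linear")
        break
```
PEFT supports both layer types; only the module names differ between model families.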
## Troubleshooting
### Error: "Target module not found"
- Run the inspection script to find actual module names
- Check if the model uses Conv1D or Linear layers
- Verify the module naming pattern for your specific model
### Error: "No trainable parameters"
- Ensure target modules exist in the model
- Check that the module names match exactly
- Verify the model architecture is supported by PEFT
## Testing Your Configuration
```python
from peft import get_peft_model, LoraConfig, TaskType
# Test configuration
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8,
lora_alpha=16,
lora_dropout=0.1,
target_modules=["c_attn", "c_proj"], # Your target modules
bias="none"
)
# Try to create PEFT model
try:
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
print("✓ Configuration works!")
except Exception as e:
print(f"✗ Configuration failed: {e}")
```

85
config/README.md Normal file

@@ -0,0 +1,85 @@
# Training Configuration Files
This directory contains configuration files for different model sizes and use cases.
## Available Configurations
### Small Models (Testing)
- `training_config.yaml` - Default configuration for small models (DialoGPT-small)
- Memory: ~1GB VRAM
- Batch size: 8
- No quantization
### Medium Models (8B)
- `training_config_large.yaml` - Configuration for 8B models (Llama-3.2-8B)
- Memory: ~12GB VRAM with 4-bit quantization
- Batch size: 1, gradient accumulation: 16-64
- 4-bit quantization enabled
### Large Models (13B)
- `training_config_13b.yaml` - Configuration for 13B models
- Memory: ~16GB VRAM with 4-bit quantization
- Batch size: 1, gradient accumulation: 32-128
- Higher LoRA ranks (32-128)
### Extra Large Models (70B)
- `training_config_70b.yaml` - Configuration for 70B models
- Memory: ~40GB+ VRAM with 4-bit quantization
- Batch size: 1, gradient accumulation: 64-256
- Maximum LoRA ranks (64-256)
- Multi-GPU support with FSDP
## Configuration Parameters
### Model Settings
- `load_in_4bit`: Enable 4-bit quantization (recommended for large models)
- `gradient_checkpointing`: Trade compute for memory
- `use_flash_attention_2`: Faster attention computation if available
### Adapter Settings
- `r`: LoRA rank (higher = more parameters but better capacity)
- `lora_alpha`: LoRA scaling factor (typically 2x the rank)
- `init_lora_weights`: Set to `true` for identity initialization
### Training Settings
- `per_device_batch_size`: Usually 1 for large models
- `gradient_accumulation_steps`: Effective batch size multiplier (see the sketch after this list)
- `learning_rate`: Lower for larger models
- `bf16`: Use bfloat16 for better numerical stability
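To make the batch-size arithmetic concrete, here is a small sketch that reads one of these files (it assumes PyYAML and the 8B configuration described above; adjust the path to the file you are using):
```python
import yaml

# Effective batch size = per_device_batch_size * gradient_accumulation_steps
# (times the number of GPUs when training data-parallel).
with open("config/training_config_large.yaml") as f:
    cfg = yaml.safe_load(f)

stage = cfg["progressive_stages"][0]["training"]
effective = stage["per_device_batch_size"] * stage["gradient_accumulation_steps"]
print(f"{cfg['experiment']['name']}: effective batch size {effective}")  # 1 * 16 = 16
```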
## Usage
```bash
# For 8B models
python scripts/train_progressive.py --config config/training_config_large.yaml
# For 13B models
python scripts/train_progressive.py --config config/training_config_13b.yaml
# For 70B models (requires multiple GPUs)
python scripts/train_progressive.py --config config/training_config_70b.yaml
```
## Memory Requirements
| Model Size | VRAM (4-bit) | VRAM (16-bit) | GPUs Recommended |
|------------|--------------|---------------|------------------|
| 8B | 12-16GB | 32GB | 1x RTX 4090 |
| 13B | 16-20GB | 52GB | 1x A100 |
| 70B | 40-60GB | 140GB | 2x A100 |
## Tips for Large Models
1. **Start with smaller models** to validate your approach
2. **Use gradient checkpointing** to reduce memory usage
3. **Monitor GPU memory** during training
4. **Use lower learning rates** for stability
5. **Consider multi-GPU setup** for 70B+ models
6. **Enable flash attention** if available for speed
## Troubleshooting
- **OOM errors**: Reduce batch size or enable gradient checkpointing
- **Slow training**: Enable flash attention, use bf16
- **Poor convergence**: Adjust learning rate or warmup steps
- **Multi-GPU issues**: Check FSDP configuration


@@ -0,0 +1,36 @@
experiment:
name: "progressive_reasoning_experiment"
base_model: "microsoft/DialoGPT-small" # Lightweight model for testing
output_dir: "./outputs"
use_wandb: false
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false # Disable quantization for small model
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
device_map: "auto"
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 8
lora_alpha: 16
lora_dropout: 0.1
target_modules: ["c_attn", "c_proj"]
training:
num_epochs: 2
per_device_batch_size: 8 # Increase batch size for small model
gradient_accumulation_steps: 2 # Reduce accumulation steps
learning_rate: 5e-4 # Higher learning rate for faster training
warmup_steps: 50
max_length: 1024 # Shorter sequences
evaluation:
benchmarks:
- "HLE" # Humanity's Last Exam
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"


@@ -0,0 +1,83 @@
experiment:
name: "progressive_reasoning_13b"
base_model: "meta-llama/Llama-3.2-13B" # 13B model
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: true
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: "nf4"
device_map: "auto"
gradient_checkpointing: true
use_flash_attention_2: true
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 32 # Higher rank for 13B models
lora_alpha: 64
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1
gradient_accumulation_steps: 32
learning_rate: 1e-4
warmup_steps: 100
max_length: 2048
bf16: true
max_grad_norm: 0.3
weight_decay: 0.001
- name: "math_reasoning"
description: "Mathematical reasoning with think tags"
dataset_path: "./data/math_reasoning/"
inherit_from: "basic_cot"
adapter_config:
r: 64
lora_alpha: 128
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 1
gradient_accumulation_steps: 64
learning_rate: 8e-5
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 0.3
- name: "complex_reasoning"
description: "Complex multi-step reasoning"
dataset_path: "./data/complex_reasoning/"
inherit_from: "math_reasoning"
adapter_config:
r: 128 # Maximum rank for 13B models
lora_alpha: 256
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1
gradient_accumulation_steps: 128
learning_rate: 5e-5
warmup_steps: 300
max_length: 8192
bf16: true
max_grad_norm: 0.3
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"


@@ -0,0 +1,101 @@
experiment:
name: "progressive_reasoning_70b"
base_model: "meta-llama/Llama-3.2-70B" # 70B model - requires significant resources
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: true
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: "nf4"
device_map: "auto"
gradient_checkpointing: true
use_flash_attention_2: true
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 64 # Even higher rank for 70B models
lora_alpha: 128
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1
gradient_accumulation_steps: 64
learning_rate: 5e-5 # Lower learning rate for stability
warmup_steps: 200
max_length: 2048
bf16: true
max_grad_norm: 0.3
weight_decay: 0.001
dataloader_num_workers: 2
- name: "math_reasoning"
description: "Mathematical reasoning with think tags"
dataset_path: "./data/math_reasoning/"
inherit_from: "basic_cot"
adapter_config:
r: 128
lora_alpha: 256
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1
gradient_accumulation_steps: 128
learning_rate: 3e-5
warmup_steps: 300
max_length: 4096
bf16: true
max_grad_norm: 0.3
dataloader_num_workers: 2
- name: "complex_reasoning"
description: "Complex multi-step reasoning"
dataset_path: "./data/complex_reasoning/"
inherit_from: "math_reasoning"
adapter_config:
r: 256 # Maximum rank for 70B models
lora_alpha: 512
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1
gradient_accumulation_steps: 256
learning_rate: 2e-5
warmup_steps: 500
max_length: 8192
bf16: true
max_grad_norm: 0.3
dataloader_num_workers: 2
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"
# Additional settings for 70B models
optimization:
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
ddp_find_unused_parameters: false
# Multi-GPU settings
fsdp: "full_shard auto_wrap"
fsdp_transformer_layer_cls_to_wrap: "LlamaDecoderLayer"
fsdp_min_num_params: 1000000
fsdp_config:
min_num_params: 1000000
sharding_strategy: "FULL_SHARD"
cpu_offload: false


@@ -0,0 +1,91 @@
experiment:
name: "progressive_reasoning_gemma2_small"
base_model: "google/gemma-2-2b-it" # Instruction-tuned version
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false # 2B model is manageable without quantization
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
device_map: "auto"
gradient_checkpointing: false
use_flash_attention_2: false
  use_eager_attention: true # Eager attention recommended for Gemma models
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 8 # Start with smaller rank for small model
lora_alpha: 16
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 3
per_device_batch_size: 8 # Larger batch size for small model
gradient_accumulation_steps: 2
learning_rate: 5e-4 # Higher learning rate for small model
warmup_steps: 50
max_length: 1024
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 50
logging_steps: 10
- name: "math_reasoning"
description: "Mathematical reasoning with think tags"
dataset_path: "./data/math_reasoning/"
inherit_from: "basic_cot"
adapter_config:
r: 16
lora_alpha: 32
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 3
per_device_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 3e-4
warmup_steps: 100
max_length: 2048
bf16: true
max_grad_norm: 1.0
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts" # HuggingFace dataset
inherit_from: "math_reasoning"
adapter_config:
r: 32
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1 # Large dataset, fewer epochs
per_device_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 2e-4
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 1.0
save_steps: 500
logging_steps: 50
dataset_config:
streaming: true
max_samples: 30000
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"


@@ -0,0 +1,102 @@
experiment:
name: "progressive_reasoning_gemma3_1b"
  base_model: "google/gemma-3-1b-pt" # Using Gemma 3 1B pretrained model
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
device_map: "auto"
gradient_checkpointing: false # Not needed for small models
use_flash_attention_2: false
use_eager_attention: true
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 8
lora_alpha: 16
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] # Gemma attention modules
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 8
gradient_accumulation_steps: 2
learning_rate: 5e-4
warmup_steps: 50
max_length: 1024
fp16: false
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 10
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k" # HuggingFace dataset
inherit_from: "basic_cot"
adapter_config:
r: 16
lora_alpha: 32
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1 # Large dataset, fewer epochs
per_device_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 3e-4
warmup_steps: 100
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 1000
logging_steps: 100
dataset_config:
# OpenR1-Math-220k specific settings
streaming: true # Use streaming for large dataset
max_samples: 200000 # Limit samples for faster training
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts" # HuggingFace dataset
inherit_from: "math_reasoning"
adapter_config:
r: 32
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1 # Large dataset, fewer epochs
per_device_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 2e-4
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 500
logging_steps: 50
dataset_config:
# Mixture-of-Thoughts specific settings
streaming: true # Use streaming for large dataset
max_samples: 30000 # Limit samples for faster training
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"


@@ -0,0 +1,133 @@
experiment:
name: "progressive_reasoning_gemma3_1b_cpu_offload"
base_model: "google/gemma-3-1b-pt" # Using Gemma 3 1B
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: true # Enable 4-bit quantization for QLoRA
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: "nf4"
device_map: "auto" # Let accelerate handle device placement
max_memory:
    0: "5GB" # Limit GPU memory to 5GB (leave room for CUDA kernels)
"cpu": "32GB" # Allow up to 32GB CPU RAM
offload_folder: "./offload" # Directory for disk offloading if needed
gradient_checkpointing: true # Trade compute for memory
use_flash_attention_2: false
use_eager_attention: true
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 8 # Lower rank for memory efficiency
lora_alpha: 16
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 2 # Smaller batch size
gradient_accumulation_steps: 8 # Compensate with gradient accumulation
learning_rate: 5e-4
warmup_steps: 50
max_length: 512 # Shorter sequences for memory
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 100
logging_steps: 10
dataloader_num_workers: 0 # Disable multiprocessing to save memory
optim: "paged_adamw_8bit" # Use 8-bit optimizer
- name: "math_reasoning"
description: "Mathematical reasoning with OpenR1-Math-220k dataset"
dataset_path: "open-r1/OpenR1-Math-220k"
inherit_from: "basic_cot"
adapter_config:
r: 16
lora_alpha: 32
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1 # Minimal batch size
gradient_accumulation_steps: 16
learning_rate: 3e-4
warmup_steps: 100
max_length: 1024
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
save_steps: 1000
logging_steps: 100
optim: "paged_adamw_8bit"
dataset_config:
streaming: true
max_samples: 200000 # Reduced for testing
split: "train"
- name: "complex_reasoning"
description: "Complex multi-step reasoning with Mixture-of-Thoughts"
dataset_path: "open-r1/Mixture-of-Thoughts" # HuggingFace dataset
inherit_from: "math_reasoning"
adapter_config:
r: 32
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1
gradient_accumulation_steps: 32
learning_rate: 2e-4
warmup_steps: 200
max_length: 2048
bf16: true
max_grad_norm: 1.0
weight_decay: 0.001
optim: "paged_adamw_8bit"
save_steps: 500
logging_steps: 50
dataset_config:
streaming: true
max_samples: 300000 # Limited for CPU offload config
split: "train"
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"
# DeepSpeed configuration for advanced CPU offloading (optional)
# Uncomment to use DeepSpeed ZeRO-2 with CPU offload
# deepspeed:
# zero_optimization:
# stage: 2
# offload_optimizer:
# device: "cpu"
# pin_memory: true
# offload_param:
# device: "cpu"
# pin_memory: true
# overlap_comm: true
# contiguous_gradients: true
# sub_group_size: 1e9
# reduce_bucket_size: 1e6
# FSDP configuration for distributed training (optional)
# Uncomment to use FSDP with CPU offload
# fsdp:
# sharding_strategy: "FULL_SHARD"
# cpu_offload: true
# auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
# transformer_layer_cls_to_wrap: "GemmaDecoderLayer"
# min_num_params: 1e6


@@ -0,0 +1,98 @@
experiment:
name: "progressive_reasoning_large_model"
base_model: "meta-llama/Llama-3.2-8B" # Or other whitelisted models
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: true # Enable 4-bit quantization for memory efficiency
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: "nf4"
device_map: "auto"
# Additional memory optimizations
gradient_checkpointing: true
use_flash_attention_2: true # If available
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 16 # Larger rank for bigger models
lora_alpha: 32
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj"]
init_lora_weights: true # Identity initialization
training:
num_epochs: 1
per_device_batch_size: 1 # Small batch size for large models
gradient_accumulation_steps: 16 # Effective batch size = 16
learning_rate: 2e-4
warmup_steps: 100
max_length: 2048
fp16: false
bf16: true
max_grad_norm: 0.3
weight_decay: 0.001
save_steps: 50
logging_steps: 10
- name: "math_reasoning"
description: "Mathematical reasoning with think tags"
dataset_path: "./data/math_reasoning/"
inherit_from: "basic_cot"
adapter_config:
r: 32 # Increase rank for more complex tasks
lora_alpha: 64
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 1
gradient_accumulation_steps: 32 # Effective batch size = 32
learning_rate: 1e-4
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 0.3
weight_decay: 0.001
- name: "complex_reasoning"
description: "Complex multi-step reasoning"
dataset_path: "./data/complex_reasoning/"
inherit_from: "math_reasoning"
adapter_config:
r: 64 # Maximum rank for most complex tasks
lora_alpha: 128
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 1
gradient_accumulation_steps: 64 # Effective batch size = 64
learning_rate: 5e-5
warmup_steps: 300
max_length: 8192
bf16: true
max_grad_norm: 0.3
weight_decay: 0.001
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"
# Memory optimization settings
optimization:
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
ddp_find_unused_parameters: false
fsdp: "full_shard auto_wrap" # For multi-GPU setups
fsdp_transformer_layer_cls_to_wrap: "LlamaDecoderLayer"


@@ -0,0 +1,85 @@
experiment:
name: "progressive_reasoning_llama_auth"
base_model: "meta-llama/Llama-3.2-8B"
output_dir: "./outputs"
use_wandb: true
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: true
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: "nf4"
device_map: "auto"
gradient_checkpointing: true
use_flash_attention_2: true
# Add your HuggingFace token here, or set HF_TOKEN environment variable
# hf_token: "your_token_here"
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 16
lora_alpha: 32
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj"]
init_lora_weights: true
training:
num_epochs: 1
per_device_batch_size: 1
gradient_accumulation_steps: 16
learning_rate: 2e-4
warmup_steps: 100
max_length: 2048
bf16: true
max_grad_norm: 0.3
weight_decay: 0.001
- name: "math_reasoning"
description: "Mathematical reasoning with think tags"
dataset_path: "./data/math_reasoning/"
inherit_from: "basic_cot"
adapter_config:
r: 32
lora_alpha: 64
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 1
gradient_accumulation_steps: 32
learning_rate: 1e-4
warmup_steps: 200
max_length: 4096
bf16: true
max_grad_norm: 0.3
- name: "complex_reasoning"
description: "Complex multi-step reasoning"
dataset_path: "./data/complex_reasoning/"
inherit_from: "math_reasoning"
adapter_config:
r: 64
lora_alpha: 128
lora_dropout: 0.05
target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 1
gradient_accumulation_steps: 64
learning_rate: 5e-5
warmup_steps: 300
max_length: 8192
bf16: true
max_grad_norm: 0.3
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"


@@ -0,0 +1,82 @@
experiment:
name: "progressive_reasoning_public_model"
base_model: "microsoft/DialoGPT-medium" # Public model, no authentication needed
output_dir: "./outputs"
use_wandb: false
wandb_project: "matsuo-llm-comp-2025"
model:
load_in_4bit: false # DialoGPT is smaller, quantization not needed
bnb_4bit_compute_dtype: "bfloat16"
bnb_4bit_use_double_quant: true
device_map: "auto"
gradient_checkpointing: false
progressive_stages:
- name: "basic_cot"
description: "Basic Chain-of-Thought reasoning"
dataset_path: "./data/basic_cot/"
adapter_config:
r: 16
lora_alpha: 32
lora_dropout: 0.1
target_modules: ["c_attn", "c_proj"] # GPT-2 style attention modules
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 2e-4
warmup_steps: 100
max_length: 1024
fp16: false
bf16: false # Use fp32 for smaller models
max_grad_norm: 1.0
weight_decay: 0.001
- name: "math_reasoning"
description: "Mathematical reasoning with think tags"
dataset_path: "./data/math_reasoning/"
inherit_from: "basic_cot"
adapter_config:
r: 32
lora_alpha: 64
lora_dropout: 0.1
target_modules: ["c_attn", "c_proj"]
init_lora_weights: true
training:
num_epochs: 3
per_device_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1e-4
warmup_steps: 200
max_length: 2048
bf16: false
max_grad_norm: 1.0
- name: "complex_reasoning"
description: "Complex multi-step reasoning"
dataset_path: "./data/complex_reasoning/"
inherit_from: "math_reasoning"
adapter_config:
r: 64
lora_alpha: 128
lora_dropout: 0.1
target_modules: ["c_attn", "c_proj"]
init_lora_weights: true
training:
num_epochs: 2
per_device_batch_size: 1
gradient_accumulation_steps: 16
learning_rate: 5e-5
warmup_steps: 300
max_length: 4096
bf16: false
max_grad_norm: 1.0
evaluation:
benchmarks:
- "HLE"
- "Do-Not-Answer"
save_results: true
results_dir: "./outputs/evaluation_results"

139
devenv.lock Normal file

@@ -0,0 +1,139 @@
{
"nodes": {
"devenv": {
"locked": {
"dir": "src/modules",
"lastModified": 1751909516,
"owner": "cachix",
"repo": "devenv",
"rev": "36e4cf7d6cb89862e69efce4e5c147ac2e4d38f9",
"type": "github"
},
"original": {
"dir": "src/modules",
"owner": "cachix",
"repo": "devenv",
"type": "github"
}
},
"flake-compat": {
"flake": false,
"locked": {
"lastModified": 1747046372,
"owner": "edolstra",
"repo": "flake-compat",
"rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"flake-compat_2": {
"flake": false,
"locked": {
"lastModified": 1747046372,
"owner": "edolstra",
"repo": "flake-compat",
"rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"git-hooks": {
"inputs": {
"flake-compat": "flake-compat",
"gitignore": "gitignore",
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1750779888,
"owner": "cachix",
"repo": "git-hooks.nix",
"rev": "16ec914f6fb6f599ce988427d9d94efddf25fe6d",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "git-hooks.nix",
"type": "github"
}
},
"gitignore": {
"inputs": {
"nixpkgs": [
"git-hooks",
"nixpkgs"
]
},
"locked": {
"lastModified": 1709087332,
"owner": "hercules-ci",
"repo": "gitignore.nix",
"rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "gitignore.nix",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1751792365,
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "1fd8bada0b6117e6c7eb54aad5813023eed37ccb",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-python": {
"inputs": {
"flake-compat": "flake-compat_2",
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1749760516,
"owner": "cachix",
"repo": "nixpkgs-python",
"rev": "908dbb466af5955ea479ac95953333fd64387216",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "nixpkgs-python",
"type": "github"
}
},
"root": {
"inputs": {
"devenv": "devenv",
"git-hooks": "git-hooks",
"nixpkgs": "nixpkgs",
"nixpkgs-python": "nixpkgs-python",
"pre-commit-hooks": [
"git-hooks"
]
}
}
},
"root": "root",
"version": 7
}

95
flake-minimal.nix Normal file

@@ -0,0 +1,95 @@
{
description = "Progressive LLM Training for LLM2025 (Minimal)";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
};
outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs {
inherit system;
config = {
allowUnfree = true;
cudaSupport = true;
};
};
# Python 3.11 for better compatibility
python = pkgs.python311;
# Minimal Python packages
pythonWithPackages = python.withPackages (ps: with ps; [
# Core essentials only
torch
transformers
numpy
# Essential dependencies
pyyaml
# Build tools
pip
setuptools
wheel
]);
in
{
devShells.default = pkgs.mkShell {
buildInputs = with pkgs; [
# Python with packages
pythonWithPackages
# Build tools
gcc
cmake
ninja
pkg-config
# Git
git
git-lfs
# Libraries needed for Python packages
openssl
zlib
glib
stdenv.cc.cc.lib
# CUDA support
cudaPackages.cudatoolkit
cudaPackages.cudnn
];
shellHook = ''
echo "🚀 Progressive LLM Training Environment (Minimal)"
echo "Python version: $(python --version)"
echo "PyTorch version: $(python -c 'import torch; print(torch.__version__)')"
echo "CUDA available: $(python -c 'import torch; print(torch.cuda.is_available())')"
# Set up CUDA environment
export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
export CUDA_PATH=${pkgs.cudaPackages.cudatoolkit}
export LD_LIBRARY_PATH=${pkgs.cudaPackages.cudatoolkit}/lib:${pkgs.cudaPackages.cudnn}/lib:${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH
# Set Python path
export PYTHONPATH=$PWD/src:$PYTHONPATH
echo ""
echo "Note: This is a minimal configuration. Install additional packages with pip as needed:"
echo " pip install accelerate peft trl datasets bitsandbytes wandb jsonlines scikit-learn sentencepiece protobuf"
echo " pip install flash-attn --no-build-isolation"
'';
# Environment variables
CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
CUDA_PATH = "${pkgs.cudaPackages.cudatoolkit}";
NIX_SHELL_PRESERVE_PROMPT = 1;
LOCALE_ARCHIVE = "${pkgs.glibcLocales}/lib/locale/locale-archive";
LC_ALL = "en_US.UTF-8";
};
});
}

61
flake.lock generated Normal file

@@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1751792365,
"narHash": "sha256-J1kI6oAj25IG4EdVlg2hQz8NZTBNYvIS0l4wpr9KcUo=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "1fd8bada0b6117e6c7eb54aad5813023eed37ccb",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

195
flake.nix Normal file

@@ -0,0 +1,195 @@
{
description = "Progressive LLM Training for LLM2025";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
};
outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs {
inherit system;
config = {
allowUnfree = true;
cudaSupport = true;
};
overlays = [
(final: prev: {
python311 = prev.python311.override {
packageOverrides = python-self: python-super: {
# Disable tests for problematic packages
pytest-doctestplus = python-super.pytest-doctestplus.overrideAttrs (oldAttrs: {
doCheck = false;
doInstallCheck = false;
pytestCheckPhase = "echo 'Skipping tests'";
});
# Also disable tests for jupyter-related packages if they cause issues
jupyter = python-super.jupyter.overrideAttrs (oldAttrs: {
doCheck = false;
doInstallCheck = false;
});
notebook = python-super.notebook.overrideAttrs (oldAttrs: {
doCheck = false;
doInstallCheck = false;
});
# Disable tests for psycopg and psycopg2
psycopg = python-super.psycopg.overrideAttrs (oldAttrs: {
doCheck = false;
doInstallCheck = false;
pytestCheckPhase = "echo 'Skipping tests'";
pythonImportsCheck = []; # Disable import checks
});
psycopg2 = python-super.psycopg2.overrideAttrs (oldAttrs: {
doCheck = false;
doInstallCheck = false;
pytestCheckPhase = "echo 'Skipping tests'";
pythonImportsCheck = []; # Disable import checks
});
# Disable tests for sqlframe
sqlframe = python-super.sqlframe.overrideAttrs (oldAttrs: {
doCheck = false;
doInstallCheck = false;
pytestCheckPhase = "echo 'Skipping tests'";
pythonImportsCheck = []; # Disable import checks
});
# Disable tests for accelerate
accelerate = python-super.accelerate.overrideAttrs (oldAttrs: {
doCheck = false;
doInstallCheck = false;
pytestCheckPhase = "echo 'Skipping tests'";
pythonImportsCheck = []; # Disable import checks
});
};
};
})
];
};
# Python 3.11 for better compatibility
python = pkgs.python311;
# Python packages
pythonWithPackages = python.withPackages (ps: with ps; [
# Core ML packages
torch
torchvision
torchaudio
transformers
accelerate
datasets
tokenizers
scikit-learn
# Required dependencies from requirements.txt
pyyaml
jsonlines
sentencepiece
protobuf
# Additional useful packages
numpy
scipy
matplotlib
jupyter
notebook
ipython
pandas
rich # For TUI
# Development tools
black
flake8
pytest
mypy
# Build tools
pip
setuptools
wheel
# LLM specific packages
peft
trl
bitsandbytes
wandb
]);
in
{
devShells.default = pkgs.mkShell {
buildInputs = with pkgs; [
# Python with packages
pythonWithPackages
# Build tools
gcc
cmake
ninja
pkg-config
# Git
git
git-lfs
# Development tools
htop
tmux
vim
# Libraries needed for Python packages
openssl
zlib
glib
stdenv.cc.cc.lib
# CUDA support
cudaPackages.cudatoolkit
cudaPackages.cudnn
];
shellHook = ''
echo "🚀 Progressive LLM Training Environment"
echo "Python version: $(python --version)"
echo "PyTorch version: $(python -c 'import torch; print(torch.__version__)')"
echo "CUDA available: $(python -c 'import torch; print(torch.cuda.is_available())')"
# Set up CUDA environment
export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
export CUDA_PATH=${pkgs.cudaPackages.cudatoolkit}
export LD_LIBRARY_PATH=${pkgs.cudaPackages.cudatoolkit}/lib:${pkgs.cudaPackages.cudnn}/lib:${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH
# Set Python path
export PYTHONPATH=$PWD/src:$PYTHONPATH
echo ""
echo "Available commands:"
echo " python scripts/train_progressive.py # Start training"
echo " python scripts/evaluate.py # Evaluate model"
echo " jupyter notebook # Start Jupyter"
echo ""
# Create data directory if not exists
mkdir -p data
# Prepare sample data if not exists
if [ ! -f "data/basic_cot/train.jsonl" ]; then
echo "Preparing sample datasets..."
python -c "from src.data_utils import prepare_sample_datasets; prepare_sample_datasets()" || echo "Sample data preparation skipped"
fi
# Note about flash-attn
echo "Note: flash-attn is not included in nixpkgs. If needed, install manually with:"
echo " pip install flash-attn --no-build-isolation"
'';
# Environment variables
CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
CUDA_PATH = "${pkgs.cudaPackages.cudatoolkit}";
NIX_SHELL_PRESERVE_PROMPT = 1;
LOCALE_ARCHIVE = "${pkgs.glibcLocales}/lib/locale/locale-archive";
LC_ALL = "en_US.UTF-8";
};
});
}

15
requirements-cpu.txt Normal file

@@ -0,0 +1,15 @@
# CPU version of PyTorch
torch>=2.0.0 --index-url https://download.pytorch.org/whl/cpu
transformers>=4.40.0
accelerate>=0.27.0
peft>=0.11.0
trl>=0.9.0
datasets>=2.18.0
bitsandbytes>=0.43.0
wandb>=0.16.0
pyyaml>=6.0
jsonlines>=4.0.0
scikit-learn>=1.3.0
# flash-attn is not needed for CPU version
sentencepiece>=0.2.0
protobuf>=4.25.0

3
requirements-torch.txt Normal file

@@ -0,0 +1,3 @@
--index-url https://download.pytorch.org/whl/cu128
torch>=2.0.0
torchaudio>=2.0.0

13
requirements.txt Normal file

@@ -0,0 +1,13 @@
transformers>=4.40.0
accelerate>=0.27.0
peft>=0.11.0
trl>=0.9.0
datasets>=2.18.0
bitsandbytes>=0.43.0
wandb>=0.16.0
pyyaml>=6.0
jsonlines>=4.0.0
scikit-learn>=1.3.0
# flash-attn>=2.5.0 # Install separately with --no-build-isolation
sentencepiece>=0.2.0
protobuf>=4.25.0

137
scripts/analyze_adapter_size.py Executable file

@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""
Analyze the size and structure of LoRA adapters
"""
import sys
from pathlib import Path
import torch
import yaml
from peft import PeftModel, LoraConfig
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
from src.progressive_model import ProgressiveReasoningModel
def analyze_adapter_sizes():
# Load configuration
with open("config/training_config.yaml") as f:
config = yaml.safe_load(f)
print("=" * 60)
print("LoRA Adapter Size Analysis")
print("=" * 60)
# Get adapter configuration from config
basic_cot_config = config["progressive_stages"][0]
adapter_config = basic_cot_config["adapter_config"]
print(f"\nConfiguration for 'basic_cot' adapter:")
print(f" - r (rank): {adapter_config['r']}")
print(f" - lora_alpha: {adapter_config['lora_alpha']}")
print(f" - lora_dropout: {adapter_config['lora_dropout']}")
print(f" - target_modules: {adapter_config['target_modules']}")
# Load the base model to get dimensions
print("\nLoading base model to analyze dimensions...")
model_wrapper = ProgressiveReasoningModel(config)
model_wrapper.setup_base_model()
# Analyze model architecture
print(f"\nBase model: {config['experiment']['base_model']}")
# Count parameters in base model
total_params = sum(p.numel() for p in model_wrapper.model.parameters())
print(f"Total base model parameters: {total_params:,}")
# Load saved adapter if it exists
adapter_path = Path(config["experiment"]["output_dir"]) / "adapters" / "basic_cot"
if adapter_path.exists():
print(f"\nLoading saved adapter from: {adapter_path}")
# Load adapter state dict
adapter_model_path = adapter_path / "adapter_model.safetensors"
if not adapter_model_path.exists():
adapter_model_path = adapter_path / "adapter_model.bin"
if adapter_model_path.exists():
if adapter_model_path.suffix == ".safetensors":
from safetensors.torch import load_file
adapter_weights = load_file(adapter_model_path)
else:
adapter_weights = torch.load(adapter_model_path, map_location="cpu")
print("\nLoRA Adapter Layer Details:")
print("-" * 60)
total_lora_params = 0
layer_info = {}
for name, tensor in adapter_weights.items():
size = tensor.numel()
total_lora_params += size
# Parse layer name
parts = name.split('.')
if 'lora_A' in name or 'lora_B' in name:
# Extract module info
module_name = '.'.join(parts[:-2])
lora_type = parts[-2] # lora_A or lora_B
if module_name not in layer_info:
layer_info[module_name] = {}
layer_info[module_name][lora_type] = {
'shape': list(tensor.shape),
'params': size
}
# Display layer information
for module, info in sorted(layer_info.items()):
print(f"\nModule: {module}")
if 'lora_A' in info and 'lora_B' in info:
shape_a = info['lora_A']['shape']
shape_b = info['lora_B']['shape']
params_a = info['lora_A']['params']
params_b = info['lora_B']['params']
print(f" LoRA A: {shape_a} = {params_a:,} parameters")
print(f" LoRA B: {shape_b} = {params_b:,} parameters")
print(f" Total: {params_a + params_b:,} parameters")
# Calculate original layer size (approximation)
original_size = shape_a[1] * shape_b[0]
compression_ratio = original_size / (params_a + params_b)
print(f" Original layer size (approx): {original_size:,} parameters")
print(f" Compression ratio: {compression_ratio:.1f}x")
print("\n" + "=" * 60)
print(f"Total LoRA parameters: {total_lora_params:,}")
print(f"Percentage of base model: {(total_lora_params / total_params) * 100:.2f}%")
# Calculate theoretical size
r = adapter_config['r']
num_modules = len(adapter_config['target_modules'])
# For GPT models, typical dimensions
if "DialoGPT" in config['experiment']['base_model']:
hidden_size = 768 # DialoGPT-small uses 768
print(f"\nTheoretical calculation (hidden_size={hidden_size}, r={r}):")
print(f" Per module: 2 * {hidden_size} * {r} = {2 * hidden_size * r:,} parameters")
print(f" Total ({num_modules} modules): {2 * hidden_size * r * num_modules:,} parameters")
else:
print(f"\nNo saved adapter found at: {adapter_path}")
print("Run training first to generate the adapter.")
# Show theoretical sizes based on config
r = adapter_config['r']
print(f"\nTheoretical LoRA sizes with r={r}:")
print(f" For hidden_size=768 (DialoGPT-small): {2 * 768 * r:,} params per module")
print(f" For hidden_size=1024 (medium models): {2 * 1024 * r:,} params per module")
print(f" For hidden_size=1280 (GPT-2 large): {2 * 1280 * r:,} params per module")
if __name__ == "__main__":
analyze_adapter_sizes()
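
As a hand check of the per-module arithmetic printed above, a minimal worked example (r and hidden_size are illustrative values, not read from training_config.yaml):

r, hidden_size = 16, 768                      # illustrative values only
lora_a = r * hidden_size                      # A matrix: (r, hidden_size)
lora_b = hidden_size * r                      # B matrix: (hidden_size, r)
per_module = lora_a + lora_b                  # 2 * hidden_size * r = 24,576 params
original = hidden_size * hidden_size          # full 768x768 projection = 589,824 params
print(per_module, round(original / per_module, 1))   # 24576 24.0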

199
scripts/check_vram.py Normal file
View file

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Check VRAM usage and model memory requirements
"""
import torch
import psutil
import sys
from pathlib import Path
import yaml
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
def get_memory_info():
"""Get current memory usage"""
if torch.cuda.is_available():
print("=== CUDA Information ===")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(0)}")
print(f"CUDA device count: {torch.cuda.device_count()}")
# Get VRAM info
vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
vram_reserved = torch.cuda.memory_reserved(0) / 1024**3
vram_allocated = torch.cuda.memory_allocated(0) / 1024**3
vram_free = vram_total - vram_allocated
print(f"\n=== VRAM Usage ===")
print(f"Total VRAM: {vram_total:.2f} GB")
print(f"Allocated VRAM: {vram_allocated:.2f} GB")
print(f"Reserved VRAM: {vram_reserved:.2f} GB")
print(f"Free VRAM: {vram_free:.2f} GB")
else:
print("CUDA not available!")
# Get system RAM info
ram = psutil.virtual_memory()
print(f"\n=== System RAM ===")
print(f"Total RAM: {ram.total / 1024**3:.2f} GB")
print(f"Available RAM: {ram.available / 1024**3:.2f} GB")
print(f"Used RAM: {ram.used / 1024**3:.2f} GB ({ram.percent}%)")
def estimate_model_size(model_name: str, quantization: str = None):
"""Estimate model memory requirements"""
print(f"\n=== Model Memory Estimation ===")
print(f"Model: {model_name}")
# Common model sizes (in billions of parameters)
model_sizes = {
"gemma-2-2b": 2.5,
"gemma-3-1b": 1.2,
"llama-3.2-8b": 8,
"llama-3.2-13b": 13,
"llama-3.2-70b": 70,
}
# Find model size
model_key = None
for key in model_sizes:
if key in model_name.lower():
model_key = key
break
if model_key:
params_billions = model_sizes[model_key]
# Memory estimates (rough)
fp32_gb = params_billions * 4 # 4 bytes per parameter
fp16_gb = params_billions * 2 # 2 bytes per parameter
int8_gb = params_billions * 1 # 1 byte per parameter
int4_gb = params_billions * 0.5 # 0.5 bytes per parameter
print(f"Estimated parameters: {params_billions}B")
print(f"Memory requirements:")
print(f" FP32: ~{fp32_gb:.1f} GB")
print(f" FP16/BF16: ~{fp16_gb:.1f} GB")
print(f" INT8: ~{int8_gb:.1f} GB")
print(f" INT4 (QLoRA): ~{int4_gb:.1f} GB")
# Add overhead for activations and gradients
print(f"\nWith training overhead:")
print(f" FP16 + LoRA: ~{fp16_gb * 1.5:.1f} GB")
print(f" INT4 + QLoRA: ~{int4_gb * 1.5:.1f} GB")
else:
print("Model size not recognized, unable to estimate memory requirements")
def suggest_offloading_strategies():
"""Suggest CPU offloading strategies"""
print("\n=== CPU Offloading Strategies ===")
print("\n1. **Device Map Auto with CPU Offload**")
print(" ```python")
print(" device_map = {")
print(" 'model.embed_tokens': 'cpu',")
print(" 'model.layers.0': 0, # GPU")
print(" 'model.layers.1': 0, # GPU")
print(" 'model.layers.2': 'cpu', # CPU")
print(" # ... distribute layers between GPU and CPU")
print(" }")
print(" ```")
print("\n2. **Accelerate's CPU Offload**")
print(" ```yaml")
print(" model:")
print(" device_map: 'auto'")
print(" max_memory:")
print(" 0: '4GB' # Limit GPU memory")
print(" 'cpu': '20GB' # Allow CPU memory")
print(" ```")
print("\n3. **DeepSpeed ZeRO-Offload**")
print(" - ZeRO-2: Offload optimizer states to CPU")
print(" - ZeRO-3: Offload optimizer states and parameters to CPU")
print(" ```yaml")
print(" deepspeed:")
print(" zero_optimization:")
print(" stage: 2")
print(" offload_optimizer:")
print(" device: 'cpu'")
print(" ```")
print("\n4. **Gradient Checkpointing + CPU Offload**")
print(" - Trade compute for memory")
print(" - Combine with layer-wise CPU offloading")
print("\n5. **QLoRA with CPU Offload**")
print(" - 4-bit quantization reduces base model size")
print(" - Only LoRA parameters on GPU")
print(" - Base model layers can be on CPU")
def check_config_compatibility(config_path: str):
"""Check if config is compatible with CPU offloading"""
if Path(config_path).exists():
with open(config_path) as f:
config = yaml.safe_load(f)
print(f"\n=== Config Analysis: {config_path} ===")
model_config = config.get("model", {})
print(f"Current settings:")
print(f" 4-bit quantization: {model_config.get('load_in_4bit', False)}")
print(f" Gradient checkpointing: {model_config.get('gradient_checkpointing', False)}")
print(f" Device map: {model_config.get('device_map', 'None')}")
if model_config.get('load_in_4bit', False):
print("✓ Already using 4-bit quantization (good for memory)")
else:
print("✗ Consider enabling 4-bit quantization")
if not model_config.get('gradient_checkpointing', False):
print("✗ Consider enabling gradient checkpointing")
def main():
"""Main function"""
print("VRAM and Memory Analysis for Progressive LLM Training")
print("=" * 60)
# Get memory info
get_memory_info()
# Estimate model sizes
models = [
"google/gemma-2-2b-it",
"google/gemma-3-1b-pt",
"meta-llama/Llama-3.2-8B",
]
for model in models:
estimate_model_size(model)
# Suggest strategies
suggest_offloading_strategies()
# Check configs
configs = [
"config/training_config_gemma3_1b.yaml",
"config/training_config_gemma2_small.yaml",
]
for config in configs:
check_config_compatibility(config)
print("\n=== Recommendations ===")
print("1. Start with QLoRA (4-bit) if not already enabled")
print("2. Use device_map with max_memory limits")
print("3. Enable gradient checkpointing")
print("4. Consider DeepSpeed for advanced offloading")
print("5. Monitor actual usage during training")
if __name__ == "__main__":
main()
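
For reference, strategies 1 and 2 from suggest_offloading_strategies() can be expressed in a few lines of transformers code; a minimal sketch, where the model id and memory caps are placeholders rather than values taken from the configs this script inspects:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",                  # placeholder model id
    quantization_config=bnb,
    device_map="auto",                       # let accelerate place layers automatically
    max_memory={0: "4GiB", "cpu": "20GiB"},  # cap GPU usage, spill the rest to CPU
)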

183
scripts/compare_models_tui.py Executable file
View file

@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
TUI for comparing original and trained models
"""
import sys
from pathlib import Path
import yaml
import torch
from rich.console import Console
from rich.panel import Panel
from rich.columns import Columns
from rich.prompt import Prompt
from rich.text import Text
from rich.layout import Layout
from rich.live import Live
from rich.table import Table
import time
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
from src.progressive_model import ProgressiveReasoningModel
class ModelCompareTUI:
def __init__(self, config_path: str = "config/training_config.yaml"):
self.console = Console()
# Load configuration
with open(config_path) as f:
self.config = yaml.safe_load(f)
# Initialize models
self.console.print("[yellow]Loading models...[/yellow]")
# Original model
self.original_model = ProgressiveReasoningModel(self.config)
self.original_model.setup_base_model()
# Trained model
self.trained_model = ProgressiveReasoningModel(self.config)
self.trained_model.setup_base_model()
# Load the trained adapter if it exists
adapter_path = Path(self.config["experiment"]["output_dir"]) / "adapters" / "basic_cot"
if adapter_path.exists():
self.console.print(f"[green]Loading trained adapter from: {adapter_path}[/green]")
self.trained_model.load_for_inference(["basic_cot"])
else:
self.console.print("[red]No trained adapter found. Please run training first.[/red]")
self.console.print("[yellow]Both models will show original behavior.[/yellow]")
self.console.print("[green]Models loaded successfully![/green]\n")
def generate_response(self, model, prompt: str, with_think_tags: bool = True) -> str:
"""Generate response from a model"""
# For trained model, encourage think tags
if with_think_tags and model == self.trained_model:
formatted_prompt = f"{prompt}\n\nPlease think step by step."
else:
formatted_prompt = prompt
inputs = model.tokenizer(formatted_prompt, return_tensors="pt").to(model.model.device)
with torch.no_grad():
outputs = model.model.generate(
**inputs,
max_length=512,
temperature=0.7,
do_sample=True,
top_p=0.95,
pad_token_id=model.tokenizer.pad_token_id,
eos_token_id=model.tokenizer.eos_token_id
)
response = model.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract response after prompt
response = response[len(formatted_prompt):].strip()
return response
def create_comparison_panel(self, prompt: str, original_response: str, trained_response: str) -> Panel:
"""Create a panel showing the comparison"""
# Create table
table = Table(show_header=True, header_style="bold magenta", expand=True)
table.add_column("Original Model", style="cyan", width=50)
table.add_column("Trained Model (with CoT)", style="green", width=50)
table.add_row(original_response, trained_response)
return Panel(
table,
title=f"[bold yellow]Prompt: {prompt}[/bold yellow]",
border_style="blue"
)
def run_interactive_mode(self):
"""Run interactive comparison mode"""
self.console.print("\n[bold cyan]Model Comparison TUI[/bold cyan]")
self.console.print("Compare responses from original and trained models\n")
self.console.print("[dim]Type 'quit' or 'exit' to leave[/dim]\n")
while True:
# Get user prompt
prompt = Prompt.ask("\n[bold yellow]Enter your prompt[/bold yellow]")
if prompt.lower() in ['quit', 'exit']:
self.console.print("\n[yellow]Goodbye![/yellow]")
break
# Generate responses
self.console.print("\n[dim]Generating responses...[/dim]")
start_time = time.time()
original_response = self.generate_response(self.original_model, prompt, with_think_tags=False)
original_time = time.time() - start_time
start_time = time.time()
trained_response = self.generate_response(self.trained_model, prompt, with_think_tags=True)
trained_time = time.time() - start_time
# Display comparison
panel = self.create_comparison_panel(prompt, original_response, trained_response)
self.console.print(panel)
# Show generation times
self.console.print(f"\n[dim]Generation times - Original: {original_time:.2f}s, Trained: {trained_time:.2f}s[/dim]")
def run_benchmark_mode(self):
"""Run benchmark with predefined prompts"""
test_prompts = [
"What is 156 + 389?",
"If I have 23 apples and buy 17 more, how many do I have?",
"A store has 145 items. If 38 are sold, how many remain?",
"What is 45 * 12?",
"Explain why 2 + 2 = 4",
"If a train travels 80 km/h for 2.5 hours, how far does it go?",
"What is the sum of all numbers from 1 to 10?",
"How many minutes are in 3.5 hours?",
]
self.console.print("\n[bold cyan]Running Benchmark Comparison[/bold cyan]\n")
for i, prompt in enumerate(test_prompts, 1):
self.console.print(f"[bold]Test {i}/{len(test_prompts)}[/bold]")
# Generate responses
original_response = self.generate_response(self.original_model, prompt, with_think_tags=False)
trained_response = self.generate_response(self.trained_model, prompt, with_think_tags=True)
# Display comparison
panel = self.create_comparison_panel(prompt, original_response, trained_response)
self.console.print(panel)
self.console.print("")
self.console.print("[green]Benchmark completed![/green]")
def main():
import argparse
parser = argparse.ArgumentParser(description="Compare original and trained models")
parser.add_argument("--mode", choices=["interactive", "benchmark"], default="interactive",
help="Mode to run the comparison")
parser.add_argument("--config", default="config/training_config.yaml",
help="Path to configuration file")
args = parser.parse_args()
# Create TUI
tui = ModelCompareTUI(args.config)
# Run in selected mode
if args.mode == "interactive":
tui.run_interactive_mode()
else:
tui.run_benchmark_mode()
if __name__ == "__main__":
main()

59
scripts/evaluate.py Executable file
View file

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
"""
Evaluation script for progressive model
"""
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent))
from src.progressive_model import ProgressiveReasoningModel
import yaml
def evaluate_reasoning(model_wrapper, test_prompts):
"""Evaluate model on test prompts"""
results = []
for prompt in test_prompts:
print(f"\nPrompt: {prompt}")
response = model_wrapper.generate_with_reasoning(prompt)
print(f"Response: {response}")
results.append({
"prompt": prompt,
"response": response
})
return results
def main():
# Load config
with open("config/training_config.yaml") as f:
config = yaml.safe_load(f)
# Initialize model
model_wrapper = ProgressiveReasoningModel(config)
model_wrapper.setup_base_model()
# Test different adapters
test_prompts = [
"What is 156 + 389?",
"If a train travels 80 km/h for 2.5 hours, how far does it go?",
"Explain why the sky is blue.",
]
# Test each adapter that has been saved to disk (the in-memory adapter
# registry is only populated during training, so also check the output directory)
adapters_dir = Path(config["experiment"]["output_dir"]) / "adapters"
for adapter_name in ["basic_cot", "math_reasoning", "complex_reasoning"]:
if adapter_name in model_wrapper.adapters or (adapters_dir / adapter_name).exists():
print(f"\n{'='*50}")
print(f"Testing adapter: {adapter_name}")
print(f"{'='*50}")
model_wrapper.load_for_inference([adapter_name])
results = evaluate_reasoning(model_wrapper, test_prompts)
if __name__ == "__main__":
main()
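
A quick way to list which adapters are actually available on disk; a small sketch (the "./outputs" value is an assumption, the real output_dir comes from the config):

from pathlib import Path

adapters_dir = Path("./outputs") / "adapters"            # <output_dir>/adapters
for p in sorted(adapters_dir.glob("*/adapter_model.*")):
    print(p.parent.name, "->", p.name)                   # e.g. basic_cot -> adapter_model.safetensors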

189
scripts/simple_compare.py Executable file
View file

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Simple comparison script without rich TUI
"""
import sys
from pathlib import Path
import yaml
import torch
import argparse
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
from src.progressive_model import ProgressiveReasoningModel
def parse_args():
parser = argparse.ArgumentParser(description="Compare original and trained models")
parser.add_argument(
"--config", "-c",
type=str,
default="config/training_config_gemma2_small.yaml",
help="Path to configuration file"
)
parser.add_argument(
"--adapter", "-a",
type=str,
default="basic_cot",
help="Adapter name to load for comparison"
)
parser.add_argument(
"--max-length",
type=int,
default=512,
help="Maximum generation length"
)
return parser.parse_args()
def load_config(config_path):
"""Load configuration from file"""
config_path = Path(config_path)
if not config_path.exists():
raise FileNotFoundError(f"Configuration file not found: {config_path}")
with open(config_path) as f:
config = yaml.safe_load(f)
return config
def generate_response(model, tokenizer, prompt, max_length=512):
"""Generate response using the model"""
# Format prompt for Gemma
formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
# Tokenize
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
# Generate
with torch.no_grad():
outputs = model.generate(
**inputs,
max_length=len(inputs["input_ids"][0]) + max_length,
temperature=0.7,
do_sample=True,
top_p=0.9,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
repetition_penalty=1.1,
)
# Decode
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract only the model's response
if "<start_of_turn>model" in response:
response = response.split("<start_of_turn>model")[-1].strip()
return response
def main():
args = parse_args()
try:
config = load_config(args.config)
except FileNotFoundError as e:
print(f"Error: {e}")
return
print(f"Progressive Model Comparison")
print(f"Config: {args.config}")
print(f"Base model: {config['experiment']['base_model']}")
print(f"Adapter: {args.adapter}")
print("="*60)
print("Loading models...")
# Original model (no adapter)
print("Loading original model...")
original_model = ProgressiveReasoningModel(config)
original_model.setup_base_model()
# Trained model (with adapter)
print("Loading trained model...")
trained_model = ProgressiveReasoningModel(config)
trained_model.setup_base_model()
# Load the trained adapter if it exists
adapter_path = Path(config["experiment"]["output_dir"]) / "adapters" / args.adapter
if adapter_path.exists():
print(f"Loading trained adapter from: {adapter_path}")
try:
trained_model.load_for_inference([args.adapter])
print("Adapter loaded successfully!")
except Exception as e:
print(f"Error loading adapter: {e}")
print("Will compare with base model instead.")
else:
print(f"No trained adapter found at: {adapter_path}")
print("Available adapters:")
adapters_dir = Path(config["experiment"]["output_dir"]) / "adapters"
if adapters_dir.exists():
for adapter_dir in adapters_dir.iterdir():
if adapter_dir.is_dir():
print(f" - {adapter_dir.name}")
else:
print(" No adapters directory found.")
print("Both models will show original behavior.")
print("\nModels loaded! Enter prompts to compare (type 'quit' to exit)")
print("Examples:")
print(" - What is 25 + 17?")
print(" - Explain why the sky is blue")
print(" - Solve this step by step: If I have 10 apples and give away 3, how many do I have left?")
print()
while True:
try:
prompt = input("\nPrompt: ").strip()
if prompt.lower() in ['quit', 'exit', 'q']:
break
if not prompt:
continue
print(f"\n{'='*60}")
print("ORIGINAL MODEL (No fine-tuning)")
print("="*60)
try:
original_response = generate_response(
original_model.model,
original_model.tokenizer,
prompt,
args.max_length
)
print(original_response)
except Exception as e:
print(f"Error generating original response: {e}")
print(f"\n{'='*60}")
print(f"TRAINED MODEL (With {args.adapter} adapter)")
print("="*60)
try:
# Add CoT prompt for trained model
cot_prompt = f"{prompt}\n\nPlease think step by step using <think> tags."
trained_response = generate_response(
trained_model.model,
trained_model.tokenizer,
cot_prompt,
args.max_length
)
print(trained_response)
except Exception as e:
print(f"Error generating trained response: {e}")
except KeyboardInterrupt:
print("\nExiting...")
break
except Exception as e:
print(f"Error: {e}")
continue
if __name__ == "__main__":
main()

131
scripts/train_progressive.py Executable file
View file

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Main training script for progressive reasoning model
"""
import sys
import yaml
import argparse
from pathlib import Path
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
from src.progressive_model import ProgressiveReasoningModel
from src.training import ProgressiveTrainer
from src.data_utils import prepare_sample_datasets
def parse_args():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(
description="Progressive LLM Training for 松尾研LLMコンペ2025",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Use default config
python scripts/train_progressive.py
# Use specific config file
python scripts/train_progressive.py --config config/training_config_large.yaml
# Use config with custom path
python scripts/train_progressive.py --config /path/to/my_config.yaml
# Prepare sample datasets
python scripts/train_progressive.py --prepare-data
"""
)
parser.add_argument(
"--config", "-c",
type=str,
default="config/training_config.yaml",
help="Path to the training configuration file (default: config/training_config.yaml)"
)
parser.add_argument(
"--prepare-data",
action="store_true",
help="Prepare sample datasets before training"
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Load config and model but skip training (for testing)"
)
return parser.parse_args()
def load_config(config_path: str) -> dict:
"""Load configuration from file"""
config_path = Path(config_path)
if not config_path.exists():
raise FileNotFoundError(f"Configuration file not found: {config_path}")
print(f"Loading configuration from: {config_path}")
with open(config_path) as f:
config = yaml.safe_load(f)
return config
def main():
args = parse_args()
print("Progressive LLM Training for 松尾研LLMコンペ2025")
print("=" * 50)
# Load configuration
try:
config = load_config(args.config)
except FileNotFoundError as e:
print(f"Error: {e}")
print("Available config files:")
config_dir = Path("config")
if config_dir.exists():
for config_file in config_dir.glob("*.yaml"):
print(f" {config_file}")
sys.exit(1)
except Exception as e:
print(f"Error loading config: {e}")
sys.exit(1)
# Print configuration info
print(f"Experiment: {config['experiment']['name']}")
print(f"Base model: {config['experiment']['base_model']}")
print(f"Output directory: {config['experiment']['output_dir']}")
print(f"Stages: {len(config['progressive_stages'])}")
# Prepare sample datasets if requested
if args.prepare_data:
print("\nPreparing sample datasets...")
prepare_sample_datasets()
print("Sample datasets prepared.")
# Initialize model wrapper
print("\nInitializing model...")
model_wrapper = ProgressiveReasoningModel(config)
model_wrapper.setup_base_model()
if args.dry_run:
print("\nDry run completed. Model loaded successfully.")
return
# Initialize trainer
print("\nInitializing trainer...")
trainer = ProgressiveTrainer(model_wrapper, config)
# Run progressive training
print("\nStarting progressive training...")
trainer.run_progressive_training()
print("\nTraining completed successfully!")
if __name__ == "__main__":
main()
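
The script assumes a YAML config shaped roughly like the dict below (a hedged sketch: the key names are the ones read by this repo's code, the values are placeholders):

config = {
    "experiment": {
        "name": "progressive_reasoning",
        "base_model": "google/gemma-2-2b-it",
        "output_dir": "./outputs",
        "use_wandb": False,
    },
    "model": {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": "bfloat16",
        "bnb_4bit_use_double_quant": True,
        "device_map": "auto",
    },
    "progressive_stages": [
        {
            "name": "basic_cot",
            "description": "Basic chain-of-thought",
            "dataset_path": "./data/basic_cot",
            "adapter_config": {"r": 16, "lora_alpha": 32, "lora_dropout": 0.05,
                               "target_modules": ["q_proj", "v_proj"]},
            "training": {"num_epochs": 1, "per_device_batch_size": 1,
                         "gradient_accumulation_steps": 8, "learning_rate": 2e-4,
                         "warmup_steps": 10},
        },
    ],
}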

0
src/__init__.py Normal file
View file

88
src/data_utils.py Normal file
View file

@@ -0,0 +1,88 @@
import json
import jsonlines
from typing import List, Dict
from pathlib import Path
import random
def create_think_tag_example(question: str, reasoning: str, answer: str) -> Dict:
"""Create training example with think tags"""
output = f"<think>\n{reasoning}\n</think>\n\n{answer}"
return {
"input": question,
"output": output
}
def prepare_basic_cot_data(output_dir: str, num_examples: int = 1000):
"""Create basic Chain-of-Thought examples"""
output_path = Path(output_dir) / "basic_cot"
output_path.mkdir(parents=True, exist_ok=True)
examples = []
# Simple arithmetic examples
for i in range(num_examples // 2):
a = random.randint(10, 100)
b = random.randint(10, 100)
question = f"What is {a} + {b}?"
reasoning = f"To find {a} + {b}, I need to add these two numbers together.\n{a} + {b} = {a + b}"
answer = f"The answer is {a + b}."
examples.append(create_think_tag_example(question, reasoning, answer))
# Simple word problems
templates = [
{
"question": "If I have {a} apples and buy {b} more, how many apples do I have?",
"reasoning": "Starting with {a} apples, then adding {b} more apples.\nTotal: {a} + {b} = {result}",
"answer": "I have {result} apples."
},
{
"question": "A store has {a} items. If {b} are sold, how many remain?",
"reasoning": "Starting amount: {a} items\nSold: {b} items\nRemaining: {a} - {b} = {result}",
"answer": "There are {result} items remaining."
}
]
for i in range(num_examples // 2):
template = random.choice(templates)
a = random.randint(20, 200)
b = random.randint(10, min(50, a))
if "+" in template["reasoning"]:
result = a + b
else:
result = a - b
question = template["question"].format(a=a, b=b)
reasoning = template["reasoning"].format(a=a, b=b, result=result)
answer = template["answer"].format(result=result)
examples.append(create_think_tag_example(question, reasoning, answer))
# Save to jsonl
output_file = output_path / "train.jsonl"
with jsonlines.open(output_file, "w") as writer:
writer.write_all(examples)
print(f"Created {len(examples)} basic CoT examples at: {output_file}")
def prepare_sample_datasets(base_dir: str = "./data"):
"""Prepare sample datasets for all stages"""
base_path = Path(base_dir)
# Basic CoT
prepare_basic_cot_data(base_path)
# Math reasoning (placeholder)
math_path = base_path / "math_reasoning"
math_path.mkdir(parents=True, exist_ok=True)
# Complex reasoning (placeholder)
complex_path = base_path / "complex_reasoning"
complex_path.mkdir(parents=True, exist_ok=True)
print(f"Sample datasets prepared in: {base_path}")

366
src/progressive_model.py Normal file
View file

@@ -0,0 +1,366 @@
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments
)
from peft import (
LoraConfig,
PeftModel,
TaskType,
get_peft_model,
prepare_model_for_kbit_training
)
from typing import Dict, List, Optional, Tuple
import json
from pathlib import Path
class ProgressiveReasoningModel:
"""Progressive training approach for reasoning models"""
def __init__(self, config: dict):
self.config = config
self.base_model_name = config["experiment"]["base_model"]
self.output_dir = Path(config["experiment"]["output_dir"])
self.output_dir.mkdir(parents=True, exist_ok=True)
self.model = None
self.tokenizer = None
self.adapters = {}
self.training_history = []
def setup_base_model(self):
"""Initialize base model with quantization"""
print(f"Loading base model: {self.base_model_name}")
# Check if quantization is enabled
if self.config["model"].get("load_in_4bit", False):
# BitsAndBytes config for 4-bit quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=getattr(torch, self.config["model"]["bnb_4bit_compute_dtype"]),
bnb_4bit_use_double_quant=self.config["model"]["bnb_4bit_use_double_quant"],
bnb_4bit_quant_type=self.config["model"].get("bnb_4bit_quant_type", "nf4")
)
quantization_config = bnb_config
else:
quantization_config = None
# Model loading arguments
model_kwargs = {
"device_map": self.config["model"]["device_map"],
"trust_remote_code": True,
"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
}
# Add authentication token if provided
if "hf_token" in self.config["model"] and self.config["model"]["hf_token"]:
model_kwargs["token"] = self.config["model"]["hf_token"]
# Add max_memory configuration for CPU offloading
if "max_memory" in self.config["model"]:
model_kwargs["max_memory"] = self.config["model"]["max_memory"]
print(f"Using max_memory configuration: {model_kwargs['max_memory']}")
# Add offload folder if specified
if "offload_folder" in self.config["model"]:
model_kwargs["offload_folder"] = self.config["model"]["offload_folder"]
model_kwargs["offload_state_dict"] = True
print(f"Using offload folder: {model_kwargs['offload_folder']}")
# Note: llm_int8_enable_fp32_cpu_offload is not supported for all models
# Only add it if we're not using Gemma models
if (quantization_config and
self.config["model"].get("llm_int8_enable_fp32_cpu_offload", False) and
"gemma" not in self.base_model_name.lower()):
model_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
print("Enabled FP32 CPU offload for quantized model")
# Add quantization config if enabled
if quantization_config:
model_kwargs["quantization_config"] = quantization_config
# Add attention implementation
if self.config["model"].get("use_flash_attention_2", False):
model_kwargs["attn_implementation"] = "flash_attention_2"
elif self.config["model"].get("use_eager_attention", False):
model_kwargs["attn_implementation"] = "eager"
# Load model
print("Loading model with the following kwargs:")
for k, v in model_kwargs.items():
if k != "quantization_config":
print(f" {k}: {v}")
else:
print(f" {k}: <BitsAndBytesConfig>")
try:
self.model = AutoModelForCausalLM.from_pretrained(
self.base_model_name,
**model_kwargs
)
except Exception as e:
print(f"Error loading model: {e}")
# Try without some problematic kwargs
if "offload_folder" in model_kwargs:
print("Retrying without offload_folder...")
del model_kwargs["offload_folder"]
del model_kwargs["offload_state_dict"]
self.model = AutoModelForCausalLM.from_pretrained(
self.base_model_name,
**model_kwargs
)
# Prepare for k-bit training if using quantization
if quantization_config:
self.model = prepare_model_for_kbit_training(self.model)
# Disable gradient checkpointing for now to avoid conflicts
# Enable gradient checkpointing if requested (but disable use_cache)
# if self.config["model"].get("gradient_checkpointing", False):
# self.model.gradient_checkpointing_enable()
# self.model.config.use_cache = False
# print("Gradient checkpointing enabled, use_cache disabled")
# Explicitly disable use_cache to avoid conflicts
if hasattr(self.model, 'config'):
self.model.config.use_cache = False
# Load tokenizer
tokenizer_kwargs = {"trust_remote_code": True}
if "hf_token" in self.config["model"] and self.config["model"]["hf_token"]:
tokenizer_kwargs["token"] = self.config["model"]["hf_token"]
self.tokenizer = AutoTokenizer.from_pretrained(
self.base_model_name,
**tokenizer_kwargs
)
# Set padding token and other special tokens for Gemma
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
# For Gemma models, ensure special tokens are set
if "gemma" in self.base_model_name.lower():
print("Configuring Gemma-specific tokenizer settings")
# Add special tokens if they don't exist
special_tokens = {
"bos_token": "<bos>",
"eos_token": "<eos>",
"pad_token": "<pad>",
}
# Only add tokens that don't already exist
tokens_to_add = {}
for token_name, token_value in special_tokens.items():
if getattr(self.tokenizer, token_name, None) is None:
tokens_to_add[token_name] = token_value
if tokens_to_add:
num_added = self.tokenizer.add_special_tokens(tokens_to_add)
print(f"Added special tokens: {tokens_to_add}")
if num_added > 0:
# Resize model embeddings to accommodate new tokens
self.model.resize_token_embeddings(len(self.tokenizer))
print(f"Resized model embeddings to {len(self.tokenizer)} tokens")
# Set appropriate model_max_length for Gemma
if hasattr(self.tokenizer, 'model_max_length') and self.tokenizer.model_max_length > 8192:
self.tokenizer.model_max_length = 8192
print(f"Set tokenizer model_max_length to {self.tokenizer.model_max_length}")
# Debug: print model structure for target module identification
print("Model structure:")
for name, module in self.model.named_modules():
if any(target in name for target in ['attn', 'proj', 'mlp', 'gate', 'up', 'down']):
print(f" {name}: {type(module).__name__}")
print("Base model loaded successfully")
def get_target_modules(self, suggested_modules):
"""Auto-detect valid target modules for the model"""
valid_modules = []
all_modules = [name for name, _ in self.model.named_modules()]
# Check each suggested module
for module_name in suggested_modules:
# Find modules that contain this name
matching_modules = [name for name in all_modules if module_name in name]
if matching_modules:
valid_modules.append(module_name)
print(f" Found target module: {module_name} (matches: {len(matching_modules)} modules)")
else:
print(f" Warning: target module '{module_name}' not found in model")
# If no valid modules found, try common alternatives
if not valid_modules:
print(" No suggested modules found, trying common alternatives...")
common_alternatives = [
"q_proj", "k_proj", "v_proj", "o_proj", # Common attention
"gate_proj", "up_proj", "down_proj", # Common MLP
"self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj", # Full path
"mlp.gate_proj", "mlp.up_proj", "mlp.down_proj", # Full MLP path
]
for module_name in common_alternatives:
matching_modules = [name for name in all_modules if module_name in name]
if matching_modules:
valid_modules.append(module_name)
print(f" Found alternative target module: {module_name}")
if len(valid_modules) >= 2: # At least 2 modules
break
if not valid_modules:
print(" ERROR: No valid target modules found!")
print(" Available modules containing 'proj' or 'attn':")
for name in all_modules:
if any(keyword in name.lower() for keyword in ['proj', 'attn', 'mlp']):
print(f" {name}")
# Fallback to a basic module that should exist
valid_modules = ["embed_tokens"]
return valid_modules
def create_adapter(self, stage_config: dict) -> LoraConfig:
"""Create LoRA adapter configuration"""
adapter_config = stage_config["adapter_config"]
# Get initialization method from config, default to True for identity init
init_method = adapter_config.get("init_lora_weights", True)
# Auto-detect valid target modules
suggested_modules = adapter_config["target_modules"]
valid_modules = self.get_target_modules(suggested_modules)
print(f"Using target modules: {valid_modules}")
return LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=adapter_config["r"],
lora_alpha=adapter_config["lora_alpha"],
lora_dropout=adapter_config["lora_dropout"],
target_modules=valid_modules,
bias="none",
init_lora_weights=init_method # Initialize LoRA weights (True = identity, "gaussian" = random)
)
def add_progressive_adapter(self, stage_name: str, stage_config: dict):
"""Add a new adapter for progressive training"""
print(f"\nAdding adapter for stage: {stage_name}")
# Check if we should inherit from previous adapter
if "inherit_from" in stage_config and stage_config["inherit_from"] in self.adapters:
print(f"Inheriting from: {stage_config['inherit_from']}")
# Load previous adapter as base
prev_adapter_path = self.adapters[stage_config["inherit_from"]]
self.model = PeftModel.from_pretrained(
self.model,
prev_adapter_path,
is_trainable=True
)
# Merge and unload to incorporate previous learning
self.model = self.model.merge_and_unload()
# Create new adapter config
lora_config = self.create_adapter(stage_config)
# Add adapter to model
self.model = get_peft_model(self.model, lora_config)
# Ensure model is in training mode
self.model.train()
# Print trainable parameters
self.model.print_trainable_parameters()
# Debug: check if any parameters require gradients
trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in self.model.parameters())
print(f"Trainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")
# List parameters that require gradients
grad_params = [name for name, param in self.model.named_parameters() if param.requires_grad]
print(f"Parameters requiring gradients: {len(grad_params)} parameters")
if len(grad_params) > 0:
print(f"First few: {grad_params[:5]}")
else:
print("WARNING: No parameters require gradients!")
# Save adapter path
adapter_path = self.output_dir / "adapters" / stage_name
adapter_path.mkdir(parents=True, exist_ok=True)
self.adapters[stage_name] = str(adapter_path)
def save_adapter(self, stage_name: str):
"""Save current adapter"""
if stage_name in self.adapters:
print(f"Saving adapter: {stage_name}")
self.model.save_pretrained(self.adapters[stage_name])
# Also save tokenizer for convenience
self.tokenizer.save_pretrained(self.adapters[stage_name])
def load_for_inference(self, adapter_names: List[str], weights: Optional[Dict[str, float]] = None):
"""Load model with specific adapters for inference"""
if len(adapter_names) == 1:
# Single adapter
adapter_name = adapter_names[0]
# Check if adapter path is in memory
if adapter_name in self.adapters:
adapter_path = self.adapters[adapter_name]
else:
# Try to find adapter in output directory
adapter_path = self.output_dir / "adapters" / adapter_name
if not adapter_path.exists():
raise ValueError(f"Adapter {adapter_name} not found at {adapter_path}")
adapter_path = str(adapter_path)
print(f"Loading adapter from: {adapter_path}")
self.model = PeftModel.from_pretrained(
self.model,
adapter_path
)
else:
# Multiple adapters - load and combine
# This is a simplified version - real implementation would need adapter composition
print("Multi-adapter inference not fully implemented in this bootstrap")
# For now, just load the last adapter
adapter_name = adapter_names[-1]
if adapter_name in self.adapters:
adapter_path = self.adapters[adapter_name]
else:
adapter_path = str(self.output_dir / "adapters" / adapter_name)
self.model = PeftModel.from_pretrained(
self.model,
adapter_path
)
def generate_with_reasoning(self, prompt: str, max_length: int = 2048) -> str:
"""Generate response with reasoning"""
# Format prompt with think tags expectation
formatted_prompt = f"{prompt}\n\nPlease think step by step using <think> tags before providing your answer."
# Tokenize
inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
# Generate
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_length=max_length,
temperature=0.7,
do_sample=True,
top_p=0.95,
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id
)
# Decode
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract response after prompt
response = response[len(formatted_prompt):].strip()
return response
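
Putting the methods above together, a typical lifecycle of this class is sketched below (config and stage_config are assumed to follow the structure used elsewhere in this repo):

wrapper = ProgressiveReasoningModel(config)
wrapper.setup_base_model()                                   # quantized base model + tokenizer
wrapper.add_progressive_adapter("basic_cot", stage_config)   # attach a fresh LoRA adapter
# ... ProgressiveTrainer runs SFT on the adapter here ...
wrapper.save_adapter("basic_cot")                            # writes <output_dir>/adapters/basic_cot
wrapper.load_for_inference(["basic_cot"])                    # reload the saved adapter
print(wrapper.generate_with_reasoning("What is 156 + 389?"))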

450
src/training.py Normal file
View file

@@ -0,0 +1,450 @@
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset, Dataset
import torch
from typing import Dict, List
import json
import jsonlines
from pathlib import Path
class ProgressiveTrainer:
"""Handle progressive training stages"""
def __init__(self, model_wrapper, config: dict):
self.model_wrapper = model_wrapper
self.config = config
self.training_history = []
def load_dataset(self, dataset_path: str, stage_config: dict = None) -> Dataset:
"""Load dataset from jsonl files or HuggingFace datasets"""
print(f"Loading dataset from path: {dataset_path}")
# Check if it's a HuggingFace dataset (contains '/')
if '/' in dataset_path and not Path(dataset_path).exists():
print(f"Loading HuggingFace dataset: {dataset_path}")
return self.load_huggingface_dataset(dataset_path, stage_config)
# Load local dataset
data = []
print(f"Current working directory: {Path.cwd()}")
# Support both single file and directory
path = Path(dataset_path)
print(f"Path exists: {path.exists()}")
print(f"Is file: {path.is_file()}")
print(f"Is directory: {path.is_dir()}")
if path.is_file():
files = [path]
else:
files = list(path.glob("*.jsonl"))
print(f"Found {len(files)} files to load")
for f in files:
print(f" - {f}")
for file_path in files:
print(f"Loading file: {file_path}")
try:
with jsonlines.open(file_path) as reader:
count = 0
for item in reader:
# Format for chat template
formatted = {
"messages": [
{"role": "user", "content": item["input"]},
{"role": "assistant", "content": item["output"]}
]
}
data.append(formatted)
count += 1
print(f" Loaded {count} examples from {file_path}")
except Exception as e:
print(f" Error loading file {file_path}: {e}")
print(f"Total examples loaded: {len(data)}")
return Dataset.from_list(data)
def load_huggingface_dataset(self, dataset_name: str, stage_config: dict) -> Dataset:
"""Load dataset from HuggingFace"""
try:
dataset_config = stage_config.get("dataset_config", {}) if stage_config else {}
# Default settings
split = dataset_config.get("split", "train")
max_samples = dataset_config.get("max_samples", None)
streaming = dataset_config.get("streaming", False)
print(f"Loading HuggingFace dataset: {dataset_name}")
print(f" Split: {split}")
print(f" Max samples: {max_samples}")
print(f" Streaming: {streaming}")
# Load dataset
if streaming:
dataset = load_dataset(dataset_name, split=split, streaming=True)
if max_samples:
dataset = dataset.take(max_samples)
# Convert streaming dataset to regular dataset
data = []
count = 0
for item in dataset:
data.append(item)
count += 1
if count % 1000 == 0:
print(f" Loaded {count} examples...")
if max_samples and count >= max_samples:
break
dataset = Dataset.from_list(data)
else:
dataset = load_dataset(dataset_name, split=split)
if max_samples:
dataset = dataset.select(range(min(max_samples, len(dataset))))
print(f" Loaded dataset with {len(dataset)} examples")
print(f" Dataset columns: {dataset.column_names}")
if len(dataset) > 0:
print(f" First example: {dataset[0]}")
# Convert to our expected format based on dataset name
if "math" in dataset_name.lower():
return self.convert_math_dataset(dataset)
elif "mixture-of-thoughts" in dataset_name.lower():
return self.convert_mixture_of_thoughts_dataset(dataset)
else:
return self.convert_generic_dataset(dataset)
except Exception as e:
print(f"Error loading HuggingFace dataset {dataset_name}: {e}")
print("Falling back to empty dataset")
return Dataset.from_list([])
def convert_math_dataset(self, dataset: Dataset) -> Dataset:
"""Convert OpenR1-Math-220k format to our training format"""
def format_math_example(example):
# OpenR1-Math-220k format has different column names
# Try to find the right columns
input_text = None
output_text = None
# Common column names in math datasets
if "question" in example:
input_text = example["question"]
elif "problem" in example:
input_text = example["problem"]
elif "input" in example:
input_text = example["input"]
elif "query" in example:
input_text = example["query"]
if "answer" in example:
output_text = example["answer"]
elif "solution" in example:
output_text = example["solution"]
elif "output" in example:
output_text = example["output"]
elif "response" in example:
output_text = example["response"]
# If we can't find the right columns, use the raw example
if input_text is None or output_text is None:
print(f"Warning: Could not parse example columns: {list(example.keys())}")
# Try to use the first two string fields
string_fields = [k for k, v in example.items() if isinstance(v, str) and len(v) > 10]
if len(string_fields) >= 2:
input_text = example[string_fields[0]]
output_text = example[string_fields[1]]
else:
# Skip this example
return None
# Format with think tags for math reasoning
formatted_output = f"<think>\nLet me solve this step by step.\n\n{output_text}\n</think>\n\n{output_text}"
return {
"messages": [
{"role": "user", "content": input_text},
{"role": "assistant", "content": formatted_output}
]
}
# Convert and filter out None results
converted = dataset.map(format_math_example, desc="Converting math dataset")
converted = converted.filter(lambda x: x is not None, desc="Filtering valid examples")
print(f"Converted {len(converted)} math examples")
if len(converted) > 0:
print(f"First converted example: {converted[0]}")
return converted
def convert_mixture_of_thoughts_dataset(self, dataset: Dataset) -> Dataset:
"""Convert Mixture-of-Thoughts format to our training format"""
def format_mot_example(example):
# Mixture-of-Thoughts typically has complex reasoning patterns
# Check for common column names in the dataset
input_text = None
output_text = None
# Try to identify input/output columns
if "prompt" in example:
input_text = example["prompt"]
elif "question" in example:
input_text = example["question"]
elif "input" in example:
input_text = example["input"]
elif "instruction" in example:
input_text = example["instruction"]
if "response" in example:
output_text = example["response"]
elif "output" in example:
output_text = example["output"]
elif "completion" in example:
output_text = example["completion"]
elif "answer" in example:
output_text = example["answer"]
# If columns not found, look for thinking patterns
if input_text is None or output_text is None:
# Try to find columns with substantial text
for key, value in example.items():
if isinstance(value, str) and len(value) > 20:
if input_text is None and any(q in key.lower() for q in ["prompt", "question", "input"]):
input_text = value
elif output_text is None and any(a in key.lower() for a in ["response", "answer", "output"]):
output_text = value
if input_text is None or output_text is None:
print(f"Warning: Could not parse MoT example columns: {list(example.keys())}")
return None
# Check if output already contains thinking tags
if "<think>" in output_text or "思考" in output_text:
# Already formatted with thinking
formatted_output = output_text
else:
# Add thinking structure for complex reasoning
formatted_output = f"<think>\nLet me break this down step by step.\n\n{output_text}\n</think>\n\nBased on my analysis, {output_text}"
return {
"messages": [
{"role": "user", "content": input_text},
{"role": "assistant", "content": formatted_output}
]
}
# Convert and filter
converted = dataset.map(format_mot_example, desc="Converting Mixture-of-Thoughts dataset")
converted = converted.filter(lambda x: x is not None, desc="Filtering valid examples")
print(f"Converted {len(converted)} Mixture-of-Thoughts examples")
if len(converted) > 0:
print(f"First converted example: {converted[0]}")
return converted
def convert_generic_dataset(self, dataset: Dataset) -> Dataset:
"""Convert generic dataset format to our training format"""
def format_generic_example(example):
# Generic conversion for unknown dataset formats
input_text = None
output_text = None
# Look for any text columns
text_columns = [(k, v) for k, v in example.items() if isinstance(v, str) and len(v) > 10]
if len(text_columns) >= 2:
# Use first two substantial text columns
input_text = text_columns[0][1]
output_text = text_columns[1][1]
elif len(text_columns) == 1:
# Only one text column - skip this example
return None
else:
return None
return {
"messages": [
{"role": "user", "content": input_text},
{"role": "assistant", "content": output_text}
]
}
converted = dataset.map(format_generic_example, desc="Converting generic dataset")
converted = converted.filter(lambda x: x is not None, desc="Filtering valid examples")
print(f"Converted {len(converted)} generic examples")
return converted
def format_dataset(self, dataset: Dataset) -> Dataset:
"""Format dataset for training"""
print(f"Dataset before formatting: {len(dataset)} examples")
print(f"First example: {dataset[0] if len(dataset) > 0 else 'No data'}")
# Check if tokenizer has chat template
has_chat_template = (
hasattr(self.model_wrapper.tokenizer, 'chat_template') and
self.model_wrapper.tokenizer.chat_template is not None
)
if not has_chat_template:
print("No chat template found, setting default Gemma chat template")
# Set a simple chat template for Gemma
# Map the 'assistant' role to Gemma's 'model' turn so training text matches the inference-time format
self.model_wrapper.tokenizer.chat_template = "{% for message in messages %}<start_of_turn>{{ 'model' if message['role'] == 'assistant' else message['role'] }}\n{{ message['content'] }}<end_of_turn>\n{% endfor %}"
def format_chat(example):
# Try to use chat template if available
if has_chat_template or self.model_wrapper.tokenizer.chat_template:
try:
text = self.model_wrapper.tokenizer.apply_chat_template(
example["messages"],
tokenize=False,
add_generation_prompt=False
)
return {"text": text}
except Exception as e:
print(f"Chat template failed: {e}, using fallback")
# Fallback: create simple formatted text
if "messages" in example:
user_msg = example["messages"][0]["content"]
assistant_msg = example["messages"][1]["content"]
return {"text": f"<start_of_turn>user\n{user_msg}<end_of_turn>\n<start_of_turn>model\n{assistant_msg}<end_of_turn>\n"}
elif "input" in example and "output" in example:
return {"text": f"<start_of_turn>user\n{example['input']}<end_of_turn>\n<start_of_turn>model\n{example['output']}<end_of_turn>\n"}
else:
return {"text": str(example)}
# Format dataset
formatted = dataset.map(format_chat, batched=False, desc="Formatting dataset")
print(f"Dataset after formatting: {len(formatted)} examples")
if len(formatted) > 0:
print(f"Columns: {formatted.column_names}")
print(f"First formatted example: {formatted[0]}")
# Keep only the 'text' column for SFTTrainer
if 'text' in formatted.column_names:
columns_to_remove = [col for col in formatted.column_names if col != 'text']
if columns_to_remove:
formatted = formatted.remove_columns(columns_to_remove)
return formatted
def filter_by_length(self, dataset: Dataset, max_length: int) -> Dataset:
"""Filter dataset by sequence length"""
def is_valid_length(example):
# Tokenize and check length
tokens = self.model_wrapper.tokenizer(
example["text"],
truncation=False,
return_length=True
)
return len(tokens["input_ids"]) <= max_length
filtered = dataset.filter(is_valid_length, desc="Filtering by length")
print(f"Filtered dataset: {len(filtered)} examples (max_length={max_length})")
return filtered
def train_stage(self, stage_name: str, stage_config: dict):
"""Train a single stage"""
print(f"\n{'='*50}")
print(f"Training stage: {stage_name}")
print(f"Description: {stage_config['description']}")
print(f"{'='*50}\n")
# Add adapter
self.model_wrapper.add_progressive_adapter(stage_name, stage_config)
# Load and format dataset
dataset = self.load_dataset(stage_config["dataset_path"], stage_config)
dataset = self.format_dataset(dataset)
# Filter by sequence length if specified
if "max_length" in stage_config["training"]:
dataset = self.filter_by_length(dataset, stage_config["training"]["max_length"])
print(f"Final dataset size: {len(dataset)} examples")
# Training arguments - with CPU offload optimizations
training_args = TrainingArguments(
output_dir=f"./outputs/checkpoints/{stage_name}",
num_train_epochs=stage_config["training"]["num_epochs"],
per_device_train_batch_size=stage_config["training"]["per_device_batch_size"],
gradient_accumulation_steps=stage_config["training"]["gradient_accumulation_steps"],
learning_rate=float(stage_config["training"]["learning_rate"]), # Ensure it's a float
warmup_steps=stage_config["training"]["warmup_steps"],
logging_steps=stage_config["training"].get("logging_steps", 10),
save_strategy="epoch",
eval_strategy="no",
bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
gradient_checkpointing=self.config["model"].get("gradient_checkpointing", False),
max_grad_norm=stage_config["training"].get("max_grad_norm", 1.0),
report_to="wandb" if self.config["experiment"]["use_wandb"] else "none",
run_name=f"{self.config['experiment']['name']}_{stage_name}",
dataloader_pin_memory=False, # Reduce memory usage
remove_unused_columns=False, # Keep all columns
optim=stage_config["training"].get("optim", "adamw_torch"), # Support 8-bit optimizers
dataloader_num_workers=stage_config["training"].get("dataloader_num_workers", 2),
)
# Print dataset info for debugging
print(f"Dataset columns: {dataset.column_names}")
print(f"Dataset first example: {dataset[0]}")
# Ensure model is in training mode before creating trainer
self.model_wrapper.model.train()
# Final check of trainable parameters
trainable_params = sum(p.numel() for p in self.model_wrapper.model.parameters() if p.requires_grad)
print(f"Final check - Trainable parameters: {trainable_params:,}")
# Create trainer with minimal configuration
try:
trainer = SFTTrainer(
model=self.model_wrapper.model,
processing_class=self.model_wrapper.tokenizer,
train_dataset=dataset,
args=training_args,
packing=False, # Disable packing for better gradient flow
)
except Exception as e:
print(f"Error creating SFTTrainer: {e}")
print("Trying with basic configuration...")
trainer = SFTTrainer(
model=self.model_wrapper.model,
processing_class=self.model_wrapper.tokenizer,
train_dataset=dataset,
args=training_args,
)
# Train
trainer.train()
# Save adapter
self.model_wrapper.save_adapter(stage_name)
# Record history
self.training_history.append({
"stage": stage_name,
"config": stage_config,
"metrics": trainer.state.log_history
})
print(f"\nCompleted training stage: {stage_name}")
def run_progressive_training(self):
"""Run all training stages progressively"""
stages = self.config["progressive_stages"]
for stage_config in stages:
stage_name = stage_config["name"]
self.train_stage(stage_name, stage_config)
# Save training history
history_path = Path(self.config["experiment"]["output_dir"]) / "training_history.json"
with open(history_path, "w") as f:
json.dump(self.training_history, f, indent=2)
print(f"\nAll stages completed! Training history saved to: {history_path}")

35
test_data_load.py Normal file
View file

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""Test data loading"""
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent))
from src.training import ProgressiveTrainer
from src.progressive_model import ProgressiveReasoningModel
import yaml
# Load config
with open("config/training_config.yaml") as f:
config = yaml.safe_load(f)
# Create dummy model wrapper
class DummyModelWrapper:
def __init__(self):
self.tokenizer = None
model_wrapper = DummyModelWrapper()
# Create trainer
trainer = ProgressiveTrainer(model_wrapper, config)
# Test data loading
stage_config = config["progressive_stages"][0]
dataset_path = stage_config["dataset_path"]
print(f"Loading dataset from: {dataset_path}")
dataset = trainer.load_dataset(dataset_path)
print(f"Loaded {len(dataset)} examples")
if len(dataset) > 0:
print(f"First example: {dataset[0]}")