photoncloud-monorepo/nix/test-cluster/work-root-budget.sh

#!/usr/bin/env bash
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit
set -o nounset
set -o pipefail

# Absolute directory that holds this script, and the directory two levels up.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
REPO_ROOT="$(
  cd "${SCRIPT_DIR}/../.." && pwd
)"

# ULTRACLOUD_WORK_ROOT, when set and non-empty, replaces the default
# "<repo root>/work" location.
if [[ -n "${ULTRACLOUD_WORK_ROOT:-}" ]]; then
  WORK_ROOT="${ULTRACLOUD_WORK_ROOT}"
else
  WORK_ROOT="${REPO_ROOT}/work"
fi

# Keep a caller-supplied PHOTON_CLUSTER_WORK_ROOT; otherwise default it to the
# test-cluster directory under the work root.
: "${PHOTON_CLUSTER_WORK_ROOT:=${WORK_ROOT}/test-cluster}"

#######################################
# Print the command synopsis, one line per supported subcommand.
# Outputs: usage text on stdout
#######################################
usage() {
  local -r self='./nix/test-cluster/work-root-budget.sh'
  local synopsis

  printf 'Usage:\n'
  for synopsis in \
    'status' \
    'enforce' \
    'cleanup-advice' \
    'prune-proof-logs [keep-count] [--apply]'; do
    printf '  %s %s\n' "${self}" "${synopsis}"
  done
}

#######################################
# Print the apparent size of a path in bytes (GNU `du -sb`).
# Arguments: $1 - file or directory to measure
# Outputs:   a single non-negative integer on stdout; 0 when the path does
#            not exist; a warning on stderr when du produced no usable total
# Returns:   0 always, so one unreadable entry cannot abort a whole report
#######################################
size_bytes() {
  local path="$1"
  local du_out bytes

  if [[ ! -e "${path}" ]]; then
    printf '0\n'
    return 0
  fi

  # du exits non-zero when it hits an unreadable entry or a file that vanishes
  # mid-scan, yet it still prints the total of everything it could read (and
  # its own diagnostic on stderr). Keep that total instead of letting the
  # failure kill the caller under `set -e`.
  du_out="$(du -sb -- "${path}")" || true

  # The first whitespace-delimited field of du's output is the byte count.
  bytes="${du_out%%[[:space:]]*}"
  if [[ "${bytes}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${bytes}"
  else
    printf 'warning: could not measure %s; counting it as 0 bytes\n' "${path}" >&2
    printf '0\n'
  fi
}

#######################################
# Render a byte count with IEC scaling and a trailing "B" via numfmt.
# Arguments: $1 - number of bytes
# Outputs:   the formatted size on stdout
#######################################
human_size() {
  local byte_count="$1"
  numfmt --suffix=B --to=iec "${byte_count}"
}

#######################################
# List the proof/log root directories under the work root, one per line.
# Globals: WORK_ROOT (read)
# Outputs: eight absolute-or-relative paths (as WORK_ROOT is) on stdout
#######################################
proof_roots() {
  local -a root_names=(
    publishable-kvm-suite
    final-proofs
    durability-proof
    rollout-soak
    provider-vm-reality-proof
    baremetal-iso-e2e
    core-control-plane-ops-proof
    hardware-smoke
  )
  local root_name

  for root_name in "${root_names[@]}"; do
    printf '%s\n' "${WORK_ROOT}/${root_name}"
  done
}

#######################################
# Print one status row: label, human-readable size, path, and an optional
# soft-budget annotation.
# Arguments: $1 - row label
#            $2 - path to measure
#            $3 - soft budget in bytes; a value of 0 or less omits the
#                 annotation
# Outputs:   a single line on stdout
#######################################
report_path() {
  local label="$1" path="$2" budget_bytes="$3"
  local used_bytes

  used_bytes="$(size_bytes "${path}")"
  printf '%-28s %10s  %s' "${label}" "$(human_size "${used_bytes}")" "${path}"

  # Rows without a budget end right after the path.
  if (( budget_bytes <= 0 )); then
    printf '\n'
    return 0
  fi

  local verdict=""
  if (( used_bytes > budget_bytes )); then
    verdict=", over budget"
  fi
  printf '  [soft budget %s%s]\n' "$(human_size "${budget_bytes}")" "${verdict}"
}

#######################################
# Print the disk-usage report for the work root: a header, one row per
# tracked area, and the transient and proof-log totals with their soft
# budgets.
# Globals: REPO_ROOT, WORK_ROOT, PHOTON_CLUSTER_WORK_ROOT (read)
# Outputs: the report on stdout
#######################################
status() {
  local -r gib=$((1024 * 1024 * 1024))
  local -r work_budget=$((60 * gib))
  local -r state_budget=$((35 * gib))
  local -r transient_budget=$((10 * gib))
  local -r proof_budget=$((20 * gib))

  echo "UltraCloud work-root disk budget status"
  echo "repo_root=${REPO_ROOT}"
  echo "work_root=${WORK_ROOT}"
  echo "photon_cluster_work_root=${PHOTON_CLUSTER_WORK_ROOT}"
  echo

  report_path "work root" "${WORK_ROOT}" "${work_budget}"
  report_path "cluster state" "${PHOTON_CLUSTER_WORK_ROOT}/state" "${state_budget}"
  report_path "tmp" "${WORK_ROOT}/tmp" 0
  report_path "publishable runtime" "${WORK_ROOT}/publishable-kvm-runtime" 0
  report_path "publishable logs" "${WORK_ROOT}/publishable-kvm-suite" 0
  report_path "final proofs" "${WORK_ROOT}/final-proofs" 0
  report_path "durability proof" "${WORK_ROOT}/durability-proof" 0
  report_path "rollout soak" "${WORK_ROOT}/rollout-soak" 0
  report_path "provider or vm proof" "${WORK_ROOT}/provider-vm-reality-proof" 0
  report_path "baremetal exact proof" "${WORK_ROOT}/baremetal-iso-e2e" 0
  report_path "control-plane proof" "${WORK_ROOT}/core-control-plane-ops-proof" 0
  report_path "hardware smoke" "${WORK_ROOT}/hardware-smoke" 0
  echo

  local transient_size=0 proof_size=0
  local root root_size

  for root in "${WORK_ROOT}/tmp" "${WORK_ROOT}/publishable-kvm-runtime"; do
    root_size="$(size_bytes "${root}")"
    transient_size=$((transient_size + root_size))
  done

  # Sum over proof_roots rather than a hand-copied path list, so this total
  # always covers the same directories that prune-proof-logs walks.
  while IFS= read -r root; do
    root_size="$(size_bytes "${root}")"
    proof_size=$((proof_size + root_size))
  done < <(proof_roots)

  printf 'transient total: %s (soft budget %s)\n' "$(human_size "${transient_size}")" "$(human_size "${transient_budget}")"
  printf 'proof logs total: %s (soft budget %s)\n' "$(human_size "${proof_size}")" "$(human_size "${proof_budget}")"
}

#######################################
# Count how many of the four tracked areas (work root, cluster state,
# transient data, proof logs) exceed their soft budget.
# Globals: WORK_ROOT, PHOTON_CLUSTER_WORK_ROOT (read)
# Outputs: an integer from 0 to 4 on stdout
#######################################
budget_overages() {
  local -r gib=$((1024 * 1024 * 1024))
  local -r work_budget=$((60 * gib))
  local -r state_budget=$((35 * gib))
  local -r transient_budget=$((10 * gib))
  local -r proof_budget=$((20 * gib))
  local work_size state_size root root_size
  local transient_size=0 proof_size=0 overages=0

  work_size="$(size_bytes "${WORK_ROOT}")"
  state_size="$(size_bytes "${PHOTON_CLUSTER_WORK_ROOT}/state")"

  for root in "${WORK_ROOT}/tmp" "${WORK_ROOT}/publishable-kvm-runtime"; do
    root_size="$(size_bytes "${root}")"
    transient_size=$((transient_size + root_size))
  done

  # Sum over proof_roots rather than a hand-copied path list, so the enforced
  # total always covers the same directories that prune-proof-logs walks.
  while IFS= read -r root; do
    root_size="$(size_bytes "${root}")"
    proof_size=$((proof_size + root_size))
  done < <(proof_roots)

  # Plain `if` keeps each check independent of arithmetic exit statuses
  # under `set -e`.
  if (( work_size > work_budget )); then overages=$((overages + 1)); fi
  if (( state_size > state_budget )); then overages=$((overages + 1)); fi
  if (( transient_size > transient_budget )); then overages=$((overages + 1)); fi
  if (( proof_size > proof_budget )); then overages=$((overages + 1)); fi

  printf '%s\n' "${overages}"
}

#######################################
# Print the status report, then pass or fail depending on whether any
# tracked area is over its soft budget. On failure the cleanup advice and a
# prune dry-run hint are printed as well.
# Outputs: report and verdict on stdout
# Returns: 0 when no area is over budget, 1 otherwise
#######################################
enforce() {
  local over_count
  over_count="$(budget_overages)"

  status
  echo

  if (( over_count <= 0 )); then
    echo "Budget enforcement passed: all tracked work-root areas are within the configured soft budgets."
    return 0
  fi

  printf '%s\n' \
    "Budget enforcement failed: one or more tracked work-root areas are over the configured soft budget." \
    "Use cleanup-advice for the safe runtime cleanup sequence, or use prune-proof-logs for dated proof roots." \
    ""
  cleanup_advice
  printf '%s\n' \
    "" \
    "Safer dated-proof cleanup dry-run:" \
    "  ./nix/test-cluster/work-root-budget.sh prune-proof-logs 2"
  return 1
}

#######################################
# Print the manual cleanup sequence for the current checkout. Nothing is
# executed or deleted here; the commands are only shown.
# Globals: WORK_ROOT (read)
# Outputs: the advice text on stdout
#######################################
cleanup_advice() {
  local root

  # Shell-escape the work root so the printed rm commands stay safe to paste
  # when the path contains spaces or other special characters. Ordinary paths
  # come out of %q unchanged.
  printf -v root '%q' "${WORK_ROOT}"

  # The step 3 list mirrors the directories reported by status, including
  # baremetal-iso-e2e.
  cat <<EOF
Safe cleanup sequence for the current checkout:

1. Stop running lab VMs:
   nix run ./nix/test-cluster#cluster -- stop

2. Remove disposable VM runtime state:
   nix run ./nix/test-cluster#cluster -- clean
   rm -rf ${root}/tmp ${root}/publishable-kvm-runtime

3. Trim old proof roots you no longer need:
   rm -rf ${root}/publishable-kvm-suite/<old-run>
   rm -rf ${root}/final-proofs/<old-run>
   rm -rf ${root}/durability-proof/<old-run>
   rm -rf ${root}/rollout-soak/<old-run>
   rm -rf ${root}/provider-vm-reality-proof/<old-run>
   rm -rf ${root}/baremetal-iso-e2e/<old-run>
   rm -rf ${root}/core-control-plane-ops-proof/<old-run>
   rm -rf ${root}/hardware-smoke/<old-run>

4. Run a Nix store GC after old result symlinks are gone:
   nix store gc
EOF
}

#######################################
# List (dry-run) or delete (--apply) the oldest run directories under every
# proof root, keeping the newest <keep-count> per root. "Newest" means last
# in byte-wise name order, so run directories are expected to carry sortable
# (for example dated) names.
# Globals:   ULTRACLOUD_WORK_ROOT_PRUNE_ACK (read) - must equal YES for --apply
# Arguments: $1 - keep-count, a non-negative decimal integer (default 2)
#            $2 - optional "--apply"; "--apply" is also accepted as $1, with
#                 the keep-count then optional in $2
# Outputs:   per-root listing on stdout; errors on stderr
# Exits:     1 on an invalid argument or a missing acknowledgement
#######################################
prune_proof_logs() {
  local keep="${1:-2}"
  local apply="${2:-}"
  local mode="dry-run"

  # Usage advertises both arguments as optional, so accept "--apply" in the
  # first position and take the keep-count (if any) from the second.
  if [[ "${keep}" == "--apply" ]]; then
    keep="${apply:-2}"
    apply="--apply"
  fi

  if ! [[ "${keep}" =~ ^[0-9]+$ ]]; then
    echo "keep-count must be a non-negative integer" >&2
    exit 1
  fi
  # Anything longer than 18 digits could overflow 64-bit shell arithmetic and
  # wrap into a small or negative count, which would delete too much.
  if (( ${#keep} > 18 )); then
    echo "keep-count is too large" >&2
    exit 1
  fi
  # Force base 10: a leading zero would otherwise be read as octal ("010"
  # becomes 8, "08" is a hard arithmetic error).
  keep=$((10#${keep}))

  if [[ "${apply}" == "--apply" ]]; then
    mode="apply"
    if [[ "${ULTRACLOUD_WORK_ROOT_PRUNE_ACK:-}" != "YES" ]]; then
      echo "Refusing to delete proof logs without ULTRACLOUD_WORK_ROOT_PRUNE_ACK=YES" >&2
      exit 1
    fi
  elif [[ -n "${apply}" ]]; then
    echo "unknown prune-proof-logs flag: ${apply}" >&2
    exit 1
  fi

  local root
  while IFS= read -r root; do
    [[ -d "${root}" ]] || continue

    local -a dated_dirs=()
    # Direct child directories only, newest name first. LC_ALL=C pins the
    # ordering so the set of deletion candidates does not depend on the
    # caller's locale.
    mapfile -t dated_dirs < <(
      find "${root}" -mindepth 1 -maxdepth 1 -type d -printf '%P\n' | LC_ALL=C sort -r
    )
    if (( ${#dated_dirs[@]} <= keep )); then
      continue
    fi

    echo "${root}:"
    local idx candidate
    # Everything past the first `keep` entries is a deletion candidate.
    for (( idx = keep; idx < ${#dated_dirs[@]}; idx += 1 )); do
      candidate="${root}/${dated_dirs[idx]}"
      if [[ "${mode}" == "apply" ]]; then
        rm -rf -- "${candidate}"
        echo "  deleted ${candidate}"
      else
        echo "  would delete ${candidate}"
      fi
    done
  done < <(proof_roots)

  if [[ "${mode}" == "dry-run" ]]; then
    echo
    echo "Dry-run only. Re-run with:"
    echo "  ULTRACLOUD_WORK_ROOT_PRUNE_ACK=YES ./nix/test-cluster/work-root-budget.sh prune-proof-logs ${keep} --apply"
  fi
}

#######################################
# Dispatch the requested subcommand; with no argument, run `status`.
# Arguments: $1 - subcommand (status, enforce, cleanup-advice,
#                 prune-proof-logs, or -h/--help/help)
#            $2, $3 - forwarded to prune-proof-logs as keep-count and flag
# Exits:     1 after printing usage to stderr for an unknown subcommand
#######################################
main() {
  local cmd="${1:-status}"

  case "${cmd}" in
    status | enforce) "${cmd}" ;;
    cleanup-advice) cleanup_advice ;;
    prune-proof-logs) prune_proof_logs "${2:-2}" "${3:-}" ;;
    help | -h | --help) usage ;;
    *)
      usage >&2
      exit 1
      ;;
  esac
}

# Script entry point: hand every command-line argument to main unchanged.
main "$@"