photoncloud-monorepo/nix/test-cluster/work-root-budget.sh

238 lines
7.7 KiB
Bash
Executable file

#!/usr/bin/env bash
# Reports disk usage of the work root against soft budgets and offers cleanup
# helpers (subcommands: status, enforce, cleanup-advice, prune-proof-logs).
#
# Environment:
#   ULTRACLOUD_WORK_ROOT            overrides the work root location
#   PHOTON_CLUSTER_WORK_ROOT        overrides the test-cluster work root
#   ULTRACLOUD_WORK_ROOT_PRUNE_ACK  must be YES before prune-proof-logs deletes

# Strict mode: abort on unhandled errors, unset variables, and pipeline failures.
set -euo pipefail
# Absolute directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Repository root, resolved as two directory levels above the script directory.
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
# Work root: ULTRACLOUD_WORK_ROOT when set and non-empty, else <repo>/work.
WORK_ROOT="${ULTRACLOUD_WORK_ROOT:-${REPO_ROOT}/work}"
# Cluster work root: keeps a caller-provided value, else <work root>/test-cluster.
PHOTON_CLUSTER_WORK_ROOT="${PHOTON_CLUSTER_WORK_ROOT:-${WORK_ROOT}/test-cluster}"
#######################################
# Print the command synopsis, one supported invocation per line.
# Outputs: usage text on stdout (callers redirect to stderr when needed)
#######################################
usage() {
  local -r self='./nix/test-cluster/work-root-budget.sh'
  printf '%s\n' \
    'Usage:' \
    "${self} status" \
    "${self} enforce" \
    "${self} cleanup-advice" \
    "${self} prune-proof-logs [keep-count] [--apply]"
}
#######################################
# Print the apparent size in bytes of a file or directory tree.
# Arguments: $1 - path to measure
# Outputs:   byte count on stdout; 0 when the path does not exist
# Returns:   status of the du | awk pipeline, or 0 for a missing path
#######################################
size_bytes() {
  local path="$1"
  if [[ ! -e "${path}" ]]; then
    printf '0\n'
    return 0
  fi
  # "--" ends option parsing so a path that starts with "-" is measured as a
  # path instead of being swallowed as a du option (which would silently
  # report the size of the current directory).
  du -sb -- "${path}" | awk '{print $1}'
}
#######################################
# Format a byte count using IEC (power-of-1024) units with a "B" suffix.
# Arguments: $1 - size in bytes
# Outputs:   formatted size on stdout
#######################################
human_size() {
  local byte_count="$1"
  numfmt --suffix=B --to=iec "${byte_count}"
}
#######################################
# List the proof-log root directories that live under the work root.
# Globals:   WORK_ROOT (read)
# Outputs:   one absolute path per line on stdout
#######################################
proof_roots() {
  local subdir
  for subdir in \
    publishable-kvm-suite \
    final-proofs \
    durability-proof \
    rollout-soak \
    provider-vm-reality-proof \
    baremetal-iso-e2e \
    core-control-plane-ops-proof \
    hardware-smoke; do
    printf '%s\n' "${WORK_ROOT}/${subdir}"
  done
}
#######################################
# Print one status row: label, human-readable size, path, and budget note.
# Arguments: $1 - row label
#            $2 - path to measure
#            $3 - soft budget in bytes; 0 means no budget note is printed
# Outputs:   a single line on stdout
#######################################
report_path() {
  local label="$1" path="$2" budget_bytes="$3"
  local used used_label budget_note=''

  used="$(size_bytes "${path}")"
  used_label="$(human_size "${used}")"

  # The bracketed note only appears for rows that carry a budget.
  if (( budget_bytes > 0 )); then
    budget_note=" [soft budget $(human_size "${budget_bytes}")"
    if (( used > budget_bytes )); then
      budget_note+=', over budget'
    fi
    budget_note+=']'
  fi

  printf '%-28s %10s %s%s\n' "${label}" "${used_label}" "${path}" "${budget_note}"
}
#######################################
# Print the full disk-usage report: header, one row per tracked area, and
# the transient and proof-log totals next to their soft budgets.
# Globals:   REPO_ROOT, WORK_ROOT, PHOTON_CLUSTER_WORK_ROOT (read)
# Outputs:   report on stdout
#######################################
status() {
  local -r gib=$((1024 * 1024 * 1024))
  local -r work_budget=$((60 * gib))
  local -r state_budget=$((35 * gib))
  local -r transient_budget=$((10 * gib))
  local -r proof_budget=$((20 * gib))

  # Sub-directories of the work root that feed the two aggregate totals.
  local -a transient_dirs=(tmp publishable-kvm-runtime)
  local -a proof_dirs=(
    publishable-kvm-suite
    final-proofs
    durability-proof
    rollout-soak
    provider-vm-reality-proof
    baremetal-iso-e2e
    core-control-plane-ops-proof
    hardware-smoke
  )
  # "label|sub-directory" rows that are reported without an individual budget.
  local -a unbudgeted_rows=(
    'tmp|tmp'
    'publishable runtime|publishable-kvm-runtime'
    'publishable logs|publishable-kvm-suite'
    'final proofs|final-proofs'
    'durability proof|durability-proof'
    'rollout soak|rollout-soak'
    'provider or vm proof|provider-vm-reality-proof'
    'baremetal exact proof|baremetal-iso-e2e'
    'control-plane proof|core-control-plane-ops-proof'
    'hardware smoke|hardware-smoke'
  )

  printf '%s\n' \
    "UltraCloud work-root disk budget status" \
    "repo_root=${REPO_ROOT}" \
    "work_root=${WORK_ROOT}" \
    "photon_cluster_work_root=${PHOTON_CLUSTER_WORK_ROOT}" \
    ""

  report_path "work root" "${WORK_ROOT}" "${work_budget}"
  report_path "cluster state" "${PHOTON_CLUSTER_WORK_ROOT}/state" "${state_budget}"
  local row
  for row in "${unbudgeted_rows[@]}"; do
    report_path "${row%%|*}" "${WORK_ROOT}/${row#*|}" 0
  done
  printf '\n'

  local dir transient_total=0 proof_total=0
  for dir in "${transient_dirs[@]}"; do
    transient_total=$(( transient_total + $(size_bytes "${WORK_ROOT}/${dir}") ))
  done
  for dir in "${proof_dirs[@]}"; do
    proof_total=$(( proof_total + $(size_bytes "${WORK_ROOT}/${dir}") ))
  done

  printf 'transient total: %s (soft budget %s)\n' \
    "$(human_size "${transient_total}")" "$(human_size "${transient_budget}")"
  printf 'proof logs total: %s (soft budget %s)\n' \
    "$(human_size "${proof_total}")" "$(human_size "${proof_budget}")"
}
#######################################
# Count how many of the four tracked areas (work root, cluster state,
# transient runtime data, proof logs) exceed their soft budget.
# Globals:   WORK_ROOT, PHOTON_CLUSTER_WORK_ROOT (read)
# Outputs:   the number of overages (0-4) on stdout
#######################################
budget_overages() {
  local -r gib=$((1024 * 1024 * 1024))
  local -r work_budget=$((60 * gib))
  local -r state_budget=$((35 * gib))
  local -r transient_budget=$((10 * gib))
  local -r proof_budget=$((20 * gib))

  local work_size state_size
  work_size="$(size_bytes "${WORK_ROOT}")"
  state_size="$(size_bytes "${PHOTON_CLUSTER_WORK_ROOT}/state")"

  local name transient_size=0 proof_size=0
  for name in tmp publishable-kvm-runtime; do
    transient_size=$(( transient_size + $(size_bytes "${WORK_ROOT}/${name}") ))
  done
  for name in \
    publishable-kvm-suite \
    final-proofs \
    durability-proof \
    rollout-soak \
    provider-vm-reality-proof \
    baremetal-iso-e2e \
    core-control-plane-ops-proof \
    hardware-smoke; do
    proof_size=$(( proof_size + $(size_bytes "${WORK_ROOT}/${name}") ))
  done

  local overages=0
  if (( work_size > work_budget )); then
    overages=$(( overages + 1 ))
  fi
  if (( state_size > state_budget )); then
    overages=$(( overages + 1 ))
  fi
  if (( transient_size > transient_budget )); then
    overages=$(( overages + 1 ))
  fi
  if (( proof_size > proof_budget )); then
    overages=$(( overages + 1 ))
  fi
  printf '%s\n' "${overages}"
}
#######################################
# Print the status report and fail when any tracked area is over budget.
# Outputs:   status report plus a pass message, or a failure message with
#            cleanup guidance, all on stdout
# Returns:   0 when everything is within budget, 1 otherwise
#######################################
enforce() {
  local over_count
  # Measured before the report is printed, as the report itself is output only.
  over_count="$(budget_overages)"

  status
  printf '\n'

  if (( over_count <= 0 )); then
    printf '%s\n' "Budget enforcement passed: all tracked work-root areas are within the configured soft budgets."
    return 0
  fi

  printf '%s\n' \
    "Budget enforcement failed: one or more tracked work-root areas are over the configured soft budget." \
    "Use cleanup-advice for the safe runtime cleanup sequence, or use prune-proof-logs for dated proof roots." \
    ""
  cleanup_advice
  printf '%s\n' \
    "" \
    "Safer dated-proof cleanup dry-run:" \
    " ./nix/test-cluster/work-root-budget.sh prune-proof-logs 2"
  return 1
}
#######################################
# Print the manual cleanup sequence for this checkout. Nothing is deleted;
# the commands are only shown so the operator can run them deliberately.
# Globals:   WORK_ROOT (read)
# Outputs:   numbered advice on stdout
#######################################
cleanup_advice() {
  # Shell-escape the work root so the printed rm commands stay safe to
  # copy-paste when the path contains spaces or other special characters.
  # Ordinary paths are printed unchanged.
  local root
  printf -v root '%q' "${WORK_ROOT}"
  # Step 3 lists every proof root, including baremetal-iso-e2e, so the advice
  # covers the same directories that status and prune-proof-logs track.
  cat <<EOF
Safe cleanup sequence for the current checkout:
1. Stop running lab VMs:
nix run ./nix/test-cluster#cluster -- stop
2. Remove disposable VM runtime state:
nix run ./nix/test-cluster#cluster -- clean
rm -rf ${root}/tmp ${root}/publishable-kvm-runtime
3. Trim old proof roots you no longer need:
rm -rf ${root}/publishable-kvm-suite/<old-run>
rm -rf ${root}/final-proofs/<old-run>
rm -rf ${root}/durability-proof/<old-run>
rm -rf ${root}/rollout-soak/<old-run>
rm -rf ${root}/provider-vm-reality-proof/<old-run>
rm -rf ${root}/baremetal-iso-e2e/<old-run>
rm -rf ${root}/core-control-plane-ops-proof/<old-run>
rm -rf ${root}/hardware-smoke/<old-run>
4. Run a Nix store GC after old result symlinks are gone:
nix store gc
EOF
}
#######################################
# Delete, or in dry-run mode list, the oldest dated sub-directories of every
# proof root, keeping the newest <keep> entries by reverse name order.
# Globals:   ULTRACLOUD_WORK_ROOT_PRUNE_ACK (read; must be YES to delete)
# Arguments: $1 - number of entries to keep per root (default 2)
#            $2 - "--apply" to delete; empty for a dry-run
# Outputs:   per-root listing on stdout; validation errors on stderr
# Returns:   exits 1 on an invalid keep-count, an unknown flag, or a missing
#            acknowledgement
#######################################
prune_proof_logs() {
  local keep="${1:-2}"
  local apply="${2:-}"
  local mode="dry-run"

  if ! [[ "${keep}" =~ ^[0-9]+$ ]]; then
    echo "keep-count must be a non-negative integer" >&2
    exit 1
  fi
  # Force base 10. Bash arithmetic reads a leading zero as octal, so "08" or
  # "09" would raise an arithmetic error and "010" would silently mean 8.
  keep=$((10#${keep}))

  if [[ "${apply}" == "--apply" ]]; then
    mode="apply"
    if [[ "${ULTRACLOUD_WORK_ROOT_PRUNE_ACK:-}" != "YES" ]]; then
      echo "Refusing to delete proof logs without ULTRACLOUD_WORK_ROOT_PRUNE_ACK=YES" >&2
      exit 1
    fi
  elif [[ -n "${apply}" ]]; then
    echo "unknown prune-proof-logs flag: ${apply}" >&2
    exit 1
  fi

  local root idx candidate
  local -a dated_dirs
  while IFS= read -r root; do
    [[ -d "${root}" ]] || continue
    dated_dirs=()
    # Direct child directories only, newest name first.
    mapfile -t dated_dirs < <(find "${root}" -mindepth 1 -maxdepth 1 -type d -printf '%P\n' | sort -r)
    if (( ${#dated_dirs[@]} <= keep )); then
      continue
    fi
    echo "${root}:"
    # Everything after the first <keep> entries is a pruning candidate.
    for (( idx = keep; idx < ${#dated_dirs[@]}; idx += 1 )); do
      candidate="${root}/${dated_dirs[$idx]}"
      if [[ "${mode}" == "apply" ]]; then
        rm -rf -- "${candidate}"
        echo " deleted ${candidate}"
      else
        echo " would delete ${candidate}"
      fi
    done
  done < <(proof_roots)

  if [[ "${mode}" == "dry-run" ]]; then
    echo
    echo "Dry-run only. Re-run with:"
    echo " ULTRACLOUD_WORK_ROOT_PRUNE_ACK=YES ./nix/test-cluster/work-root-budget.sh prune-proof-logs ${keep} --apply"
  fi
}
#######################################
# Dispatch the requested subcommand; with no arguments, run "status".
# Arguments: $1 - subcommand (status, enforce, cleanup-advice,
#                 prune-proof-logs, or -h/--help/help)
#            $2, $3 - prune-proof-logs options: [keep-count] [--apply]
# Outputs:   subcommand output on stdout; usage on stderr for an unknown
#            subcommand
# Returns:   exits 1 for an unknown subcommand
#######################################
main() {
  local cmd="${1:-status}"
  case "${cmd}" in
    status)
      status
      ;;
    enforce)
      enforce
      ;;
    cleanup-advice)
      cleanup_advice
      ;;
    prune-proof-logs)
      local keep="${2:-2}"
      local flag="${3:-}"
      # Usage shows keep-count and --apply as independently optional, so
      # accept "--apply" in the keep-count position instead of rejecting it
      # as a malformed number. Any keep-count that follows it is still used.
      if [[ "${keep}" == "--apply" ]]; then
        flag="--apply"
        keep="${3:-2}"
      fi
      prune_proof_logs "${keep}" "${flag}"
      ;;
    -h|--help|help)
      usage
      ;;
    *)
      usage >&2
      exit 1
      ;;
  esac
}
# Script entry point: forward every command-line argument to main unchanged.
main "$@"