Recent changes:
- netboot-base.nix with SSH key auth
- Launch scripts for node01/02/03
- Node configuration.nix and disko.nix
- Nix modules for first-boot automation
22 KiB
Command Reference Guide
Document Version: 1.0 Last Updated: 2025-12-10
Table of Contents
- PXE Server Operations
- Image Building
- Node Provisioning
- Cluster Management
- Service Management
- Health Checks
- BMC/IPMI Operations
- Network Diagnostics
- Log Querying
- Backup and Restore
PXE Server Operations
Start/Stop Services
# Start all PXE services
sudo systemctl start dhcpd4.service atftpd.service nginx.service
# Stop all PXE services
sudo systemctl stop dhcpd4.service atftpd.service nginx.service
# Restart all PXE services
sudo systemctl restart dhcpd4.service atftpd.service nginx.service
# Enable services at boot
sudo systemctl enable dhcpd4.service atftpd.service nginx.service
# Check status
sudo systemctl status dhcpd4.service
sudo systemctl status atftpd.service
sudo systemctl status nginx.service
DHCP Server Management
# Test DHCP configuration syntax
sudo dhcpd -t -cf /etc/dhcp/dhcpd.conf
# View DHCP leases
sudo cat /var/lib/dhcp/dhcpd.leases
# Watch DHCP leases in real-time
sudo tail -f /var/lib/dhcp/dhcpd.leases
# View DHCP server logs
sudo journalctl -u dhcpd4.service -f
# Check DHCP server is listening
sudo ss -ulnp | grep :67
# Send DHCP discover (from client)
sudo nmap --script broadcast-dhcp-discover -e eth0
TFTP Server Management
# Test TFTP download locally
tftp localhost -c get undionly.kpxe /tmp/test.kpxe
# Test TFTP from remote host
tftp 10.0.100.10 -c get ipxe.efi /tmp/test.efi
# Check TFTP server is listening
sudo ss -ulnp | grep :69
# View TFTP logs
sudo journalctl -u atftpd.service -f
# Monitor TFTP traffic
sudo tcpdump -i eth0 -n port 69 -vv
# List TFTP root directory
ls -lh /var/lib/tftpboot/
HTTP Server Management
# Test HTTP server
curl http://localhost:8080/health
# Test boot script availability
curl http://localhost:8080/boot/ipxe/boot.ipxe
# Test netboot image availability
curl -I http://localhost:8080/boot/nixos/control-plane/bzImage
# Check nginx configuration syntax
sudo nginx -t
# Reload nginx configuration (without restart)
sudo nginx -s reload
# View nginx access logs
sudo tail -f /var/log/nginx/access.log
# View nginx error logs
sudo tail -f /var/log/nginx/error.log
# Monitor HTTP traffic (options must come before the quoted filter expression)
sudo tcpdump -i eth0 -n -A 'port 80 or port 8080' | grep -E "GET|POST|HTTP"
PXE Boot Debugging
# Monitor all PXE-related traffic
sudo tcpdump -i eth0 -n '(port 67 or port 68 or port 69 or port 80)' -vv
# Watch for DHCP discover packets
sudo tcpdump -i eth0 -n 'udp port 67 or udp port 68' -vv
# Watch for specific MAC address
sudo tcpdump -i eth0 -n 'ether host 52:54:00:12:34:56'
# Check PXE server health
curl http://10.0.100.10:8080/health | jq
# View comprehensive logs (all services)
sudo journalctl -u dhcpd4 -u atftpd -u nginx -f --since "5 minutes ago"
Image Building
Build Netboot Images
# Build all profiles
cd /home/centra/cloud/baremetal/image-builder
./build-images.sh
# Build specific profile
./build-images.sh --profile control-plane
./build-images.sh --profile worker
./build-images.sh --profile all-in-one
# Build and deploy to PXE server
./build-images.sh --deploy
# Build with custom output directory
./build-images.sh --output-dir /srv/pxe/images
# Build with verbose output
./build-images.sh --verbose
Manual Nix Builds
# Build initrd
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.netbootRamdisk
# Build kernel
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.kernel
# Build complete system
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.toplevel
# Check build artifacts
ls -lh result/
# Copy artifacts manually
sudo cp result/bzImage /var/lib/pxe-boot/nixos/control-plane/
sudo cp result/initrd /var/lib/pxe-boot/nixos/control-plane/
Image Verification
# Check image sizes
ls -lh /var/lib/pxe-boot/nixos/*/
# Verify bzImage is a valid kernel
file /var/lib/pxe-boot/nixos/control-plane/bzImage
# Expected: Linux kernel x86 boot executable ...
# Verify initrd is compressed
file /var/lib/pxe-boot/nixos/control-plane/initrd
# Expected: gzip compressed data ...
# Check HTTP accessibility
curl -I http://10.0.100.10:8080/boot/nixos/control-plane/bzImage
# Expected: HTTP/1.1 200 OK
# Calculate checksums
sha256sum /var/lib/pxe-boot/nixos/control-plane/{bzImage,initrd}
Node Provisioning
nixos-anywhere Commands
# Basic provisioning
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
root@10.0.100.50
# Provision with remote build (faster on slow local machine)
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--build-on-remote \
root@10.0.100.50
# Provision with disk encryption
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--disk-encryption-keys /tmp/luks.key <(cat /srv/provisioning/secrets/node01-luks.key) \
root@10.0.100.50
# Debug mode (verbose output, no reboot)
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--debug \
--no-reboot \
root@10.0.100.50
# Use specific SSH key
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--ssh-key ~/.ssh/id_ed25519_provisioning \
root@10.0.100.50
# Use specific Nix binary
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--nix-path /run/current-system/sw/bin/nix \
root@10.0.100.50
Batch Provisioning
# Provision multiple nodes in parallel
# Map each node to its installer IP first — a literal placeholder inside the
# loop would target the same host for every iteration.
declare -A NODE_IPS=( [node01]=10.0.100.50 [node02]=10.0.100.51 [node03]=10.0.100.52 )
for node in node01 node02 node03; do
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#${node} \
--build-on-remote \
root@${NODE_IPS[$node]} &
done
wait
echo "All nodes provisioned"
# Provision with logging
for node in node01 node02 node03; do
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#${node} \
root@${NODE_IPS[$node]} 2>&1 | tee /var/log/provision-${node}.log &
done
wait
SSH to Installer
# SSH to PXE-booted installer
ssh root@10.0.100.50
# Check available disks
ssh root@10.0.100.50 'lsblk'
# Check network configuration
ssh root@10.0.100.50 'ip addr show'
# Check internet connectivity
ssh root@10.0.100.50 'ping -c 3 cache.nixos.org'
# Manual disk wipe (if needed)
ssh root@10.0.100.50 'wipefs -a /dev/sda && sgdisk --zap-all /dev/sda'
# Test disko configuration
ssh root@10.0.100.50 'nix-shell -p disko --run "disko --mode test /tmp/disko.nix"'
Cluster Management
Cluster Member Operations
# List cluster members (Chainfire)
curl -k https://node01.example.com:2379/admin/cluster/members | jq
# List cluster members (FlareDB)
curl -k https://node01.example.com:2479/admin/cluster/members | jq
# Get cluster leader
curl -k https://node01.example.com:2379/admin/cluster/leader | jq
# Get cluster status
curl -k https://node01.example.com:2379/admin/cluster/status | jq
Add Node to Cluster
# Add member to Chainfire cluster
curl -k -X POST https://node01.example.com:2379/admin/member/add \
-H "Content-Type: application/json" \
-d '{
"id": "node04",
"raft_addr": "10.0.200.13:2380"
}'
# Add member to FlareDB cluster
curl -k -X POST https://node01.example.com:2479/admin/member/add \
-H "Content-Type: application/json" \
-d '{
"id": "node04",
"raft_addr": "10.0.200.13:2480"
}'
Remove Node from Cluster
# Remove member from Chainfire cluster
curl -k -X DELETE https://node01.example.com:2379/admin/member/node04
# Remove member from FlareDB cluster
curl -k -X DELETE https://node01.example.com:2479/admin/member/node04
# Verify removal
curl -k https://node01.example.com:2379/admin/cluster/members | jq '.members[] | select(.id=="node04")'
# Expected: empty (no output)
Cluster Health Checks
# Check all nodes health (Chainfire)
for node in node01 node02 node03; do
echo "$node:"
curl -k https://${node}.example.com:2379/health | jq -c
done
# Check cluster has quorum
MEMBER_COUNT=$(curl -sk https://node01.example.com:2379/admin/cluster/members | jq '.members | length')
echo "Cluster has $MEMBER_COUNT members"
if [ "$MEMBER_COUNT" -ge 2 ]; then
echo "Quorum achieved"
else
echo "WARNING: No quorum"
fi
# Check Raft leader exists
LEADER=$(curl -sk https://node01.example.com:2379/admin/cluster/leader | jq -r '.id')
if [ -n "$LEADER" ]; then
echo "Leader: $LEADER"
else
echo "ERROR: No leader elected"
fi
Service Management
Systemd Service Control
# Start service
sudo systemctl start chainfire.service
# Stop service
sudo systemctl stop chainfire.service
# Restart service
sudo systemctl restart chainfire.service
# Reload configuration (without restart)
sudo systemctl reload chainfire.service
# Enable service at boot
sudo systemctl enable chainfire.service
# Disable service at boot
sudo systemctl disable chainfire.service
# Check service status
sudo systemctl status chainfire.service
# View service dependencies
sudo systemctl list-dependencies chainfire.service
Multi-Service Operations
# Start all PlasmaCloud services
sudo systemctl start chainfire.service flaredb.service iam.service \
plasmavmc.service novanet.service flashdns.service
# Stop all PlasmaCloud services
sudo systemctl stop chainfire.service flaredb.service iam.service \
plasmavmc.service novanet.service flashdns.service
# Check status of all services
systemctl status 'chainfire.service' 'flaredb.service' 'iam.service' \
'plasmavmc.service' 'novanet.service' 'flashdns.service' --no-pager
# Restart services in order
sudo systemctl restart chainfire.service && sleep 10
sudo systemctl restart flaredb.service && sleep 10
sudo systemctl restart iam.service
NixOS Configuration Management
# Build new configuration (test)
sudo nixos-rebuild test --flake /srv/provisioning#node01
# Build and apply new configuration
sudo nixos-rebuild switch --flake /srv/provisioning#node01
# Build and set as boot default (no activation)
sudo nixos-rebuild boot --flake /srv/provisioning#node01
# Rollback to previous generation
sudo nixos-rebuild switch --rollback
# List generations
sudo nixos-rebuild list-generations
# Boot into specific generation (next boot only)
sudo nixos-rebuild boot --switch-generation 3
# Delete old generations
sudo nix-collect-garbage --delete-older-than 30d
Health Checks
Service Health Endpoints
# Chainfire health
curl -k https://node01.example.com:2379/health | jq
# FlareDB health
curl -k https://node01.example.com:2479/health | jq
# IAM health
curl -k https://node01.example.com:8080/health | jq
# PlasmaVMC health
curl -k https://node01.example.com:9090/health | jq
# NovaNET health
curl -k https://node01.example.com:9091/health | jq
# FlashDNS health (via HTTP)
curl -k https://node01.example.com:853/health | jq
# FiberLB health
curl -k https://node01.example.com:9092/health | jq
# K8sHost health
curl -k https://node01.example.com:10250/healthz
Comprehensive Health Check Script
#!/bin/bash
# /srv/provisioning/scripts/health-check-all.sh
#
# Probe the /health endpoint of each core service on every cluster node
# and print a per-service healthy/unhealthy line. Exit status is not
# aggregated — this is a human-readable report, not a monitoring check.
set -u

NODES=("node01" "node02" "node03")
# "port:DisplayName" pairs for the services to probe on each node.
SERVICES=("2379:Chainfire" "2479:FlareDB" "8080:IAM" "9090:PlasmaVMC")

for node in "${NODES[@]}"; do
  echo "Checking $node..."
  for service in "${SERVICES[@]}"; do
    port=${service%%:*}   # text before the first ':' (no subshells needed)
    name=${service#*:}    # text after the first ':'
    # -s silent, -k accept self-signed certs, --max-time so one dead node
    # cannot hang the whole sweep; jq errors (non-JSON body) are discarded.
    status=$(curl -sk --max-time 5 "https://${node}.example.com:${port}/health" \
      | jq -r '.status' 2>/dev/null)
    if [ "${status:-}" = "healthy" ]; then
      echo "  ✓ $name: healthy"
    else
      echo "  ✗ $name: unhealthy or unreachable"
    fi
  done
  echo ""
done
System Health Checks
# Check system load
ssh root@node01.example.com 'uptime'
# Check memory usage
ssh root@node01.example.com 'free -h'
# Check disk usage
ssh root@node01.example.com 'df -h'
# Check disk I/O
ssh root@node01.example.com 'iostat -x 1 5'
# Check network bandwidth
ssh root@node01.example.com 'iftop -i eth1 -t -s 5'
# Check process list
ssh root@node01.example.com 'ps aux --sort=-%mem | head -20'
# Check for OOM kills
ssh root@node01.example.com 'dmesg | grep -i "out of memory"'
BMC/IPMI Operations
Power Control
# Power on
ipmitool -I lanplus -H 10.0.10.50 -U admin -P password chassis power on
# Power off (graceful)
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power soft
# Power off (force)
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power off
# Power cycle
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power cycle
# Power status
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power status
Boot Device Control
# Set next boot to PXE
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe
# Set next boot to disk
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev disk
# Set next boot to CDROM
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev cdrom
# Set persistent PXE boot (all future boots)
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe options=persistent
# Clear persistent boot device
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev none
Serial-over-LAN (SOL)
# Activate SOL session
ipmitool -I lanplus -H 10.0.10.50 -U admin sol activate
# Deactivate SOL session (from another terminal)
ipmitool -I lanplus -H 10.0.10.50 -U admin sol deactivate
# Configure SOL settings
ipmitool -I lanplus -H 10.0.10.50 -U admin sol set enabled true 1
ipmitool -I lanplus -H 10.0.10.50 -U admin sol set volatile-bit-rate 115.2 1
# View SOL configuration
ipmitool -I lanplus -H 10.0.10.50 -U admin sol info 1
System Information
# Get sensor readings
ipmitool -I lanplus -H 10.0.10.50 -U admin sdr list
# Get specific sensor
ipmitool -I lanplus -H 10.0.10.50 -U admin sdr get "CPU Temp"
# Get system event log
ipmitool -I lanplus -H 10.0.10.50 -U admin sel list
# Clear system event log
ipmitool -I lanplus -H 10.0.10.50 -U admin sel clear
# Get BMC info
ipmitool -I lanplus -H 10.0.10.50 -U admin bmc info
# Get FRU (Field Replaceable Unit) info
ipmitool -I lanplus -H 10.0.10.50 -U admin fru print
Batch Operations
# Power on all nodes
for ip in 10.0.10.{50..55}; do
echo "Powering on $ip..."
ipmitool -I lanplus -H $ip -U admin -P password chassis power on
done
# Check power status all nodes
for ip in 10.0.10.{50..55}; do
echo -n "$ip: "
ipmitool -I lanplus -H $ip -U admin -P password chassis power status
done
# Set all nodes to PXE boot
for ip in 10.0.10.{50..55}; do
echo "Setting $ip to PXE boot..."
ipmitool -I lanplus -H $ip -U admin -P password chassis bootdev pxe options=persistent
done
Network Diagnostics
Connectivity Tests
# Ping test
ping -c 5 node01.example.com
# TCP port test
nc -zv node01.example.com 2379
# TCP port test with timeout
timeout 5 bash -c '</dev/tcp/node01.example.com/2379' && echo "Port open" || echo "Port closed"
# Multiple port test
for port in 2379 2380 2479 2480 8080; do
nc -zv node01.example.com $port
done
# DNS resolution test
dig node01.example.com
dig -x 10.0.200.10
# Route test
traceroute node01.example.com
# MTU test
ping -M do -s 1472 -c 5 node01.example.com
Bandwidth Tests
# iperf3 server (on target node)
ssh root@node02.example.com 'iperf3 -s -D'
# iperf3 client (from source node)
ssh root@node01.example.com 'iperf3 -c node02.example.com -t 10'
# iperf3 bidirectional test
ssh root@node01.example.com 'iperf3 -c node02.example.com -t 10 --bidir'
# iperf3 UDP test
ssh root@node01.example.com 'iperf3 -c node02.example.com -u -b 10G -t 10'
Packet Capture
# Capture all traffic on interface
ssh root@node01.example.com 'tcpdump -i eth1 -w /tmp/capture.pcap -c 1000'
# Capture specific port
ssh root@node01.example.com 'tcpdump -i eth1 port 2379 -w /tmp/chainfire.pcap'
# Capture and display
ssh root@node01.example.com 'tcpdump -i eth1 port 2379 -A -vv'
# Capture with filters
ssh root@node01.example.com 'tcpdump -i eth1 "tcp port 2379 and host node02.example.com" -vv'
# Download capture file
scp root@node01.example.com:/tmp/capture.pcap ./
Firewall Diagnostics
# List all firewall rules
ssh root@node01.example.com 'iptables -L -n -v'
# List specific chain
ssh root@node01.example.com 'iptables -L INPUT -n -v'
# Check if port is allowed
ssh root@node01.example.com 'iptables -L -n | grep 2379'
# Test firewall rule (temporarily add)
ssh root@node01.example.com 'iptables -I INPUT -p tcp --dport 2379 -j ACCEPT'
# Count packets on specific rule
ssh root@node01.example.com 'iptables -L INPUT -n -v | grep 2379'
Log Querying
Journalctl Commands
# View logs for specific service
sudo journalctl -u chainfire.service
# Follow logs in real-time
sudo journalctl -u chainfire.service -f
# View logs from current boot
sudo journalctl -u chainfire.service -b
# View logs from previous boot
sudo journalctl -u chainfire.service -b -1
# View logs from specific time range
sudo journalctl -u chainfire.service --since "2025-12-10 10:00" --until "2025-12-10 11:00"
# View logs from last hour
sudo journalctl -u chainfire.service --since "1 hour ago"
# View last N lines
sudo journalctl -u chainfire.service -n 100
# View logs in JSON format
sudo journalctl -u chainfire.service -o json
# View logs with no pager
sudo journalctl -u chainfire.service --no-pager
# Search for keyword
sudo journalctl -u chainfire.service | grep "error"
# Show logs from multiple services
sudo journalctl -u chainfire.service -u flaredb.service -f
Log Analysis
# Count error messages
sudo journalctl -u chainfire.service | grep -c "ERROR"
# Extract error messages
sudo journalctl -u chainfire.service | grep "ERROR"
# Group by error type
sudo journalctl -u chainfire.service -o json | \
jq -r 'select(.MESSAGE | contains("ERROR")) | .MESSAGE' | \
sort | uniq -c | sort -rn
# Find slow operations
sudo journalctl -u chainfire.service | grep "slow operation"
# Check for restarts
sudo journalctl -u chainfire.service | grep "Started\|Stopped"
# Extract structured logs
sudo journalctl -u chainfire.service -o json | \
jq 'select(.level == "ERROR") | {time: .__REALTIME_TIMESTAMP, message: .MESSAGE}'
Remote Log Querying
# Query logs on remote node
ssh root@node01.example.com 'journalctl -u chainfire.service -n 100'
# Follow remote logs
ssh root@node01.example.com 'journalctl -u chainfire.service -f'
# Query logs from all nodes
for node in node01 node02 node03; do
echo "=== $node ==="
ssh root@${node}.example.com 'journalctl -u chainfire.service -n 10'
done
Backup and Restore
Backup Commands
# Backup Chainfire data
ssh root@node01.example.com 'tar -czf - /var/lib/chainfire' > chainfire-backup-$(date +%Y%m%d).tar.gz
# Backup FlareDB data
ssh root@node01.example.com 'tar -czf - /var/lib/flaredb' > flaredb-backup-$(date +%Y%m%d).tar.gz
# Backup configuration files
tar -czf provisioning-config-$(date +%Y%m%d).tar.gz /srv/provisioning/nodes/
# Backup TLS certificates
tar -czf tls-certs-$(date +%Y%m%d).tar.gz /srv/provisioning/secrets/*.pem
Automated Backup Script
#!/bin/bash
# /srv/provisioning/scripts/backup-cluster.sh
#
# Stream the Chainfire and FlareDB data directories from every cluster
# node into a timestamped backup directory, copy the node configurations,
# and write a manifest describing the backup.
#
# Aborts on the first failed transfer (set -e + pipefail) so a truncated
# archive is never silently reported as a complete backup.
set -euo pipefail

NODES=(node01 node02 node03)
BACKUP_DIR="/backup/cluster-$(date +%Y%m%d-%H%M%S)"

mkdir -p "$BACKUP_DIR"

# Backup cluster data from all nodes. tar writes to stdout on the remote
# side; the archive lands directly in $BACKUP_DIR without a remote temp file.
for node in "${NODES[@]}"; do
  echo "Backing up $node..."
  ssh "root@${node}.example.com" 'tar -czf - /var/lib/chainfire' \
    > "$BACKUP_DIR/chainfire-${node}.tar.gz"
  ssh "root@${node}.example.com" 'tar -czf - /var/lib/flaredb' \
    > "$BACKUP_DIR/flaredb-${node}.tar.gz"
done

# Backup configurations
cp -r /srv/provisioning/nodes "$BACKUP_DIR/configs"

# Create manifest
cat > "$BACKUP_DIR/manifest.txt" <<EOF
Backup Date: $(date)
Nodes: ${NODES[*]}
Contents:
- Chainfire data (all nodes)
- FlareDB data (all nodes)
- Node configurations
EOF

echo "Backup complete: $BACKUP_DIR"
Restore Commands
# Stop services before restore
ssh root@node01.example.com 'systemctl stop chainfire.service flaredb.service'
# Restore Chainfire data
cat chainfire-backup-20251210.tar.gz | \
ssh root@node01.example.com 'cd / && tar -xzf -'
# Restore FlareDB data
cat flaredb-backup-20251210.tar.gz | \
ssh root@node01.example.com 'cd / && tar -xzf -'
# Fix permissions
ssh root@node01.example.com 'chown -R chainfire:chainfire /var/lib/chainfire'
ssh root@node01.example.com 'chown -R flaredb:flaredb /var/lib/flaredb'
# Start services
ssh root@node01.example.com 'systemctl start chainfire.service flaredb.service'
# Verify restore
ssh root@node01.example.com 'systemctl status chainfire.service flaredb.service'
curl -k https://node01.example.com:2379/health | jq
Snapshot Management
# Create Chainfire snapshot
curl -k -X POST https://node01.example.com:2379/admin/snapshot/create \
-H "Content-Type: application/json" \
-d '{"name":"manual-snapshot-'$(date +%Y%m%d-%H%M%S)'"}'
# List snapshots
curl -k https://node01.example.com:2379/admin/snapshot/list | jq
# Restore from snapshot
curl -k -X POST https://node01.example.com:2379/admin/snapshot/restore \
-H "Content-Type: application/json" \
-d '{"name":"manual-snapshot-20251210-120000"}'
# Delete old snapshots
curl -k -X DELETE https://node01.example.com:2379/admin/snapshot/manual-snapshot-20251201-120000
Quick Reference Cheat Sheet
Most Common Commands
# PXE Server
sudo systemctl restart dhcpd4 atftpd nginx
curl http://localhost:8080/health
# Build Images
cd /home/centra/cloud/baremetal/image-builder && ./build-images.sh
# Provision Node
nix run github:nix-community/nixos-anywhere -- --flake /srv/provisioning#node01 root@10.0.100.50
# Cluster Status
curl -k https://node01.example.com:2379/admin/cluster/members | jq
# Service Status
sudo systemctl status chainfire.service
sudo journalctl -u chainfire.service -f
# Health Check
curl -k https://node01.example.com:2379/health | jq
# Power Control
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power on
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe
# NixOS Update
sudo nixos-rebuild switch --flake /srv/provisioning#node01
# Logs
sudo journalctl -u chainfire.service -f
Document End