Recent changes:
- netboot-base.nix with SSH key auth
- Launch scripts for node01/02/03
- Node configuration.nix and disko.nix
- Nix modules for first-boot automation
22 KiB
Command Reference Guide
Document Version: 1.0 Last Updated: 2025-12-10
Table of Contents
- PXE Server Operations
- Image Building
- Node Provisioning
- Cluster Management
- Service Management
- Health Checks
- BMC/IPMI Operations
- Network Diagnostics
- Log Querying
- Backup and Restore
PXE Server Operations
Start/Stop Services
# Start all PXE services
sudo systemctl start dhcpd4.service atftpd.service nginx.service
# Stop all PXE services
sudo systemctl stop dhcpd4.service atftpd.service nginx.service
# Restart all PXE services
sudo systemctl restart dhcpd4.service atftpd.service nginx.service
# Enable services at boot
sudo systemctl enable dhcpd4.service atftpd.service nginx.service
# Check status
sudo systemctl status dhcpd4.service
sudo systemctl status atftpd.service
sudo systemctl status nginx.service
DHCP Server Management
# Test DHCP configuration syntax
sudo dhcpd -t -cf /etc/dhcp/dhcpd.conf
# View DHCP leases
sudo cat /var/lib/dhcp/dhcpd.leases
# Watch DHCP leases in real-time
sudo tail -f /var/lib/dhcp/dhcpd.leases
# View DHCP server logs
sudo journalctl -u dhcpd4.service -f
# Check DHCP server is listening
sudo ss -ulnp | grep :67
# Send DHCP discover (from client)
sudo nmap --script broadcast-dhcp-discover -e eth0
TFTP Server Management
# Test TFTP download locally
tftp localhost -c get undionly.kpxe /tmp/test.kpxe
# Test TFTP from remote host
tftp 10.0.100.10 -c get ipxe.efi /tmp/test.efi
# Check TFTP server is listening
sudo ss -ulnp | grep :69
# View TFTP logs
sudo journalctl -u atftpd.service -f
# Monitor TFTP traffic
sudo tcpdump -i eth0 -n port 69 -vv
# List TFTP root directory
ls -lh /var/lib/tftpboot/
HTTP Server Management
# Test HTTP server
curl http://localhost:8080/health
# Test boot script availability
curl http://localhost:8080/boot/ipxe/boot.ipxe
# Test netboot image availability
curl -I http://localhost:8080/boot/nixos/control-plane/bzImage
# Check nginx configuration syntax
sudo nginx -t
# Reload nginx configuration (without restart)
sudo nginx -s reload
# View nginx access logs
sudo tail -f /var/log/nginx/access.log
# View nginx error logs
sudo tail -f /var/log/nginx/error.log
# Monitor HTTP traffic (options must come before the quoted filter expression)
sudo tcpdump -i eth0 -n -A 'port 80 or port 8080' | grep -E "GET|POST|HTTP"
PXE Boot Debugging
# Monitor all PXE-related traffic
sudo tcpdump -i eth0 -n '(port 67 or port 68 or port 69 or port 80)' -vv
# Watch for DHCP discover packets
sudo tcpdump -i eth0 -n 'udp port 67 or udp port 68' -vv
# Watch for specific MAC address
sudo tcpdump -i eth0 -n 'ether host 52:54:00:12:34:56'
# Check PXE server health
curl http://10.0.100.10:8080/health | jq
# View comprehensive logs (all services)
sudo journalctl -u dhcpd4 -u atftpd -u nginx -f --since "5 minutes ago"
Image Building
Build Netboot Images
# Build all profiles
cd /home/centra/cloud/baremetal/image-builder
./build-images.sh
# Build specific profile
./build-images.sh --profile control-plane
./build-images.sh --profile worker
./build-images.sh --profile all-in-one
# Build and deploy to PXE server
./build-images.sh --deploy
# Build with custom output directory
./build-images.sh --output-dir /srv/pxe/images
# Build with verbose output
./build-images.sh --verbose
Manual Nix Builds
# Build initrd
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.netbootRamdisk
# Build kernel
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.kernel
# Build complete system
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.toplevel
# Check build artifacts
ls -lh result/
# Copy artifacts manually
sudo cp result/bzImage /var/lib/pxe-boot/nixos/control-plane/
sudo cp result/initrd /var/lib/pxe-boot/nixos/control-plane/
Image Verification
# Check image sizes
ls -lh /var/lib/pxe-boot/nixos/*/
# Verify bzImage is a valid kernel
file /var/lib/pxe-boot/nixos/control-plane/bzImage
# Expected: Linux kernel x86 boot executable ...
# Verify initrd is compressed
file /var/lib/pxe-boot/nixos/control-plane/initrd
# Expected: gzip compressed data ...
# Check HTTP accessibility
curl -I http://10.0.100.10:8080/boot/nixos/control-plane/bzImage
# Expected: HTTP/1.1 200 OK
# Calculate checksums
sha256sum /var/lib/pxe-boot/nixos/control-plane/{bzImage,initrd}
Node Provisioning
nixos-anywhere Commands
# Basic provisioning
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
root@10.0.100.50
# Provision with remote build (faster on slow local machine)
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--build-on-remote \
root@10.0.100.50
# Provision with disk encryption
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--disk-encryption-keys /tmp/luks.key <(cat /srv/provisioning/secrets/node01-luks.key) \
root@10.0.100.50
# Debug mode (verbose output, no reboot)
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--debug \
--no-reboot \
root@10.0.100.50
# Use specific SSH key
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--ssh-key ~/.ssh/id_ed25519_provisioning \
root@10.0.100.50
# Use specific Nix binary
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#node01 \
--nix-path /run/current-system/sw/bin/nix \
root@10.0.100.50
Batch Provisioning
# Provision multiple nodes in parallel
# Map each node to its installer IP first — a literal placeholder inside the
# loop would target the same host for every iteration.
declare -A NODE_IPS=( [node01]=10.0.100.50 [node02]=10.0.100.51 [node03]=10.0.100.52 )
for node in node01 node02 node03; do
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#${node} \
--build-on-remote \
root@${NODE_IPS[$node]} &
done
wait
echo "All nodes provisioned"
# Provision with logging
for node in node01 node02 node03; do
nix run github:nix-community/nixos-anywhere -- \
--flake /srv/provisioning#${node} \
root@${NODE_IPS[$node]} 2>&1 | tee /var/log/provision-${node}.log &
done
wait
SSH to Installer
# SSH to PXE-booted installer
ssh root@10.0.100.50
# Check available disks
ssh root@10.0.100.50 'lsblk'
# Check network configuration
ssh root@10.0.100.50 'ip addr show'
# Check internet connectivity
ssh root@10.0.100.50 'ping -c 3 cache.nixos.org'
# Manual disk wipe (if needed)
ssh root@10.0.100.50 'wipefs -a /dev/sda && sgdisk --zap-all /dev/sda'
# Test disko configuration
ssh root@10.0.100.50 'nix-shell -p disko --run "disko --mode test /tmp/disko.nix"'
Cluster Management
Cluster Member Operations
# List cluster members (Chainfire)
curl -k https://node01.example.com:2379/admin/cluster/members | jq
# List cluster members (FlareDB)
curl -k https://node01.example.com:2479/admin/cluster/members | jq
# Get cluster leader
curl -k https://node01.example.com:2379/admin/cluster/leader | jq
# Get cluster status
curl -k https://node01.example.com:2379/admin/cluster/status | jq
Add Node to Cluster
# Add member to Chainfire cluster
curl -k -X POST https://node01.example.com:2379/admin/member/add \
-H "Content-Type: application/json" \
-d '{
"id": "node04",
"raft_addr": "10.0.200.13:2380"
}'
# Add member to FlareDB cluster
curl -k -X POST https://node01.example.com:2479/admin/member/add \
-H "Content-Type: application/json" \
-d '{
"id": "node04",
"raft_addr": "10.0.200.13:2480"
}'
Remove Node from Cluster
# Remove member from Chainfire cluster
curl -k -X DELETE https://node01.example.com:2379/admin/member/node04
# Remove member from FlareDB cluster
curl -k -X DELETE https://node01.example.com:2479/admin/member/node04
# Verify removal
curl -k https://node01.example.com:2379/admin/cluster/members | jq '.members[] | select(.id=="node04")'
# Expected: empty (no output)
Cluster Health Checks
# Check all nodes health (Chainfire)
for node in node01 node02 node03; do
echo "$node:"
curl -k https://${node}.example.com:2379/health | jq -c
done
# Check cluster has quorum
MEMBER_COUNT=$(curl -sk https://node01.example.com:2379/admin/cluster/members | jq '.members | length')
echo "Cluster has $MEMBER_COUNT members"
if [ "$MEMBER_COUNT" -ge 2 ]; then
echo "Quorum achieved"
else
echo "WARNING: No quorum"
fi
# Check Raft leader exists
LEADER=$(curl -sk https://node01.example.com:2379/admin/cluster/leader | jq -r '.id')
if [ -n "$LEADER" ]; then
echo "Leader: $LEADER"
else
echo "ERROR: No leader elected"
fi
Service Management
Systemd Service Control
# Start service
sudo systemctl start chainfire.service
# Stop service
sudo systemctl stop chainfire.service
# Restart service
sudo systemctl restart chainfire.service
# Reload configuration (without restart)
sudo systemctl reload chainfire.service
# Enable service at boot
sudo systemctl enable chainfire.service
# Disable service at boot
sudo systemctl disable chainfire.service
# Check service status
sudo systemctl status chainfire.service
# View service dependencies
sudo systemctl list-dependencies chainfire.service
Multi-Service Operations
# Start all PlasmaCloud services
sudo systemctl start chainfire.service flaredb.service iam.service \
plasmavmc.service novanet.service flashdns.service
# Stop all PlasmaCloud services
sudo systemctl stop chainfire.service flaredb.service iam.service \
plasmavmc.service novanet.service flashdns.service
# Check status of all services
systemctl status 'chainfire.service' 'flaredb.service' 'iam.service' \
'plasmavmc.service' 'novanet.service' 'flashdns.service' --no-pager
# Restart services in order
sudo systemctl restart chainfire.service && sleep 10
sudo systemctl restart flaredb.service && sleep 10
sudo systemctl restart iam.service
NixOS Configuration Management
# Build new configuration (test)
sudo nixos-rebuild test --flake /srv/provisioning#node01
# Build and apply new configuration
sudo nixos-rebuild switch --flake /srv/provisioning#node01
# Build and set as boot default (no activation)
sudo nixos-rebuild boot --flake /srv/provisioning#node01
# Rollback to previous generation
sudo nixos-rebuild switch --rollback
# List generations
sudo nixos-rebuild list-generations
# Boot into specific generation (next boot only)
sudo nixos-rebuild boot --switch-generation 3
# Delete old generations
sudo nix-collect-garbage --delete-older-than 30d
Health Checks
Service Health Endpoints
# Chainfire health
curl -k https://node01.example.com:2379/health | jq
# FlareDB health
curl -k https://node01.example.com:2479/health | jq
# IAM health
curl -k https://node01.example.com:8080/health | jq
# PlasmaVMC health
curl -k https://node01.example.com:9090/health | jq
# NovaNET health
curl -k https://node01.example.com:9091/health | jq
# FlashDNS health (via HTTP)
curl -k https://node01.example.com:853/health | jq
# FiberLB health
curl -k https://node01.example.com:9092/health | jq
# K8sHost health
curl -k https://node01.example.com:10250/healthz
Comprehensive Health Check Script
#!/bin/bash
# /srv/provisioning/scripts/health-check-all.sh
#
# Probe the /health endpoint of each core service on every cluster node
# and print a per-service healthy/unhealthy line. Exit status is not
# aggregated — this is a human-readable report, not a monitoring check.
set -u

NODES=("node01" "node02" "node03")
# "port:DisplayName" pairs for the services to probe on each node.
SERVICES=("2379:Chainfire" "2479:FlareDB" "8080:IAM" "9090:PlasmaVMC")

for node in "${NODES[@]}"; do
  echo "Checking $node..."
  for service in "${SERVICES[@]}"; do
    port=${service%%:*}   # text before the first ':' (no subshells needed)
    name=${service#*:}    # text after the first ':'
    # -s silent, -k accept self-signed certs, --max-time so one dead node
    # cannot hang the whole sweep; jq errors (non-JSON body) are discarded.
    status=$(curl -sk --max-time 5 "https://${node}.example.com:${port}/health" \
      | jq -r '.status' 2>/dev/null)
    if [ "${status:-}" = "healthy" ]; then
      echo "  ✓ $name: healthy"
    else
      echo "  ✗ $name: unhealthy or unreachable"
    fi
  done
  echo ""
done
System Health Checks
# Check system load
ssh root@node01.example.com 'uptime'
# Check memory usage
ssh root@node01.example.com 'free -h'
# Check disk usage
ssh root@node01.example.com 'df -h'
# Check disk I/O
ssh root@node01.example.com 'iostat -x 1 5'
# Check network bandwidth
ssh root@node01.example.com 'iftop -i eth1 -t -s 5'
# Check process list
ssh root@node01.example.com 'ps aux --sort=-%mem | head -20'
# Check for OOM kills
ssh root@node01.example.com 'dmesg | grep -i "out of memory"'
BMC/IPMI Operations
Power Control
# Power on
ipmitool -I lanplus -H 10.0.10.50 -U admin -P password chassis power on
# Power off (graceful)
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power soft
# Power off (force)
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power off
# Power cycle
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power cycle
# Power status
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power status
Boot Device Control
# Set next boot to PXE
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe
# Set next boot to disk
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev disk
# Set next boot to CDROM
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev cdrom
# Set persistent PXE boot (all future boots)
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe options=persistent
# Clear persistent boot device
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev none
Serial-over-LAN (SOL)
# Activate SOL session
ipmitool -I lanplus -H 10.0.10.50 -U admin sol activate
# Deactivate SOL session (from another terminal)
ipmitool -I lanplus -H 10.0.10.50 -U admin sol deactivate
# Configure SOL settings
ipmitool -I lanplus -H 10.0.10.50 -U admin sol set enabled true 1
ipmitool -I lanplus -H 10.0.10.50 -U admin sol set volatile-bit-rate 115.2 1
# View SOL configuration
ipmitool -I lanplus -H 10.0.10.50 -U admin sol info 1
System Information
# Get sensor readings
ipmitool -I lanplus -H 10.0.10.50 -U admin sdr list
# Get specific sensor
ipmitool -I lanplus -H 10.0.10.50 -U admin sdr get "CPU Temp"
# Get system event log
ipmitool -I lanplus -H 10.0.10.50 -U admin sel list
# Clear system event log
ipmitool -I lanplus -H 10.0.10.50 -U admin sel clear
# Get BMC info
ipmitool -I lanplus -H 10.0.10.50 -U admin bmc info
# Get FRU (Field Replaceable Unit) info
ipmitool -I lanplus -H 10.0.10.50 -U admin fru print
Batch Operations
# Power on all nodes
for ip in 10.0.10.{50..55}; do
echo "Powering on $ip..."
ipmitool -I lanplus -H $ip -U admin -P password chassis power on
done
# Check power status all nodes
for ip in 10.0.10.{50..55}; do
echo -n "$ip: "
ipmitool -I lanplus -H $ip -U admin -P password chassis power status
done
# Set all nodes to PXE boot
for ip in 10.0.10.{50..55}; do
echo "Setting $ip to PXE boot..."
ipmitool -I lanplus -H $ip -U admin -P password chassis bootdev pxe options=persistent
done
Network Diagnostics
Connectivity Tests
# Ping test
ping -c 5 node01.example.com
# TCP port test
nc -zv node01.example.com 2379
# TCP port test with timeout
timeout 5 bash -c '</dev/tcp/node01.example.com/2379' && echo "Port open" || echo "Port closed"
# Multiple port test
for port in 2379 2380 2479 2480 8080; do
nc -zv node01.example.com $port
done
# DNS resolution test
dig node01.example.com
dig -x 10.0.200.10
# Route test
traceroute node01.example.com
# MTU test
ping -M do -s 1472 -c 5 node01.example.com
Bandwidth Tests
# iperf3 server (on target node)
ssh root@node02.example.com 'iperf3 -s -D'
# iperf3 client (from source node)
ssh root@node01.example.com 'iperf3 -c node02.example.com -t 10'
# iperf3 bidirectional test
ssh root@node01.example.com 'iperf3 -c node02.example.com -t 10 --bidir'
# iperf3 UDP test
ssh root@node01.example.com 'iperf3 -c node02.example.com -u -b 10G -t 10'
Packet Capture
# Capture all traffic on interface
ssh root@node01.example.com 'tcpdump -i eth1 -w /tmp/capture.pcap -c 1000'
# Capture specific port
ssh root@node01.example.com 'tcpdump -i eth1 port 2379 -w /tmp/chainfire.pcap'
# Capture and display
ssh root@node01.example.com 'tcpdump -i eth1 port 2379 -A -vv'
# Capture with filters
ssh root@node01.example.com 'tcpdump -i eth1 "tcp port 2379 and host node02.example.com" -vv'
# Download capture file
scp root@node01.example.com:/tmp/capture.pcap ./
Firewall Diagnostics
# List all firewall rules
ssh root@node01.example.com 'iptables -L -n -v'
# List specific chain
ssh root@node01.example.com 'iptables -L INPUT -n -v'
# Check if port is allowed
ssh root@node01.example.com 'iptables -L -n | grep 2379'
# Test firewall rule (temporarily add)
ssh root@node01.example.com 'iptables -I INPUT -p tcp --dport 2379 -j ACCEPT'
# Count packets on specific rule
ssh root@node01.example.com 'iptables -L INPUT -n -v | grep 2379'
Log Querying
Journalctl Commands
# View logs for specific service
sudo journalctl -u chainfire.service
# Follow logs in real-time
sudo journalctl -u chainfire.service -f
# View logs from current boot
sudo journalctl -u chainfire.service -b
# View logs from previous boot
sudo journalctl -u chainfire.service -b -1
# View logs from specific time range
sudo journalctl -u chainfire.service --since "2025-12-10 10:00" --until "2025-12-10 11:00"
# View logs from last hour
sudo journalctl -u chainfire.service --since "1 hour ago"
# View last N lines
sudo journalctl -u chainfire.service -n 100
# View logs in JSON format
sudo journalctl -u chainfire.service -o json
# View logs with no pager
sudo journalctl -u chainfire.service --no-pager
# Search for keyword
sudo journalctl -u chainfire.service | grep "error"
# Show logs from multiple services
sudo journalctl -u chainfire.service -u flaredb.service -f
Log Analysis
# Count error messages
sudo journalctl -u chainfire.service | grep -c "ERROR"
# Extract error messages
sudo journalctl -u chainfire.service | grep "ERROR"
# Group by error type
sudo journalctl -u chainfire.service -o json | \
jq -r 'select(.MESSAGE | contains("ERROR")) | .MESSAGE' | \
sort | uniq -c | sort -rn
# Find slow operations
sudo journalctl -u chainfire.service | grep "slow operation"
# Check for restarts
sudo journalctl -u chainfire.service | grep "Started\|Stopped"
# Extract structured logs
sudo journalctl -u chainfire.service -o json | \
jq 'select(.level == "ERROR") | {time: .__REALTIME_TIMESTAMP, message: .MESSAGE}'
Remote Log Querying
# Query logs on remote node
ssh root@node01.example.com 'journalctl -u chainfire.service -n 100'
# Follow remote logs
ssh root@node01.example.com 'journalctl -u chainfire.service -f'
# Query logs from all nodes
for node in node01 node02 node03; do
echo "=== $node ==="
ssh root@${node}.example.com 'journalctl -u chainfire.service -n 10'
done
Backup and Restore
Backup Commands
# Backup Chainfire data
ssh root@node01.example.com 'tar -czf - /var/lib/chainfire' > chainfire-backup-$(date +%Y%m%d).tar.gz
# Backup FlareDB data
ssh root@node01.example.com 'tar -czf - /var/lib/flaredb' > flaredb-backup-$(date +%Y%m%d).tar.gz
# Backup configuration files
tar -czf provisioning-config-$(date +%Y%m%d).tar.gz /srv/provisioning/nodes/
# Backup TLS certificates
tar -czf tls-certs-$(date +%Y%m%d).tar.gz /srv/provisioning/secrets/*.pem
Automated Backup Script
#!/bin/bash
# /srv/provisioning/scripts/backup-cluster.sh
#
# Stream the Chainfire and FlareDB data directories from every cluster
# node into a timestamped backup directory, copy the node configurations,
# and write a manifest describing the backup.
#
# Aborts on the first failed transfer (set -e + pipefail) so a truncated
# archive is never silently reported as a complete backup.
set -euo pipefail

NODES=(node01 node02 node03)
BACKUP_DIR="/backup/cluster-$(date +%Y%m%d-%H%M%S)"

mkdir -p "$BACKUP_DIR"

# Backup cluster data from all nodes. tar writes to stdout on the remote
# side; the archive lands directly in $BACKUP_DIR without a remote temp file.
for node in "${NODES[@]}"; do
  echo "Backing up $node..."
  ssh "root@${node}.example.com" 'tar -czf - /var/lib/chainfire' \
    > "$BACKUP_DIR/chainfire-${node}.tar.gz"
  ssh "root@${node}.example.com" 'tar -czf - /var/lib/flaredb' \
    > "$BACKUP_DIR/flaredb-${node}.tar.gz"
done

# Backup configurations
cp -r /srv/provisioning/nodes "$BACKUP_DIR/configs"

# Create manifest
cat > "$BACKUP_DIR/manifest.txt" <<EOF
Backup Date: $(date)
Nodes: ${NODES[*]}
Contents:
- Chainfire data (all nodes)
- FlareDB data (all nodes)
- Node configurations
EOF

echo "Backup complete: $BACKUP_DIR"
Restore Commands
# Stop services before restore
ssh root@node01.example.com 'systemctl stop chainfire.service flaredb.service'
# Restore Chainfire data
cat chainfire-backup-20251210.tar.gz | \
ssh root@node01.example.com 'cd / && tar -xzf -'
# Restore FlareDB data
cat flaredb-backup-20251210.tar.gz | \
ssh root@node01.example.com 'cd / && tar -xzf -'
# Fix permissions
ssh root@node01.example.com 'chown -R chainfire:chainfire /var/lib/chainfire'
ssh root@node01.example.com 'chown -R flaredb:flaredb /var/lib/flaredb'
# Start services
ssh root@node01.example.com 'systemctl start chainfire.service flaredb.service'
# Verify restore
ssh root@node01.example.com 'systemctl status chainfire.service flaredb.service'
curl -k https://node01.example.com:2379/health | jq
Snapshot Management
# Create Chainfire snapshot
curl -k -X POST https://node01.example.com:2379/admin/snapshot/create \
-H "Content-Type: application/json" \
-d '{"name":"manual-snapshot-'$(date +%Y%m%d-%H%M%S)'"}'
# List snapshots
curl -k https://node01.example.com:2379/admin/snapshot/list | jq
# Restore from snapshot
curl -k -X POST https://node01.example.com:2379/admin/snapshot/restore \
-H "Content-Type: application/json" \
-d '{"name":"manual-snapshot-20251210-120000"}'
# Delete old snapshots
curl -k -X DELETE https://node01.example.com:2379/admin/snapshot/manual-snapshot-20251201-120000
Quick Reference Cheat Sheet
Most Common Commands
# PXE Server
sudo systemctl restart dhcpd4 atftpd nginx
curl http://localhost:8080/health
# Build Images
cd /home/centra/cloud/baremetal/image-builder && ./build-images.sh
# Provision Node
nix run github:nix-community/nixos-anywhere -- --flake /srv/provisioning#node01 root@10.0.100.50
# Cluster Status
curl -k https://node01.example.com:2379/admin/cluster/members | jq
# Service Status
sudo systemctl status chainfire.service
sudo journalctl -u chainfire.service -f
# Health Check
curl -k https://node01.example.com:2379/health | jq
# Power Control
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power on
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe
# NixOS Update
sudo nixos-rebuild switch --flake /srv/provisioning#node01
# Logs
sudo journalctl -u chainfire.service -f
Document End