- Replace form_urlencoded with RFC 3986 compliant URI encoding - Implement aws_uri_encode() matching AWS SigV4 spec exactly - Unreserved chars (A-Z,a-z,0-9,-,_,.,~) not encoded - All other chars percent-encoded with uppercase hex - Preserve slashes in paths, encode in query params - Normalize empty paths to '/' per AWS spec - Fix test expectations (body hash, HMAC values) - Add comprehensive SigV4 signature determinism test This fixes the canonicalization mismatch that caused signature validation failures in T047. Auth can now be enabled for production. Refs: T058.S1
# Command Reference Guide

**Document Version:** 1.0
**Last Updated:** 2025-12-10

## Table of Contents

- [PXE Server Operations](#pxe-server-operations)
- [Image Building](#image-building)
- [Node Provisioning](#node-provisioning)
- [Cluster Management](#cluster-management)
- [Service Management](#service-management)
- [Health Checks](#health-checks)
- [BMC/IPMI Operations](#bmcipmi-operations)
- [Network Diagnostics](#network-diagnostics)
- [Log Querying](#log-querying)
- [Backup and Restore](#backup-and-restore)
- [Quick Reference Cheat Sheet](#quick-reference-cheat-sheet)

## PXE Server Operations
|
|
|
|
### Start/Stop Services
|
|
|
|
```bash
|
|
# Start all PXE services
|
|
sudo systemctl start dhcpd4.service atftpd.service nginx.service
|
|
|
|
# Stop all PXE services
|
|
sudo systemctl stop dhcpd4.service atftpd.service nginx.service
|
|
|
|
# Restart all PXE services
|
|
sudo systemctl restart dhcpd4.service atftpd.service nginx.service
|
|
|
|
# Enable services at boot
|
|
sudo systemctl enable dhcpd4.service atftpd.service nginx.service
|
|
|
|
# Check status
|
|
sudo systemctl status dhcpd4.service
|
|
sudo systemctl status atftpd.service
|
|
sudo systemctl status nginx.service
|
|
```
|
|
|
|
### DHCP Server Management
|
|
|
|
```bash
|
|
# Test DHCP configuration syntax
|
|
sudo dhcpd -t -cf /etc/dhcp/dhcpd.conf
|
|
|
|
# View DHCP leases
|
|
sudo cat /var/lib/dhcp/dhcpd.leases
|
|
|
|
# Watch DHCP leases in real-time
|
|
sudo tail -f /var/lib/dhcp/dhcpd.leases
|
|
|
|
# View DHCP server logs
|
|
sudo journalctl -u dhcpd4.service -f
|
|
|
|
# Check DHCP server is listening
|
|
sudo ss -ulnp | grep :67
|
|
|
|
# Send DHCP discover (from client)
|
|
sudo nmap --script broadcast-dhcp-discover -e eth0
|
|
```
|
|
|
|
### TFTP Server Management
|
|
|
|
```bash
|
|
# Test TFTP download locally
|
|
tftp localhost -c get undionly.kpxe /tmp/test.kpxe
|
|
|
|
# Test TFTP from remote host
|
|
tftp 10.0.100.10 -c get ipxe.efi /tmp/test.efi
|
|
|
|
# Check TFTP server is listening
|
|
sudo ss -ulnp | grep :69
|
|
|
|
# View TFTP logs
|
|
sudo journalctl -u atftpd.service -f
|
|
|
|
# Monitor TFTP traffic
|
|
sudo tcpdump -i eth0 -n port 69 -vv
|
|
|
|
# List TFTP root directory
|
|
ls -lh /var/lib/tftpboot/
|
|
```
|
|
|
|
### HTTP Server Management
|
|
|
|
```bash
|
|
# Test HTTP server
|
|
curl http://localhost:8080/health
|
|
|
|
# Test boot script availability
|
|
curl http://localhost:8080/boot/ipxe/boot.ipxe
|
|
|
|
# Test netboot image availability
|
|
curl -I http://localhost:8080/boot/nixos/control-plane/bzImage
|
|
|
|
# Check nginx configuration syntax
|
|
sudo nginx -t
|
|
|
|
# Reload nginx configuration (without restart)
|
|
sudo nginx -s reload
|
|
|
|
# View nginx access logs
|
|
sudo tail -f /var/log/nginx/access.log
|
|
|
|
# View nginx error logs
|
|
sudo tail -f /var/log/nginx/error.log
|
|
|
|
# Monitor HTTP traffic
|
|
sudo tcpdump -i eth0 -n port 80 or port 8080 -A | grep -E "GET|POST|HTTP"
|
|
```
|
|
|
|
### PXE Boot Debugging
|
|
|
|
```bash
|
|
# Monitor all PXE-related traffic
|
|
sudo tcpdump -i eth0 -n '(port 67 or port 68 or port 69 or port 80)' -vv
|
|
|
|
# Watch for DHCP discover packets
|
|
sudo tcpdump -i eth0 -n 'udp port 67 or udp port 68' -vv
|
|
|
|
# Watch for specific MAC address
|
|
sudo tcpdump -i eth0 -n 'ether host 52:54:00:12:34:56'
|
|
|
|
# Check PXE server health
|
|
curl http://10.0.100.10:8080/health | jq
|
|
|
|
# View comprehensive logs (all services)
|
|
sudo journalctl -u dhcpd4 -u atftpd -u nginx -f --since "5 minutes ago"
|
|
```
|
|
|
|
## Image Building
|
|
|
|
### Build Netboot Images
|
|
|
|
```bash
|
|
# Build all profiles
|
|
cd /home/centra/cloud/baremetal/image-builder
|
|
./build-images.sh
|
|
|
|
# Build specific profile
|
|
./build-images.sh --profile control-plane
|
|
./build-images.sh --profile worker
|
|
./build-images.sh --profile all-in-one
|
|
|
|
# Build and deploy to PXE server
|
|
./build-images.sh --deploy
|
|
|
|
# Build with custom output directory
|
|
./build-images.sh --output-dir /srv/pxe/images
|
|
|
|
# Build with verbose output
|
|
./build-images.sh --verbose
|
|
```
|
|
|
|
### Manual Nix Builds
|
|
|
|
```bash
|
|
# Build initrd
|
|
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.netbootRamdisk
|
|
|
|
# Build kernel
|
|
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.kernel
|
|
|
|
# Build complete system
|
|
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.toplevel
|
|
|
|
# Check build artifacts
|
|
ls -lh result/
|
|
|
|
# Copy artifacts manually
|
|
sudo cp result/bzImage /var/lib/pxe-boot/nixos/control-plane/
|
|
sudo cp result/initrd /var/lib/pxe-boot/nixos/control-plane/
|
|
```
|
|
|
|
### Image Verification
|
|
|
|
```bash
|
|
# Check image sizes
|
|
ls -lh /var/lib/pxe-boot/nixos/*/
|
|
|
|
# Verify bzImage is a valid kernel
|
|
file /var/lib/pxe-boot/nixos/control-plane/bzImage
|
|
# Expected: Linux kernel x86 boot executable ...
|
|
|
|
# Verify initrd is compressed
|
|
file /var/lib/pxe-boot/nixos/control-plane/initrd
|
|
# Expected: gzip compressed data ...
|
|
|
|
# Check HTTP accessibility
|
|
curl -I http://10.0.100.10:8080/boot/nixos/control-plane/bzImage
|
|
# Expected: HTTP/1.1 200 OK
|
|
|
|
# Calculate checksums
|
|
sha256sum /var/lib/pxe-boot/nixos/control-plane/{bzImage,initrd}
|
|
```
|
|
|
|
## Node Provisioning
|
|
|
|
### nixos-anywhere Commands
|
|
|
|
```bash
|
|
# Basic provisioning
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#node01 \
|
|
root@10.0.100.50
|
|
|
|
# Provision with remote build (faster on slow local machine)
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#node01 \
|
|
--build-on-remote \
|
|
root@10.0.100.50
|
|
|
|
# Provision with disk encryption
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#node01 \
|
|
--disk-encryption-keys /tmp/luks.key <(cat /srv/provisioning/secrets/node01-luks.key) \
|
|
root@10.0.100.50
|
|
|
|
# Debug mode (verbose output, no reboot)
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#node01 \
|
|
--debug \
|
|
--no-reboot \
|
|
root@10.0.100.50
|
|
|
|
# Use specific SSH key
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#node01 \
|
|
--ssh-key ~/.ssh/id_ed25519_provisioning \
|
|
root@10.0.100.50
|
|
|
|
# Use specific Nix binary
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#node01 \
|
|
--nix-path /run/current-system/sw/bin/nix \
|
|
root@10.0.100.50
|
|
```
|
|
|
|
### Batch Provisioning
|
|
|
|
```bash
|
|
# Provision multiple nodes in parallel
|
|
for node in node01 node02 node03; do
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#${node} \
|
|
--build-on-remote \
|
|
root@<node-ip> &
|
|
done
|
|
wait
|
|
echo "All nodes provisioned"
|
|
|
|
# Provision with logging
|
|
for node in node01 node02 node03; do
|
|
nix run github:nix-community/nixos-anywhere -- \
|
|
--flake /srv/provisioning#${node} \
|
|
root@<node-ip> 2>&1 | tee /var/log/provision-${node}.log &
|
|
done
|
|
wait
|
|
```
|
|
|
|
### SSH to Installer
|
|
|
|
```bash
|
|
# SSH to PXE-booted installer
|
|
ssh root@10.0.100.50
|
|
|
|
# Check available disks
|
|
ssh root@10.0.100.50 'lsblk'
|
|
|
|
# Check network configuration
|
|
ssh root@10.0.100.50 'ip addr show'
|
|
|
|
# Check internet connectivity
|
|
ssh root@10.0.100.50 'ping -c 3 cache.nixos.org'
|
|
|
|
# Manual disk wipe (if needed)
|
|
ssh root@10.0.100.50 'wipefs -a /dev/sda && sgdisk --zap-all /dev/sda'
|
|
|
|
# Test disko configuration
|
|
ssh root@10.0.100.50 'nix-shell -p disko --run "disko --mode test /tmp/disko.nix"'
|
|
```
|
|
|
|
## Cluster Management
|
|
|
|
### Cluster Member Operations
|
|
|
|
```bash
|
|
# List cluster members (Chainfire)
|
|
curl -k https://node01.example.com:2379/admin/cluster/members | jq
|
|
|
|
# List cluster members (FlareDB)
|
|
curl -k https://node01.example.com:2479/admin/cluster/members | jq
|
|
|
|
# Get cluster leader
|
|
curl -k https://node01.example.com:2379/admin/cluster/leader | jq
|
|
|
|
# Get cluster status
|
|
curl -k https://node01.example.com:2379/admin/cluster/status | jq
|
|
```
|
|
|
|
### Add Node to Cluster
|
|
|
|
```bash
|
|
# Add member to Chainfire cluster
|
|
curl -k -X POST https://node01.example.com:2379/admin/member/add \
|
|
-H "Content-Type: application/json" \
|
|
-d '{
|
|
"id": "node04",
|
|
"raft_addr": "10.0.200.13:2380"
|
|
}'
|
|
|
|
# Add member to FlareDB cluster
|
|
curl -k -X POST https://node01.example.com:2479/admin/member/add \
|
|
-H "Content-Type: application/json" \
|
|
-d '{
|
|
"id": "node04",
|
|
"raft_addr": "10.0.200.13:2480"
|
|
}'
|
|
```
|
|
|
|
### Remove Node from Cluster
|
|
|
|
```bash
|
|
# Remove member from Chainfire cluster
|
|
curl -k -X DELETE https://node01.example.com:2379/admin/member/node04
|
|
|
|
# Remove member from FlareDB cluster
|
|
curl -k -X DELETE https://node01.example.com:2479/admin/member/node04
|
|
|
|
# Verify removal
|
|
curl -k https://node01.example.com:2379/admin/cluster/members | jq '.members[] | select(.id=="node04")'
|
|
# Expected: empty (no output)
|
|
```
|
|
|
|
### Cluster Health Checks
|
|
|
|
```bash
|
|
# Check all nodes health (Chainfire)
|
|
for node in node01 node02 node03; do
|
|
echo "$node:"
|
|
curl -k https://${node}.example.com:2379/health | jq -c
|
|
done
|
|
|
|
# Check cluster has quorum
|
|
MEMBER_COUNT=$(curl -sk https://node01.example.com:2379/admin/cluster/members | jq '.members | length')
|
|
echo "Cluster has $MEMBER_COUNT members"
|
|
if [ $MEMBER_COUNT -ge 2 ]; then
|
|
echo "Quorum achieved"
|
|
else
|
|
echo "WARNING: No quorum"
|
|
fi
|
|
|
|
# Check Raft leader exists
|
|
LEADER=$(curl -sk https://node01.example.com:2379/admin/cluster/leader | jq -r '.id')
|
|
if [ -n "$LEADER" ]; then
|
|
echo "Leader: $LEADER"
|
|
else
|
|
echo "ERROR: No leader elected"
|
|
fi
|
|
```
|
|
|
|
## Service Management
|
|
|
|
### Systemd Service Control
|
|
|
|
```bash
|
|
# Start service
|
|
sudo systemctl start chainfire.service
|
|
|
|
# Stop service
|
|
sudo systemctl stop chainfire.service
|
|
|
|
# Restart service
|
|
sudo systemctl restart chainfire.service
|
|
|
|
# Reload configuration (without restart)
|
|
sudo systemctl reload chainfire.service
|
|
|
|
# Enable service at boot
|
|
sudo systemctl enable chainfire.service
|
|
|
|
# Disable service at boot
|
|
sudo systemctl disable chainfire.service
|
|
|
|
# Check service status
|
|
sudo systemctl status chainfire.service
|
|
|
|
# View service dependencies
|
|
sudo systemctl list-dependencies chainfire.service
|
|
```
|
|
|
|
### Multi-Service Operations
|
|
|
|
```bash
|
|
# Start all PlasmaCloud services
|
|
sudo systemctl start chainfire.service flaredb.service iam.service \
|
|
plasmavmc.service prismnet.service flashdns.service
|
|
|
|
# Stop all PlasmaCloud services
|
|
sudo systemctl stop chainfire.service flaredb.service iam.service \
|
|
plasmavmc.service prismnet.service flashdns.service
|
|
|
|
# Check status of all services
|
|
systemctl status 'chainfire.service' 'flaredb.service' 'iam.service' \
|
|
'plasmavmc.service' 'prismnet.service' 'flashdns.service' --no-pager
|
|
|
|
# Restart services in order
|
|
sudo systemctl restart chainfire.service && sleep 10
|
|
sudo systemctl restart flaredb.service && sleep 10
|
|
sudo systemctl restart iam.service
|
|
```
|
|
|
|
### NixOS Configuration Management
|
|
|
|
```bash
|
|
# Build new configuration (test)
|
|
sudo nixos-rebuild test --flake /srv/provisioning#node01
|
|
|
|
# Build and apply new configuration
|
|
sudo nixos-rebuild switch --flake /srv/provisioning#node01
|
|
|
|
# Build and set as boot default (no activation)
|
|
sudo nixos-rebuild boot --flake /srv/provisioning#node01
|
|
|
|
# Rollback to previous generation
|
|
sudo nixos-rebuild switch --rollback
|
|
|
|
# List generations
|
|
sudo nixos-rebuild list-generations
|
|
|
|
# Boot into specific generation (next boot only)
|
|
sudo nixos-rebuild boot --switch-generation 3
|
|
|
|
# Delete old generations
|
|
sudo nix-collect-garbage --delete-older-than 30d
|
|
```
|
|
|
|
## Health Checks
|
|
|
|
### Service Health Endpoints
|
|
|
|
```bash
|
|
# Chainfire health
|
|
curl -k https://node01.example.com:2379/health | jq
|
|
|
|
# FlareDB health
|
|
curl -k https://node01.example.com:2479/health | jq
|
|
|
|
# IAM health
|
|
curl -k https://node01.example.com:8080/health | jq
|
|
|
|
# PlasmaVMC health
|
|
curl -k https://node01.example.com:9090/health | jq
|
|
|
|
# PrismNET health
|
|
curl -k https://node01.example.com:9091/health | jq
|
|
|
|
# FlashDNS health (via HTTP)
|
|
curl -k https://node01.example.com:853/health | jq
|
|
|
|
# FiberLB health
|
|
curl -k https://node01.example.com:9092/health | jq
|
|
|
|
# K8sHost health
|
|
curl -k https://node01.example.com:10250/healthz
|
|
```
|
|
|
|
### Comprehensive Health Check Script
|
|
|
|
```bash
|
|
#!/bin/bash
|
|
# /srv/provisioning/scripts/health-check-all.sh
|
|
|
|
NODES=("node01" "node02" "node03")
|
|
SERVICES=("2379:Chainfire" "2479:FlareDB" "8080:IAM" "9090:PlasmaVMC")
|
|
|
|
for node in "${NODES[@]}"; do
|
|
echo "Checking $node..."
|
|
for service in "${SERVICES[@]}"; do
|
|
port=$(echo $service | cut -d: -f1)
|
|
name=$(echo $service | cut -d: -f2)
|
|
|
|
status=$(curl -sk https://${node}.example.com:${port}/health | jq -r '.status' 2>/dev/null)
|
|
if [ "$status" = "healthy" ]; then
|
|
echo " ✓ $name: healthy"
|
|
else
|
|
echo " ✗ $name: unhealthy or unreachable"
|
|
fi
|
|
done
|
|
echo ""
|
|
done
|
|
```
|
|
|
|
### System Health Checks
|
|
|
|
```bash
|
|
# Check system load
|
|
ssh root@node01.example.com 'uptime'
|
|
|
|
# Check memory usage
|
|
ssh root@node01.example.com 'free -h'
|
|
|
|
# Check disk usage
|
|
ssh root@node01.example.com 'df -h'
|
|
|
|
# Check disk I/O
|
|
ssh root@node01.example.com 'iostat -x 1 5'
|
|
|
|
# Check network bandwidth
|
|
ssh root@node01.example.com 'iftop -i eth1 -t -s 5'
|
|
|
|
# Check process list
|
|
ssh root@node01.example.com 'ps aux --sort=-%mem | head -20'
|
|
|
|
# Check for OOM kills
|
|
ssh root@node01.example.com 'dmesg | grep -i "out of memory"'
|
|
```
|
|
|
|
## BMC/IPMI Operations
|
|
|
|
### Power Control
|
|
|
|
```bash
|
|
# Power on
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin -P password chassis power on
|
|
|
|
# Power off (graceful)
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power soft
|
|
|
|
# Power off (force)
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power off
|
|
|
|
# Power cycle
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power cycle
|
|
|
|
# Power status
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power status
|
|
```
|
|
|
|
### Boot Device Control
|
|
|
|
```bash
|
|
# Set next boot to PXE
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe
|
|
|
|
# Set next boot to disk
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev disk
|
|
|
|
# Set next boot to CDROM
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev cdrom
|
|
|
|
# Set persistent PXE boot (all future boots)
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe options=persistent
|
|
|
|
# Clear persistent boot device
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev none
|
|
```
|
|
|
|
### Serial-over-LAN (SOL)
|
|
|
|
```bash
|
|
# Activate SOL session
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sol activate
|
|
|
|
# Deactivate SOL session (from another terminal)
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sol deactivate
|
|
|
|
# Configure SOL settings
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sol set enabled true 1
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sol set volatile-bit-rate 115.2 1
|
|
|
|
# View SOL configuration
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sol info 1
|
|
```
|
|
|
|
### System Information
|
|
|
|
```bash
|
|
# Get sensor readings
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sdr list
|
|
|
|
# Get specific sensor
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sdr get "CPU Temp"
|
|
|
|
# Get system event log
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sel list
|
|
|
|
# Clear system event log
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin sel clear
|
|
|
|
# Get BMC info
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin bmc info
|
|
|
|
# Get FRU (Field Replaceable Unit) info
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin fru print
|
|
```
|
|
|
|
### Batch Operations
|
|
|
|
```bash
|
|
# Power on all nodes
|
|
for ip in 10.0.10.{50..55}; do
|
|
echo "Powering on $ip..."
|
|
ipmitool -I lanplus -H $ip -U admin -P password chassis power on
|
|
done
|
|
|
|
# Check power status all nodes
|
|
for ip in 10.0.10.{50..55}; do
|
|
echo -n "$ip: "
|
|
ipmitool -I lanplus -H $ip -U admin -P password chassis power status
|
|
done
|
|
|
|
# Set all nodes to PXE boot
|
|
for ip in 10.0.10.{50..55}; do
|
|
echo "Setting $ip to PXE boot..."
|
|
ipmitool -I lanplus -H $ip -U admin -P password chassis bootdev pxe options=persistent
|
|
done
|
|
```
|
|
|
|
## Network Diagnostics
|
|
|
|
### Connectivity Tests
|
|
|
|
```bash
|
|
# Ping test
|
|
ping -c 5 node01.example.com
|
|
|
|
# TCP port test
|
|
nc -zv node01.example.com 2379
|
|
|
|
# TCP port test with timeout
|
|
timeout 5 bash -c '</dev/tcp/node01.example.com/2379' && echo "Port open" || echo "Port closed"
|
|
|
|
# Multiple port test
|
|
for port in 2379 2380 2479 2480 8080; do
|
|
nc -zv node01.example.com $port
|
|
done
|
|
|
|
# DNS resolution test
|
|
dig node01.example.com
|
|
dig -x 10.0.200.10
|
|
|
|
# Route test
|
|
traceroute node01.example.com
|
|
|
|
# MTU test
|
|
ping -M do -s 1472 -c 5 node01.example.com
|
|
```
|
|
|
|
### Bandwidth Tests
|
|
|
|
```bash
|
|
# iperf3 server (on target node)
|
|
ssh root@node02.example.com 'iperf3 -s -D'
|
|
|
|
# iperf3 client (from source node)
|
|
ssh root@node01.example.com 'iperf3 -c node02.example.com -t 10'
|
|
|
|
# iperf3 bidirectional test
|
|
ssh root@node01.example.com 'iperf3 -c node02.example.com -t 10 --bidir'
|
|
|
|
# iperf3 UDP test
|
|
ssh root@node01.example.com 'iperf3 -c node02.example.com -u -b 10G -t 10'
|
|
```
|
|
|
|
### Packet Capture
|
|
|
|
```bash
|
|
# Capture all traffic on interface
|
|
ssh root@node01.example.com 'tcpdump -i eth1 -w /tmp/capture.pcap -c 1000'
|
|
|
|
# Capture specific port
|
|
ssh root@node01.example.com 'tcpdump -i eth1 port 2379 -w /tmp/chainfire.pcap'
|
|
|
|
# Capture and display
|
|
ssh root@node01.example.com 'tcpdump -i eth1 port 2379 -A -vv'
|
|
|
|
# Capture with filters
|
|
ssh root@node01.example.com 'tcpdump -i eth1 "tcp port 2379 and host node02.example.com" -vv'
|
|
|
|
# Download capture file
|
|
scp root@node01.example.com:/tmp/capture.pcap ./
|
|
```
|
|
|
|
### Firewall Diagnostics
|
|
|
|
```bash
|
|
# List all firewall rules
|
|
ssh root@node01.example.com 'iptables -L -n -v'
|
|
|
|
# List specific chain
|
|
ssh root@node01.example.com 'iptables -L INPUT -n -v'
|
|
|
|
# Check if port is allowed
|
|
ssh root@node01.example.com 'iptables -L -n | grep 2379'
|
|
|
|
# Test firewall rule (temporarily add)
|
|
ssh root@node01.example.com 'iptables -I INPUT -p tcp --dport 2379 -j ACCEPT'
|
|
|
|
# Count packets on specific rule
|
|
ssh root@node01.example.com 'iptables -L INPUT -n -v | grep 2379'
|
|
```
|
|
|
|
## Log Querying
|
|
|
|
### Journalctl Commands
|
|
|
|
```bash
|
|
# View logs for specific service
|
|
sudo journalctl -u chainfire.service
|
|
|
|
# Follow logs in real-time
|
|
sudo journalctl -u chainfire.service -f
|
|
|
|
# View logs from current boot
|
|
sudo journalctl -u chainfire.service -b
|
|
|
|
# View logs from previous boot
|
|
sudo journalctl -u chainfire.service -b -1
|
|
|
|
# View logs from specific time range
|
|
sudo journalctl -u chainfire.service --since "2025-12-10 10:00" --until "2025-12-10 11:00"
|
|
|
|
# View logs from last hour
|
|
sudo journalctl -u chainfire.service --since "1 hour ago"
|
|
|
|
# View last N lines
|
|
sudo journalctl -u chainfire.service -n 100
|
|
|
|
# View logs in JSON format
|
|
sudo journalctl -u chainfire.service -o json
|
|
|
|
# View logs with no pager
|
|
sudo journalctl -u chainfire.service --no-pager
|
|
|
|
# Search for keyword
|
|
sudo journalctl -u chainfire.service | grep "error"
|
|
|
|
# Show logs from multiple services
|
|
sudo journalctl -u chainfire.service -u flaredb.service -f
|
|
```
|
|
|
|
### Log Analysis
|
|
|
|
```bash
|
|
# Count error messages
|
|
sudo journalctl -u chainfire.service | grep -c "ERROR"
|
|
|
|
# Extract error messages
|
|
sudo journalctl -u chainfire.service | grep "ERROR"
|
|
|
|
# Group by error type
|
|
sudo journalctl -u chainfire.service -o json | \
|
|
jq -r 'select(.MESSAGE | contains("ERROR")) | .MESSAGE' | \
|
|
sort | uniq -c | sort -rn
|
|
|
|
# Find slow operations
|
|
sudo journalctl -u chainfire.service | grep "slow operation"
|
|
|
|
# Check for restarts
|
|
sudo journalctl -u chainfire.service | grep "Started\|Stopped"
|
|
|
|
# Extract structured logs
|
|
sudo journalctl -u chainfire.service -o json | \
|
|
jq 'select(.level == "ERROR") | {time: .__REALTIME_TIMESTAMP, message: .MESSAGE}'
|
|
```
|
|
|
|
### Remote Log Querying
|
|
|
|
```bash
|
|
# Query logs on remote node
|
|
ssh root@node01.example.com 'journalctl -u chainfire.service -n 100'
|
|
|
|
# Follow remote logs
|
|
ssh root@node01.example.com 'journalctl -u chainfire.service -f'
|
|
|
|
# Query logs from all nodes
|
|
for node in node01 node02 node03; do
|
|
echo "=== $node ==="
|
|
ssh root@${node}.example.com 'journalctl -u chainfire.service -n 10'
|
|
done
|
|
```
|
|
|
|
## Backup and Restore
|
|
|
|
### Backup Commands
|
|
|
|
```bash
|
|
# Backup Chainfire data
|
|
ssh root@node01.example.com 'tar -czf - /var/lib/chainfire' > chainfire-backup-$(date +%Y%m%d).tar.gz
|
|
|
|
# Backup FlareDB data
|
|
ssh root@node01.example.com 'tar -czf - /var/lib/flaredb' > flaredb-backup-$(date +%Y%m%d).tar.gz
|
|
|
|
# Backup configuration files
|
|
tar -czf provisioning-config-$(date +%Y%m%d).tar.gz /srv/provisioning/nodes/
|
|
|
|
# Backup TLS certificates
|
|
tar -czf tls-certs-$(date +%Y%m%d).tar.gz /srv/provisioning/secrets/*.pem
|
|
```
|
|
|
|
### Automated Backup Script
|
|
|
|
```bash
|
|
#!/bin/bash
|
|
# /srv/provisioning/scripts/backup-cluster.sh
|
|
|
|
BACKUP_DIR="/backup/cluster-$(date +%Y%m%d-%H%M%S)"
|
|
mkdir -p "$BACKUP_DIR"
|
|
|
|
# Backup cluster data from all nodes
|
|
for node in node01 node02 node03; do
|
|
echo "Backing up $node..."
|
|
ssh root@$node.example.com "tar -czf - /var/lib/chainfire" > "$BACKUP_DIR/chainfire-$node.tar.gz"
|
|
ssh root@$node.example.com "tar -czf - /var/lib/flaredb" > "$BACKUP_DIR/flaredb-$node.tar.gz"
|
|
done
|
|
|
|
# Backup configurations
|
|
cp -r /srv/provisioning/nodes "$BACKUP_DIR/configs"
|
|
|
|
# Create manifest
|
|
cat > "$BACKUP_DIR/manifest.txt" <<EOF
|
|
Backup Date: $(date)
|
|
Nodes: node01 node02 node03
|
|
Contents:
|
|
- Chainfire data (all nodes)
|
|
- FlareDB data (all nodes)
|
|
- Node configurations
|
|
EOF
|
|
|
|
echo "Backup complete: $BACKUP_DIR"
|
|
```
|
|
|
|
### Restore Commands
|
|
|
|
```bash
|
|
# Stop services before restore
|
|
ssh root@node01.example.com 'systemctl stop chainfire.service flaredb.service'
|
|
|
|
# Restore Chainfire data
|
|
cat chainfire-backup-20251210.tar.gz | \
|
|
ssh root@node01.example.com 'cd / && tar -xzf -'
|
|
|
|
# Restore FlareDB data
|
|
cat flaredb-backup-20251210.tar.gz | \
|
|
ssh root@node01.example.com 'cd / && tar -xzf -'
|
|
|
|
# Fix permissions
|
|
ssh root@node01.example.com 'chown -R chainfire:chainfire /var/lib/chainfire'
|
|
ssh root@node01.example.com 'chown -R flaredb:flaredb /var/lib/flaredb'
|
|
|
|
# Start services
|
|
ssh root@node01.example.com 'systemctl start chainfire.service flaredb.service'
|
|
|
|
# Verify restore
|
|
ssh root@node01.example.com 'systemctl status chainfire.service flaredb.service'
|
|
curl -k https://node01.example.com:2379/health | jq
|
|
```
|
|
|
|
### Snapshot Management
|
|
|
|
```bash
|
|
# Create Chainfire snapshot
|
|
curl -k -X POST https://node01.example.com:2379/admin/snapshot/create \
|
|
-H "Content-Type: application/json" \
|
|
-d '{"name":"manual-snapshot-'$(date +%Y%m%d-%H%M%S)'"}'
|
|
|
|
# List snapshots
|
|
curl -k https://node01.example.com:2379/admin/snapshot/list | jq
|
|
|
|
# Restore from snapshot
|
|
curl -k -X POST https://node01.example.com:2379/admin/snapshot/restore \
|
|
-H "Content-Type: application/json" \
|
|
-d '{"name":"manual-snapshot-20251210-120000"}'
|
|
|
|
# Delete old snapshots
|
|
curl -k -X DELETE https://node01.example.com:2379/admin/snapshot/manual-snapshot-20251201-120000
|
|
```
|
|
|
|
---
|
|
|
|
## Quick Reference Cheat Sheet
|
|
|
|
### Most Common Commands
|
|
|
|
```bash
|
|
# PXE Server
|
|
sudo systemctl restart dhcpd4 atftpd nginx
|
|
curl http://localhost:8080/health
|
|
|
|
# Build Images
|
|
cd /home/centra/cloud/baremetal/image-builder && ./build-images.sh
|
|
|
|
# Provision Node
|
|
nix run github:nix-community/nixos-anywhere -- --flake /srv/provisioning#node01 root@10.0.100.50
|
|
|
|
# Cluster Status
|
|
curl -k https://node01.example.com:2379/admin/cluster/members | jq
|
|
|
|
# Service Status
|
|
sudo systemctl status chainfire.service
|
|
sudo journalctl -u chainfire.service -f
|
|
|
|
# Health Check
|
|
curl -k https://node01.example.com:2379/health | jq
|
|
|
|
# Power Control
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis power on
|
|
ipmitool -I lanplus -H 10.0.10.50 -U admin chassis bootdev pxe
|
|
|
|
# NixOS Update
|
|
sudo nixos-rebuild switch --flake /srv/provisioning#node01
|
|
|
|
# Logs
|
|
sudo journalctl -u chainfire.service -f
|
|
```
|
|
|
|
---
|
|
|
|
**Document End**
|