393 lines
15 KiB
Nix
393 lines
15 KiB
Nix
{ pkgs, serverPkg, clientPkg }:
|
|
let
|
|
# Client-side module, instantiated with the client package under test as its
# default package; the client node pulls it in through its imports list.
clientModule =
  import ../nixos/modules/lightscale-client.nix { defaultPackage = clientPkg; };
|
|
in
|
|
{
|
|
name = "lightscale-lab-resource-guard";
|
|
nodes = {
|
|
# Control-plane VM: static address 10.0.0.1/24 on eth1, firewall disabled,
# lightscale-server listening on port 8080.
server = { ... }: {
  boot.kernelModules = [ "wireguard" ];
  virtualisation.vlans = [ 1 ];

  networking = {
    hostName = "server";
    # Classic ethN naming, so the interface configured below is called eth1.
    usePredictableInterfaceNames = false;
    firewall.enable = false;
    interfaces.eth1 = {
      useDHCP = false;
      ipv4.addresses = [
        { address = "10.0.0.1"; prefixLength = 24; }
      ];
    };
  };

  # NOTE(review): nothing visible in this file imports a module that declares
  # services.lightscale-server, and the serverPkg argument is never used here.
  # Presumably the caller injects the server module - confirm.
  services.lightscale-server = {
    enable = true;
    listen = "10.0.0.1:8080";
    stateFile = "/var/lib/lightscale-server/state.json";
    # Must match the bearer token the test script sends to the admin API.
    adminToken = "lab-admin-token";
  };

  environment.systemPackages =
    [ clientPkg ]
    ++ (with pkgs; [ curl iputils wireguard-tools iproute2 ]);
};
|
|
|
|
# Client VM: static address 10.0.0.2/24 on eth1, firewall disabled, with the
# lightscale client module imported and the CLI tools the test script calls.
client = { ... }: {
  imports = [ clientModule ];

  boot.kernelModules = [ "wireguard" ];
  virtualisation.vlans = [ 1 ];

  networking = {
    hostName = "client";
    # Classic ethN naming, so the interface configured below is called eth1.
    usePredictableInterfaceNames = false;
    firewall.enable = false;
    interfaces.eth1 = {
      useDHCP = false;
      ipv4.addresses = [
        { address = "10.0.0.2"; prefixLength = 24; }
      ];
    };
  };

  environment.systemPackages =
    [ clientPkg ]
    ++ (with pkgs; [ wireguard-tools iproute2 iputils curl ]);
};
|
|
};
|
|
|
|
testScript = ''
|
|
import json
|
|
import time
|
|
import os
|
|
|
|
# Boot both VMs and wait until the control server answers on its listen port.
start_all()

server.wait_for_unit("lightscale-server.service")
server.wait_for_open_port(8080, addr="10.0.0.1", timeout=120)
client.wait_for_unit("multi-user.target")

# Create the test network through the admin API. The JSON response carries
# the bootstrap token used to register the client below.
create_network_cmd = (
    "curl -sSf -X POST http://10.0.0.1:8080/v1/networks "
    "-H 'authorization: Bearer lab-admin-token' "
    "-H 'content-type: application/json' "
    "-d '{\"name\":\"guard-net\",\"bootstrap_token_ttl_seconds\":1200,\"bootstrap_token_uses\":10}'"
)
net = json.loads(server.succeed(create_network_cmd))
token = net["bootstrap_token"]["token"]

# Register the client node under the "guard" profile with that token.
register_cmd = (
    "lightscale-client --profile guard "
    "--state-dir /var/lib/lightscale-client/guard "
    "--control-url http://10.0.0.1:8080 "
    f"register --node-name client -- {token}"
)
client.succeed(register_cmd)
|
|
|
|
def start_agent_with_pidfile(cleanup_before_start=True, pid_file="/var/run/lightscale-guard.pid"):
    """Build the shell command line that launches the lightscale-client agent.

    Despite its name, this helper starts nothing: it only returns the command
    string, which callers hand to systemd-run or run in the background.

    Args:
        cleanup_before_start: When true, append --cleanup-before-start.
        pid_file: Path passed via --pid-file; a falsy value omits the flag.

    Returns:
        The full command as a string with single spaces between arguments.
    """
    parts = [
        "lightscale-client --profile guard",
        "--state-dir /var/lib/lightscale-client/guard",
        "--control-url http://10.0.0.1:8080",
        "agent --listen-port 51820 --heartbeat-interval 5 --longpoll-timeout 5",
        "--endpoint 10.0.0.2:51820",
    ]
    # Optional flags are appended only when requested, so the result never
    # contains doubled or trailing spaces from empty placeholders.
    if cleanup_before_start:
        parts.append("--cleanup-before-start")
    if pid_file:
        parts.append(f"--pid-file {pid_file}")
    return " ".join(parts)
|
|
|
|
def agent_is_running():
    """Return True when pgrep reports a lightscale-client agent process on the client VM."""
    # "|| true" forces a zero exit status; only the printed PIDs are inspected.
    _status, matched_pids = client.execute("pgrep -f 'lightscale-client.*agent' || true")
    return bool(matched_pids.strip())
|
|
|
|
def interface_exists(iface):
    """Return True when "ip link show" prints anything for iface on the client VM."""
    # Errors go to /dev/null and the exit status is forced to zero, so a
    # missing link shows up as empty output.
    _status, link_info = client.execute(f"ip link show {iface} 2>/dev/null || true")
    return bool(link_info.strip())
|
|
|
|
# =======================================================================
# TEST 1: Interface Prefix Protection
# Sets up the interfaces the later cleanup checks rely on:
# - an ls-* interface (the prefix lightscale manages) is created and
#   removed by hand
# - a non-ls-* interface (wg0) is created and left in place; TEST 2
#   asserts that the agent's shutdown cleanup did not touch it
# =======================================================================
print("=" * 60)
print("TEST 1: Interface Prefix Protection")
print("=" * 60)

# Test 1a: an ls- prefixed interface can be created and deleted.
print("Test 1a: Creating ls-default interface...")
for setup_cmd in (
    "ip link add ls-default type wireguard",
    "ip link set ls-default up",
):
    client.succeed(setup_cmd)
assert interface_exists("ls-default"), (
    "FAILED: ls-default interface should exist after creation"
)
print(" ✓ ls-default interface created")

# Delete it again by hand.
client.succeed("ip link del ls-default")
assert not interface_exists("ls-default"), (
    "FAILED: ls-default interface should be deleted"
)
print(" ✓ ls-default interface deleted")

# Test 1b: a non-ls- prefixed interface (wg0) that cleanup must leave alone.
print("Test 1b: Creating wg0 interface (non-managed)...")
for setup_cmd in (
    "ip link add wg0 type wireguard",
    "ip link set wg0 up",
):
    client.succeed(setup_cmd)
assert interface_exists("wg0"), (
    "FAILED: wg0 interface should exist after creation"
)
print(" ✓ wg0 interface created (non-managed interface)")

# wg0 intentionally stays up; TEST 2 checks that it survives agent cleanup.
|
|
|
|
# =======================================================================
# TEST 2: Normal Shutdown Cleanup
# Runs the agent as a transient systemd unit, stops it with
# "systemctl stop", and checks that ls-guard is gone while the unmanaged
# wg0 interface is still present.
# =======================================================================
print("=" * 60)
print("TEST 2: Normal Shutdown Cleanup")
print("=" * 60)

# Mirror the agent log to the VM console so it appears in the test output.
print("Starting lightscale-client agent...")
client.succeed("touch /tmp/agent.log")
client.execute("sh -c 'tail -n +1 -f /tmp/agent.log >/dev/console 2>&1 &'")

# Launch the agent as the transient unit lightscale-agent.service.
agent_unit_cmd = "".join([
    "systemd-run --no-block --unit=lightscale-agent --service-type=simple ",
    "--property=Restart=no ",
    "--property=TimeoutStartSec=30 ",
    "--property=StandardOutput=append:/tmp/agent.log ",
    "--property=StandardError=append:/tmp/agent.log -- ",
    start_agent_with_pidfile(cleanup_before_start=True),
])
client.succeed(agent_unit_cmd)
client.wait_for_unit("lightscale-agent.service")
client.wait_until_succeeds("ip link show ls-guard", timeout=60)
print(" ✓ Agent started, ls-guard interface created")

# Graceful stop through systemd.
print("Stopping agent gracefully...")
client.succeed("systemctl stop lightscale-agent.service")
time.sleep(2)

# ls-guard must be gone; wg0 (created in TEST 1) must have survived.
assert not interface_exists("ls-guard"), (
    "FAILED: ls-guard interface should be cleaned up on normal shutdown"
)
assert interface_exists("wg0"), (
    "FAILED: wg0 interface should NOT be touched by cleanup (non-managed interface)"
)
print(" ✓ ls-guard interface cleaned up on normal shutdown")
print(" ✓ wg0 interface preserved (not managed by lightscale)")

# wg0 is no longer needed by the following tests.
client.succeed("ip link del wg0")
|
|
|
|
# =======================================================================
# TEST 3: Abnormal Termination Cleanup (SIGKILL)
# Purpose: Observe what is left behind when the agent is killed with
# SIGKILL.
# Note: SIGKILL cannot be caught and the process gets no chance to run any
# in-process cleanup (destructors / Drop handlers included), so the
# ls-guard interface may be left behind. This section therefore only
# reports the state and removes the interface by hand.
# =======================================================================
print("=" * 60)
print("TEST 3: Abnormal Termination Cleanup (SIGKILL)")
print("=" * 60)

# Start agent again.
# --collect matters here: a transient unit whose main process dies from
# SIGKILL ends up in the "failed" state and, without --collect, stays loaded
# until "systemctl reset-failed". The next systemd-run that reuses the unit
# name lightscale-agent (TEST 4) would then be rejected.
print("Starting agent for SIGKILL test...")
client.succeed(
    "systemd-run --no-block --collect --unit=lightscale-agent --service-type=simple "
    "--property=Restart=no "
    "--property=TimeoutStartSec=30 "
    "--property=StandardOutput=append:/tmp/agent.log "
    "--property=StandardError=append:/tmp/agent.log -- "
    + start_agent_with_pidfile(cleanup_before_start=True)
)
client.wait_for_unit("lightscale-agent.service")
client.wait_until_succeeds("ip link show ls-guard", timeout=60)
print(" ✓ Agent started, ls-guard interface exists")

# Get PID and kill with SIGKILL
print("Sending SIGKILL to agent...")
pid = client.succeed("pgrep -f 'lightscale-client.*agent.*guard' | head -1").strip()
print(f" Agent PID: {pid}")
client.succeed(f"kill -9 {pid}")
time.sleep(3)

# Verify the agent is dead; kill any remaining instance as a fallback.
result = client.execute("pgrep -f 'lightscale-client.*agent.*guard' || true")
if result[1].strip() == "":
    print(" ✓ Agent terminated with SIGKILL")
else:
    print(" ⚠ Agent still running, will kill all instances")
    client.succeed("pkill -9 -f 'lightscale-client.*agent' || true")
    time.sleep(2)

# Report whether the interface survived the kill, and remove a leftover so
# that TEST 4 starts from a known state.
print(" Checking cleanup status (SIGKILL cleanup may be delayed)...")
if interface_exists("ls-guard"):
    print(" ⚠ ls-guard still exists (may need manual cleanup in production)")
    client.succeed("ip link del ls-guard 2>/dev/null || true")
else:
    print(" ✓ ls-guard interface cleaned up after SIGKILL")
|
|
|
|
# =======================================================================
# TEST 4: Cleanup Before Start (--cleanup-before-start)
# Purpose: Verify that leftover ls-* interfaces (as a crashed agent would
# leave them) are removed when the agent starts with
# --cleanup-before-start, and that the agent's own ls-guard interface
# exists afterwards.
# =======================================================================
print("=" * 60)
print("TEST 4: Cleanup Before Start (--cleanup-before-start)")
print("=" * 60)

# Simulate leftover interface from previous crash
print("Creating leftover ls-guard interface...")
client.succeed("ip link add ls-guard type wireguard")
client.succeed("ip link set ls-guard up")
assert interface_exists("ls-guard"), (
    "FAILED: Leftover ls-guard interface should exist"
)
print(" ✓ Leftover ls-guard interface created")

# A second ls- interface with a different name checks that cleanup is by
# prefix, not only for the interface the agent itself uses.
client.succeed("ip link add ls-legacy type wireguard")
client.succeed("ip link set ls-legacy up")
assert interface_exists("ls-legacy"), (
    "FAILED: ls-legacy interface should exist"
)
print(" ✓ Leftover ls-legacy interface created")

# Start agent with --cleanup-before-start
print("Starting agent with --cleanup-before-start...")
client.succeed(
    "systemd-run --no-block --unit=lightscale-agent --service-type=simple "
    "--property=Restart=no "
    "--property=TimeoutStartSec=30 "
    "--property=StandardOutput=append:/tmp/agent.log "
    "--property=StandardError=append:/tmp/agent.log -- "
    + start_agent_with_pidfile(cleanup_before_start=True)
)
client.wait_for_unit("lightscale-agent.service")

# Poll instead of sleeping a fixed two seconds: the unit counts as started as
# soon as the process is spawned, so cleanup may not have happened yet. The
# stale ls-guard cannot serve as the readiness signal (it already exists), so
# wait for ls-legacy to disappear first, then for ls-guard to be present.
client.wait_until_fails("ip link show ls-legacy", timeout=60)
client.wait_until_succeeds("ip link show ls-guard", timeout=60)

# The agent should have cleaned up old interfaces and created new one
assert interface_exists("ls-guard"), (
    "FAILED: ls-guard should exist after agent start"
)
assert not interface_exists("ls-legacy"), (
    "FAILED: ls-legacy should be cleaned up by --cleanup-before-start"
)
print(" ✓ Old ls-* interfaces cleaned up")
print(" ✓ New ls-guard interface created")

# Stop agent
client.succeed("systemctl stop lightscale-agent.service")
time.sleep(2)
|
|
|
|
# =======================================================================
# TEST 5: PID File Single-Instance Enforcement
# Purpose: Verify that --pid-file prevents multiple agent instances
# from running simultaneously with the same profile
# =======================================================================
print("=" * 60)
print("TEST 5: PID File Single-Instance Enforcement")
print("=" * 60)

# Start first instance with PID file (backgrounded, output to its own log).
# pid_file is reused by TEST 6.
print("Starting first agent instance with PID file...")
pid_file = "/tmp/lightscale-guard.pid"
client.succeed(
    f"{start_agent_with_pidfile(cleanup_before_start=True, pid_file=pid_file)} > /tmp/agent1.log 2>&1 &"
)
time.sleep(3)
assert agent_is_running(), (
    "FAILED: First agent instance should be running"
)
print(" ✓ First instance started")

# Check PID file exists and is not empty
pid_content = client.succeed(f"cat {pid_file}").strip()
print(f" PID file content: {pid_content}")
assert pid_content != "", (
    "FAILED: PID file should contain valid PID"
)
print(" ✓ PID file created with valid PID")

# Try to start a second instance with the same PID file. It runs in the
# foreground, so this call only returns once that instance has exited.
print("Attempting to start second instance with same PID file...")
result = client.execute(
    f"{start_agent_with_pidfile(cleanup_before_start=True, pid_file=pid_file)} 2>&1 || true"
)
output = result[1]
print(f" Second instance output: {output}")

# Count the agent processes that are still alive. pgrep prints one PID per
# line; split on whitespace rather than on an escaped newline, because a
# backslash is not an escape character inside this Nix string and a doubled
# backslash would reach Python as a literal backslash followed by "n".
time.sleep(2)
pids = client.succeed("pgrep -f 'lightscale-client.*agent.*guard' || true").strip()
running_pids = pids.split()
pid_count = len(running_pids)
print(f" Running agent processes: {pid_count}")

# Exactly one instance must remain: the first one. Whether the second one
# exits silently or prints an error is up to the implementation.
assert pid_count == 1, (
    f"FAILED: expected exactly one running agent instance, found {pid_count}: {running_pids}"
)
print(" ✓ Second instance prevented from starting (or exited immediately)")

# Stop first instance
client.succeed("pkill -f 'lightscale-client.*agent.*guard' || true")
time.sleep(2)

# Report whether the agent removed its PID file; remove a leftover so the
# next test starts clean.
if client.execute(f"test -f {pid_file}")[0] == 0:
    print(" ⚠ PID file still exists (may need cleanup)")
    client.succeed(f"rm -f {pid_file}")
else:
    print(" ✓ PID file cleaned up")
|
|
|
|
# =======================================================================
# TEST 6: Stale PID File Recovery
# Writes a PID file that names a process which is not expected to exist,
# starts the agent, and checks that the agent runs and that the PID file
# no longer holds the stale value.
# =======================================================================
print("=" * 60)
print("TEST 6: Stale PID File Recovery")
print("=" * 60)

# Plant the stale PID file (pid_file was set in TEST 5).
stale_pid = "99999"
client.succeed(f"echo '{stale_pid}' > {pid_file}")
print(f" Created stale PID file with PID {stale_pid}")

# Launch the agent in the background against that PID file.
print("Starting agent with stale PID file...")
stale_start_cmd = start_agent_with_pidfile(cleanup_before_start=True, pid_file=pid_file)
client.succeed(f"{stale_start_cmd} > /tmp/agent2.log 2>&1 &")
time.sleep(3)

# The agent must be up and the PID file must have been rewritten.
assert agent_is_running(), (
    "FAILED: Agent should start despite stale PID file (stale PID detection failed)"
)
new_pid = client.succeed(f"cat {pid_file}").strip()
print(f" New PID file content: {new_pid}")
assert new_pid != stale_pid, (
    "FAILED: PID file should be updated with new PID after stale detection"
)
print(" ✓ Stale PID file detected and replaced")

# Tear down: stop the agent and remove the PID file.
client.succeed("pkill -f 'lightscale-client.*agent.*guard' || true")
time.sleep(2)
client.succeed(f"rm -f {pid_file}")
|
|
|
|
# Final verification: list any ls-* interfaces still present on the client.
# This step only reports; leftovers do not fail the test.
print("")
print("=" * 60)
print("FINAL VERIFICATION")
print("=" * 60)
remaining = client.succeed("ip link show | grep -E '^[0-9]+: ls-' || true").strip()
if not remaining:
    print(" ✓ All ls-* interfaces properly cleaned up")
else:
    print(f" ⚠ Remaining ls-* interfaces:\n{remaining}")

print("")
print("=" * 60)
print("All resource-guard tests completed successfully!")
print("=" * 60)
|
|
'';
|
|
}
|