commit 58e2be433f8d9b84cd5bbf44592166fa8f7af005 Author: Soma Nakamura
Date: Fri Feb 13 17:08:38 2026 +0900 Initialize superproject with client/server submodules diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b2be92b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +result diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..957a7a2 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "lightscale-client"] + path = lightscale-client + url = https://git.centraworks.net/centra/lightscale-client.git +[submodule "lightscale-server"] + path = lightscale-server + url = https://git.centraworks.net/centra/lightscale-server.git diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..d52e90f --- /dev/null +++ b/TODO.md @@ -0,0 +1,81 @@ +# Lightscale TODO + +This list captures every feature discussed in the design conversation and tracks the current +implementation status. Items marked as done are implemented in a minimal form; the rest are +pending or stubbed. + +## Control plane +- [x] Network creation + enrollment tokens +- [x] Deterministic overlay prefixes (IPv4/IPv6) +- [x] Netmap with node/peer metadata +- [x] Relay config surfaced in netmap (STUN/TURN/stream relay/UDP relay lists) +- [x] Device naming + DNS name metadata in netmap +- [x] Admin approval flow for new devices +- [x] Auth URL onboarding flow (approval link) +- [x] Token revocation endpoint +- [x] ACL / policy rules per network +- [x] Key rotation policy and revocation +- [x] Key transparency / audit log +- [x] TLS pinning / server identity bootstrapping +- [x] Control plane HA with shared DB (multi-server, no single point of failure) +- [x] Client failover across multiple control URLs + +## Data plane +- [x] WireGuard interface bring-up (Linux) +- [x] Userspace WireGuard fallback (boringtun embedded) +- [x] Peer config from netmap (AllowedIPs per peer) +- [x] Basic route application (`wg-up --apply-routes`) +- [x] Basic exit-node acceptance flag (first advertised default route) +- [x] Dynamic peer updates 
(long-polling or streaming) +- [x] Subnet routing conflict detection and metrics +- [x] Exit node selection and routing policy (multi-exit, per-app, metrics) +- [x] Exit node selection by peer ID/name (single) +- [x] DNS push / resolver integration +- [x] L2 segment support (optional, non-default) +- [x] Subnet router SNAT mode + return-route guidance + +## NAT traversal and relay +- [x] STUN client to discover public endpoints (best effort) +- [x] Server-observed endpoint merge (heartbeat listen port) +- [x] Peer probe to trigger NAT traversal (best effort) +- [x] Dynamic endpoint rotation on stale handshakes +- [x] UDP relay (best effort, not TURN/stream relay) +- [x] Stream relay (TCP, DERP-like) +- [x] Stream relay signaling for peer probes +- [x] TURN client (UDP) +- [x] Stream relay integration into dataplane (fallback tunnels) +- [x] IPv6-only server strategy (use IPv6 listen + IPv6 control URLs) + +## Multi-network (one client, multiple networks) +- [x] Profile-scoped client state +- [x] Concurrent multi-network routing isolation +- [x] Default no-forwarding between networks (prevent accidental routing) +- [x] Route conflict detection for overlapping subnets +- [x] Route translation / prefix mapping for overlaps +- [x] Per-network DNS suffix + split DNS +- [x] Exit node selection when multiple networks advertise default routes + +## Onboarding and UX +- [x] CLI init/register/heartbeat/netmap/status +- [x] CLI wg-up/wg-down for local testing +- [x] CLI dns export and relay inspection +- [x] CLI to approve devices +- [x] CLI to manage ACLs +- [x] Local UI / agent mode for background operation + +## Testing +- [x] NixOS VM lab (2-node fast) +- [x] NixOS VM lab (5-node) +- [x] NAT and firewall scenario tests +- [x] Relay fallback tests +- [x] Multi-network overlap tests +- [x] CLI smoke assertions for `status`/`netmap`/`relay` outputs in lab tests +- [x] Negative-path enrollment tests (invalid/expired/revoked tokens, approval-required flow) +- [x] Key rotation 
+ revoke flow validation (netmap status + peer removal) +- [x] DNS export + resolver integration tests (hosts/json, split DNS) +- [x] IPv6 dataplane connectivity tests (overlay + subnet routes) +- [x] Userspace endpoint refresh/relay fallback regression test coverage +- [x] Scale lab test (8-10 nodes) to validate full-mesh and relay fallback +- [x] Agent restart + state recovery tests (graceful restart, endpoint re-discovery) +- [x] Control plane restart/outage resilience (data plane stays up, netmap recovers) +- [x] Multi-relay server failover (stream relay list, first down) diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..3fcda0b --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1768564909, + "narHash": "sha256-Kell/SpJYVkHWMvnhqJz/8DqQg2b6PguxVWOuadbHCc=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "e4bae1bd10c9c57b2cf517953ab70060a828ee6f", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..bf1dcdd --- /dev/null +++ b/flake.nix @@ -0,0 +1,80 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + }; + + outputs = { self, nixpkgs }: + let + system = "x86_64-linux"; + pkgs = import nixpkgs { inherit system; }; + + serverPkg = pkgs.rustPlatform.buildRustPackage { + pname = "lightscale-server"; + version = "0.1.0"; + src = ./lightscale-server; + cargoLock.lockFile = ./lightscale-server/Cargo.lock; + }; + + clientPkg = pkgs.rustPlatform.buildRustPackage { + pname = "lightscale-client"; + version = "0.1.0"; + src = ./lightscale-client; + cargoLock.lockFile = ./lightscale-client/Cargo.lock; + nativeBuildInputs = [ pkgs.pkg-config ]; + buildInputs = [ pkgs.libmnl pkgs.libnftnl ]; + }; +
+ # Build one NixOS VM test. The lab file is called with { pkgs, serverPkg, clientPkg } + # and its result is handed to nixpkgs' make-test-python.nix for the target system. + mkLabTest = testFile: + (import (nixpkgs + "/nixos/tests/make-test-python.nix") + (import testFile { inherit pkgs serverPkg clientPkg; })) + { inherit system pkgs; }; + + labTestFast = mkLabTest ./lab/test-fast.nix; + labTestFull = mkLabTest ./lab/test.nix; + labTestFirewall = mkLabTest ./lab/test-firewall.nix; + labTestNat = mkLabTest ./lab/test-nat.nix; + labTestMulti = mkLabTest ./lab/test-multi.nix; + labTestScale = mkLabTest ./lab/test-scale.nix; + labTestNegative = mkLabTest ./lab/test-negative.nix; + labTestKeys = mkLabTest ./lab/test-keys.nix; + labTestRestart = mkLabTest ./lab/test-restart.nix; + labTestControlplaneRestart = mkLabTest ./lab/test-controlplane-restart.nix; + labTestControlplaneHa = mkLabTest ./lab/test-controlplane-ha.nix; + labTestDns = mkLabTest ./lab/test-dns.nix; + labTestIpv6 = mkLabTest ./lab/test-ipv6.nix; + labTestUserspace = mkLabTest ./lab/test-userspace.nix; + labTestRelayFailover = mkLabTest ./lab/test-relay-failover.nix;
+ in + { + packages.${system} = { + lightscale-server = serverPkg; + lightscale-client = clientPkg; + }; + + nixosTests.lightscale-lab = labTestFast; + nixosTests.lightscale-lab-5 = labTestFull; + nixosTests.lightscale-lab-firewall = labTestFirewall; + nixosTests.lightscale-lab-nat = labTestNat; + nixosTests.lightscale-lab-multi = labTestMulti; + nixosTests.lightscale-lab-scale = labTestScale; + nixosTests.lightscale-lab-negative = labTestNegative; + nixosTests.lightscale-lab-keys = labTestKeys; + nixosTests.lightscale-lab-restart = labTestRestart; + nixosTests.lightscale-lab-controlplane-restart = labTestControlplaneRestart; + nixosTests.lightscale-lab-controlplane-ha = labTestControlplaneHa; + nixosTests.lightscale-lab-dns = labTestDns; +
nixosTests.lightscale-lab-ipv6 = labTestIpv6; + nixosTests.lightscale-lab-userspace = labTestUserspace; + nixosTests.lightscale-lab-relay-failover = labTestRelayFailover; + + devShells.${system}.default = pkgs.mkShell { + buildInputs = [ + pkgs.nix + pkgs.qemu + pkgs.curl + ]; + }; + }; +} diff --git a/lab/README.md b/lab/README.md new file mode 100644 index 0000000..1ec834e --- /dev/null +++ b/lab/README.md @@ -0,0 +1,84 @@ +# lightscale lab (NixOS VMs) + +This folder contains NixOS tests that boot QEMU VMs, register them against the +lightscale control plane, bring up WireGuard interfaces, and verify ping across overlay IPs. + +## Run the lab test + +Fast (2 nodes) test to avoid long builds: + +```sh +cd /path/to/lightscale +nix build .#nixosTests.lightscale-lab.driver +./result/bin/nixos-test-driver +``` + +Or use the helper script (prebuilds client/server first): + +```sh +cd /path/to/lightscale +./lab/run.sh +``` + +To watch logs interactively (recommended when debugging hangs): + +```sh +cd /path/to/lightscale +./lab/run.sh --interactive --keep +``` + +In the Python prompt, run: + +```python +node1.execute("tail -f /tmp/lightscale-server.log") +``` + +Full (5 nodes) test: + +```sh +cd /path/to/lightscale +nix build .#nixosTests.lightscale-lab-5.driver +./result/bin/nixos-test-driver +``` + +Helper script: + +```sh +cd /path/to/lightscale +./lab/run.sh full +``` + +Firewall (UDP blocked, stream relay fallback): + +```sh +cd /path/to/lightscale +./lab/run.sh firewall +``` + +Control plane restart/outage resilience: + +```sh +cd /path/to/lightscale +./lab/run.sh controlplane +``` + +Control plane HA (shared DB + client failover across control URLs): + +```sh +cd /path/to/lightscale +./lab/run.sh controlplane-ha +``` + +Stream relay failover (multi relay servers, first down): + +```sh +cd /path/to/lightscale +./lab/run.sh relay-failover +``` + +The tests will: +- start node1 as the control server +- 
register clients +- run `agent` on each node (endpoint rotation + stream relay fallback phases) +- ping full-mesh between overlay IPv4 addresses in both phases +- for the firewall test, block UDP/51820 between nodes and verify stream relay fallback diff --git a/lab/driver-controlplane b/lab/driver-controlplane new file mode 120000 index 0000000..ec41175 --- /dev/null +++ b/lab/driver-controlplane @@ -0,0 +1 @@ +/nix/store/mg62ylj2alfj3i1ixpmg19xi0dcnny0z-nixos-test-driver-lightscale-lab-controlplane-restart \ No newline at end of file diff --git a/lab/driver-controlplane-ha b/lab/driver-controlplane-ha new file mode 120000 index 0000000..2c4c0a8 --- /dev/null +++ b/lab/driver-controlplane-ha @@ -0,0 +1 @@ +/nix/store/hac85k2vi3vwly832nv7x80mlbwng6iy-nixos-test-driver-lightscale-lab-controlplane-ha \ No newline at end of file diff --git a/lab/driver-dns b/lab/driver-dns new file mode 120000 index 0000000..d23b60f --- /dev/null +++ b/lab/driver-dns @@ -0,0 +1 @@ +/nix/store/br0pl0x3qmlvnds00nd8y35w1vyzziaj-nixos-test-driver-lightscale-lab-dns \ No newline at end of file diff --git a/lab/driver-fast b/lab/driver-fast new file mode 120000 index 0000000..ac5271c --- /dev/null +++ b/lab/driver-fast @@ -0,0 +1 @@ +/nix/store/80h2bviz9fdv4g247arhn5hksmm2nvhh-nixos-test-driver-lightscale-lab-fast \ No newline at end of file diff --git a/lab/driver-firewall b/lab/driver-firewall new file mode 120000 index 0000000..3d5ec8a --- /dev/null +++ b/lab/driver-firewall @@ -0,0 +1 @@ +/nix/store/ydrpa0rckw7kfinh0n8c4nvc2129rbgy-nixos-test-driver-lightscale-lab-firewall \ No newline at end of file diff --git a/lab/driver-full b/lab/driver-full new file mode 120000 index 0000000..55abeb8 --- /dev/null +++ b/lab/driver-full @@ -0,0 +1 @@ +/nix/store/cz5w7fyycrxpn17d54dan4zv6wjq6niw-nixos-test-driver-lightscale-lab \ No newline at end of file diff --git a/lab/driver-ipv6 b/lab/driver-ipv6 new file mode 120000 index 0000000..967bf51 --- /dev/null +++ b/lab/driver-ipv6 @@ -0,0 +1 @@ 
+/nix/store/i52my94w85aj8dqvaizb8mi2aisllp2s-nixos-test-driver-lightscale-lab-ipv6 \ No newline at end of file diff --git a/lab/driver-keys b/lab/driver-keys new file mode 120000 index 0000000..66280b7 --- /dev/null +++ b/lab/driver-keys @@ -0,0 +1 @@ +/nix/store/mn51lrf2sjhb81nqcmz9j3p1hj3swmb6-nixos-test-driver-lightscale-lab-keys \ No newline at end of file diff --git a/lab/driver-multi b/lab/driver-multi new file mode 120000 index 0000000..1aaf977 --- /dev/null +++ b/lab/driver-multi @@ -0,0 +1 @@ +/nix/store/2mn037ldm47kaw3iw6kh94b0dqsy26d7-nixos-test-driver-lightscale-lab-multi \ No newline at end of file diff --git a/lab/driver-nat b/lab/driver-nat new file mode 120000 index 0000000..f3bd105 --- /dev/null +++ b/lab/driver-nat @@ -0,0 +1 @@ +/nix/store/k2nmjhrq7a3gjq1zgx9ff59dy0qm4mxf-nixos-test-driver-lightscale-lab-nat \ No newline at end of file diff --git a/lab/driver-negative b/lab/driver-negative new file mode 120000 index 0000000..63e5d93 --- /dev/null +++ b/lab/driver-negative @@ -0,0 +1 @@ +/nix/store/xiwvziwzdc0fri28vi1f5ph7bai2rryf-nixos-test-driver-lightscale-lab-negative \ No newline at end of file diff --git a/lab/driver-relay-failover b/lab/driver-relay-failover new file mode 120000 index 0000000..aa27079 --- /dev/null +++ b/lab/driver-relay-failover @@ -0,0 +1 @@ +/nix/store/zhbdb2sdba1460q0l18ffrgqnip59wzy-nixos-test-driver-lightscale-lab-relay-failover \ No newline at end of file diff --git a/lab/driver-restart b/lab/driver-restart new file mode 120000 index 0000000..aa7145e --- /dev/null +++ b/lab/driver-restart @@ -0,0 +1 @@ +/nix/store/1yqvld21wpsgnpbc5917d9bx5ivz4jhr-nixos-test-driver-lightscale-lab-restart \ No newline at end of file diff --git a/lab/driver-scale b/lab/driver-scale new file mode 120000 index 0000000..3447fa9 --- /dev/null +++ b/lab/driver-scale @@ -0,0 +1 @@ +/nix/store/q2h7928xcj425jmdjyp8sn4hsvcxispw-nixos-test-driver-lightscale-lab-scale \ No newline at end of file diff --git a/lab/driver-userspace 
b/lab/driver-userspace new file mode 120000 index 0000000..2e808ba --- /dev/null +++ b/lab/driver-userspace @@ -0,0 +1 @@ +/nix/store/66wrz088pg2vfvx8pvpic2dlvlzkj6bk-nixos-test-driver-lightscale-lab-userspace \ No newline at end of file diff --git a/lab/result-fast b/lab/result-fast new file mode 120000 index 0000000..5aea53d --- /dev/null +++ b/lab/result-fast @@ -0,0 +1 @@ +/nix/store/vrjzgcqjhcli86rapiyrv14453n8dgrq-vm-test-run-lightscale-lab-fast \ No newline at end of file diff --git a/lab/run.sh b/lab/run.sh new file mode 100755 index 0000000..703af2c --- /dev/null +++ b/lab/run.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +set -euo pipefail + +MODE=fast +INTERACTIVE=0 +KEEP=0 +for arg in "$@"; do + case "$arg" in + full) MODE=full ;; + fast) MODE=fast ;; + firewall) MODE=firewall ;; + nat) MODE=nat ;; + multi) MODE=multi ;; + scale) MODE=scale ;; + negative) MODE=negative ;; + keys) MODE=keys ;; + restart) MODE=restart ;; + controlplane) MODE=controlplane ;; + controlplane-ha) MODE=controlplane-ha ;; + relay-failover) MODE=relay-failover ;; + dns) MODE=dns ;; + ipv6) MODE=ipv6 ;; + userspace) MODE=userspace ;; + --interactive) INTERACTIVE=1 ;; + --keep) KEEP=1 ;; + esac +done + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd) +cd "$ROOT_DIR" + +nix build .#packages.x86_64-linux.lightscale-server +nix build .#packages.x86_64-linux.lightscale-client + +OUT_LINK="$ROOT_DIR/lab/driver-$MODE" +if [[ "$MODE" == "full" ]]; then + nix build .#nixosTests.lightscale-lab-5.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "nat" ]]; then + nix build .#nixosTests.lightscale-lab-nat.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "multi" ]]; then + nix build .#nixosTests.lightscale-lab-multi.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "scale" ]]; then + nix build .#nixosTests.lightscale-lab-scale.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "negative" ]]; then + nix build .#nixosTests.lightscale-lab-negative.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "keys" ]]; then + nix build .#nixosTests.lightscale-lab-keys.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "restart" ]]; then + nix build .#nixosTests.lightscale-lab-restart.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "controlplane" ]]; then + nix build .#nixosTests.lightscale-lab-controlplane-restart.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "controlplane-ha" ]]; then + nix build .#nixosTests.lightscale-lab-controlplane-ha.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "relay-failover" ]]; then + nix build .#nixosTests.lightscale-lab-relay-failover.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "dns" ]]; then + nix build .#nixosTests.lightscale-lab-dns.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "ipv6" ]]; then + nix build .#nixosTests.lightscale-lab-ipv6.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "userspace" ]]; then + nix build .#nixosTests.lightscale-lab-userspace.driver --out-link "$OUT_LINK" +elif [[ "$MODE" == "firewall" ]]; then + nix build .#nixosTests.lightscale-lab-firewall.driver --out-link "$OUT_LINK" +else + nix build .#nixosTests.lightscale-lab.driver --out-link "$OUT_LINK" +fi + +DRIVER_ARGS=() +if [[ "$INTERACTIVE" == "1" ]]; then + DRIVER_ARGS+=("--interactive") +fi +if [[ 
"$KEEP" == "1" ]]; then + DRIVER_ARGS+=("--keep-vm-state") +fi + +exec "$OUT_LINK/bin/nixos-test-driver" "${DRIVER_ARGS[@]}" diff --git a/lab/test-controlplane-ha.nix b/lab/test-controlplane-ha.nix new file mode 100644 index 0000000..a508e86 --- /dev/null +++ b/lab/test-controlplane-ha.nix @@ -0,0 +1,133 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-controlplane-ha"; + nodes = { + node1 = { ... }: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + services.postgresql = { + enable = true; + package = pkgs.postgresql_16; + settings = { + listen_addresses = pkgs.lib.mkForce "*"; + }; + authentication = '' + local all all trust + host all all 127.0.0.1/32 trust + ''; + initialScript = pkgs.writeText "lightscale-init.sql" '' + CREATE ROLE lightscale LOGIN; + CREATE DATABASE lightscale OWNER lightscale; + ''; + }; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... 
}: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + + node1.wait_for_unit("postgresql.service") + node1.wait_for_open_port(5432, timeout=120) + + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + + node1.succeed("touch /tmp/lightscale-server-1.log") + node1.succeed("touch /tmp/lightscale-server-2.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server-1.log >/dev/console 2>&1 &'") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server-2.log >/dev/console 2>&1 &'") + + db_url = "postgres://lightscale@127.0.0.1/lightscale?sslmode=disable" + + node1.succeed( + "systemd-run --no-block --unit=lightscale-server-1 --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server-1.log " + "--property=StandardError=append:/tmp/lightscale-server-1.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 " + + f"--db-url '{db_url}'" + ) + node1.succeed( + "systemd-run --no-block --unit=lightscale-server-2 --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server-2.log " + "--property=StandardError=append:/tmp/lightscale-server-2.log " + "--setenv=RUST_LOG=info -- " + 
"lightscale-server --listen 10.0.0.1:8081 " + + f"--db-url '{db_url}'" + ) + + node1.wait_for_unit("lightscale-server-1.service") + node1.wait_for_unit("lightscale-server-2.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + node1.wait_for_open_port(8081, addr="10.0.0.1", timeout=120) + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"ha\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"ha\"]}'" + )) + token = net["bootstrap_token"]["token"] + + node1.succeed("systemctl stop lightscale-server-1.service") + node1.wait_until_fails("curl -sSf http://10.0.0.1:8080/healthz") + + node2.succeed( + "lightscale-client --profile ha --config /tmp/ls-config.json " + "init http://10.0.0.1:8080,http://10.0.0.1:8081" + ) + node2.succeed( + "lightscale-client --profile ha --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state register --node-name node2 -- " + token + ) + node2.succeed( + "lightscale-client --profile ha --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state heartbeat --endpoint 10.0.0.2:51820" + ) + node2.succeed( + "lightscale-client --profile ha --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state netmap | grep -q 'peers:'" + ) + ''; +} diff --git a/lab/test-controlplane-restart.nix b/lab/test-controlplane-restart.nix new file mode 100644 index 0000000..c709fce --- /dev/null +++ b/lab/test-controlplane-restart.nix @@ -0,0 +1,200 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-controlplane-restart"; + nodes = { + node1 = { ... 
}: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node3 = { ... }: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node4 = { ... 
}: { + networking.hostName = "node4"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.4"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node4.wait_for_unit("multi-user.target") + + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + node4.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.4/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + import json + import time + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + 
"\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip, state_dir): + node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} heartbeat --endpoint {ip}:51820" + ) + + def start_agent(node, ip, state_dir): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + f"--endpoint {ip}:51820" + ) + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + enroll(node2, "node2", "10.0.0.2", "/tmp/ls-state-2") + enroll(node3, "node3", "10.0.0.3", "/tmp/ls-state-3") + start_agent(node2, "10.0.0.2", "/tmp/ls-state-2") + start_agent(node3, "10.0.0.3", "/tmp/ls-state-3") + + data2 = json.loads(node2.succeed("cat /tmp/ls-state-2/state.json")) + data3 = json.loads(node3.succeed("cat /tmp/ls-state-3/state.json")) + ip2 = data2["ipv4"] + ip3 = data3["ipv4"] + + node2.wait_until_succeeds(f"ping -c 3 {ip3}", timeout=120) + node3.wait_until_succeeds(f"ping -c 3 {ip2}", timeout=120) + + def restarts(node): + return int(node.succeed("systemctl show -p NRestarts --value lightscale-agent.service")) + + restarts_before = 
restarts(node2) + + node1.succeed("systemctl stop lightscale-server.service") + node1.wait_until_fails("curl -sSf http://10.0.0.1:8080/healthz") + time.sleep(6) + + node2.wait_until_succeeds(f"ping -c 3 {ip3}", timeout=120) + restarts_after = restarts(node2) + if restarts_after != restarts_before: + raise Exception(f"agent restarted during control plane outage ({restarts_before} -> {restarts_after})") + + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + enroll(node4, "node4", "10.0.0.4", "/tmp/ls-state-4") + start_agent(node4, "10.0.0.4", "/tmp/ls-state-4") + + data4 = json.loads(node4.succeed("cat /tmp/ls-state-4/state.json")) + ip4 = data4["ipv4"] + node2.wait_until_succeeds(f"ping -c 3 {ip4}", timeout=180) + ''; +} diff --git a/lab/test-dns.nix b/lab/test-dns.nix new file mode 100644 index 0000000..2ab8ec7 --- /dev/null +++ b/lab/test-dns.nix @@ -0,0 +1,190 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-dns"; + nodes = { + node1 = { ... 
}: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + services.resolved.enable = true; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + pkgs.dnsutils + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + services.resolved.enable = true; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + pkgs.dnsutils + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + node1.wait_for_unit("lightscale-server.service") + 
node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"dns\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"dns\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip): + node.succeed( + "lightscale-client --profile dns --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile dns --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile dns --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + enroll(node1, "node1", "10.0.0.1") + enroll(node2, "node2", "10.0.0.2") + + node1.succeed( + "lightscale-client --profile dns --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state netmap | grep -q 'peers: 1'" + ) + + node1.succeed( + "lightscale-client --profile dns --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state dns --format hosts --output /tmp/hosts.out" + ) + node1.succeed( + "lightscale-client --profile dns --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state dns --format json --output /tmp/dns.json" + ) + node1.succeed( + "lightscale-client --profile dns --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state dns --format hosts " + "--apply-hosts --hosts-path /tmp/hosts.apply" + ) + + state = json.loads(node1.succeed("cat /tmp/ls-state/state.json")) + netmap = state["last_netmap"] + node_name = netmap["node"]["dns_name"] + peer_name = netmap["peers"][0]["dns_name"] + dns_domain = netmap["network"]["dns_domain"] + peer_ipv4 = netmap["peers"][0]["ipv4"] + peer_ipv6 = netmap["peers"][0]["ipv6"] + + hosts_out = node1.succeed("cat /tmp/hosts.out") + assert node_name in hosts_out + assert peer_name in 
hosts_out + assert state["ipv4"] in hosts_out + assert state["ipv6"] in hosts_out + + dns_json = json.loads(node1.succeed("cat /tmp/dns.json")) + assert dns_json["network"]["dns_domain"] == dns_domain + record_names = [record["name"] for record in dns_json["records"]] + assert node_name in record_names + assert peer_name in record_names + + hosts_apply = node1.succeed("cat /tmp/hosts.apply") + assert "# lightscale:dns begin" in hosts_apply + assert "# lightscale:dns end" in hosts_apply + assert node_name in hosts_apply + assert peer_name in hosts_apply + + def start_agent(node, endpoint): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile dns --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + f"--endpoint {endpoint}" + ) + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-dns", timeout=60) + + start_agent(node1, "10.0.0.1:51820") + start_agent(node2, "10.0.0.2:51820") + + node1.succeed("touch /tmp/lightscale-dns.log") + node1.succeed( + "systemd-run --no-block --unit=lightscale-dns --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-dns.log " + "--property=StandardError=append:/tmp/lightscale-dns.log -- " + "lightscale-client --profile dns --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state dns-serve --listen 127.0.0.1:53 " + "--apply-resolver --interface ls-dns" + ) + + node1.wait_until_succeeds("resolvectl dns ls-dns | grep -q '127.0.0.1'", timeout=60) + 
node1.wait_until_succeeds( + f"resolvectl domain ls-dns | grep -q '~{dns_domain}'", + timeout=60, + ) + + node1.wait_until_succeeds( + f"dig +short @127.0.0.1 {peer_name} | grep -q '{peer_ipv4}'", + timeout=60, + ) + node1.wait_until_succeeds( + f"dig +short -t AAAA @127.0.0.1 {peer_name} | grep -qi '{peer_ipv6}'", + timeout=60, + ) + ''; +} diff --git a/lab/test-fast.nix b/lab/test-fast.nix new file mode 100644 index 0000000..b4dbece --- /dev/null +++ b/lab/test-fast.nix @@ -0,0 +1,170 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-fast"; + nodes = { + node1 = { ... }: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... 
}: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json " + "--stream-relay 10.0.0.1:8443 --stream-relay-listen 10.0.0.1:8443" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + node1.wait_for_open_port(8443, addr="10.0.0.1", timeout=120) + node1.succeed("tail -n 50 /tmp/lightscale-server.log || true") + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip): + 
node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + enroll(node1, "node1", "10.0.0.1") + enroll(node2, "node2", "10.0.0.2") + + node1.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state netmap | grep -q 'peers: 1'" + ) + node1.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state status | grep -q 'approved: true'" + ) + node1.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state relay | grep -q 'stream-relay:'" + ) + + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state netmap | grep -q 'peers: 1'" + ) + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state status | grep -q 'approved: true'" + ) + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state relay | grep -q 'stream-relay:'" + ) + + def start_agent(node, endpoints, stream_relay): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5" + ) + for endpoint in endpoints: + cmd += f" --endpoint {endpoint}" + if stream_relay: + cmd += " --stream-relay" + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + 
"--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + # Phase 1: direct endpoint rotation (invalid -> valid) + start_agent(node1, ["203.0.113.1:51820", "10.0.0.1:51820"], False) + start_agent(node2, ["203.0.113.2:51820", "10.0.0.2:51820"], False) + + data1 = json.loads(node1.succeed("cat /tmp/ls-state/state.json")) + data2 = json.loads(node2.succeed("cat /tmp/ls-state/state.json")) + nodes = [node1, node2] + ips = [data1["ipv4"], data2["ipv4"]] + + def full_mesh_ping(nodes, ips): + for i, src in enumerate(nodes): + for j, dst in enumerate(nodes): + if i == j: + continue + src.wait_until_succeeds(f"ping -c 3 {ips[j]}", timeout=120) + + full_mesh_ping(nodes, ips) + + # Phase 2: stream relay fallback (invalid only) + node1.succeed("systemctl stop lightscale-agent.service") + node2.succeed("systemctl stop lightscale-agent.service") + start_agent(node1, ["203.0.113.1:51820"], True) + start_agent(node2, ["203.0.113.2:51820"], True) + + full_mesh_ping(nodes, ips) + ''; +} diff --git a/lab/test-firewall.nix b/lab/test-firewall.nix new file mode 100644 index 0000000..401a656 --- /dev/null +++ b/lab/test-firewall.nix @@ -0,0 +1,166 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-firewall"; + nodes = { + node1 = { ... }: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + pkgs.iptables + ]; + }; + node2 = { ... 
}: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + pkgs.iptables + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json " + "--stream-relay 10.0.0.1:8443 --stream-relay-listen 10.0.0.1:8443" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + node1.wait_for_open_port(8443, addr="10.0.0.1", timeout=120) + node1.succeed("tail -n 50 /tmp/lightscale-server.log || true") + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def 
enroll(node, name, ip): + node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + enroll(node1, "node1", "10.0.0.1") + enroll(node2, "node2", "10.0.0.2") + + def block_udp(node, peer_ip): + node.succeed(f"iptables -I OUTPUT -p udp --dport 51820 -d {peer_ip} -j DROP") + node.succeed(f"iptables -I INPUT -p udp --sport 51820 -s {peer_ip} -j DROP") + + block_udp(node1, "10.0.0.2") + block_udp(node2, "10.0.0.1") + + def start_agent(node, endpoints): + node.succeed("touch /tmp/lightscale-agent.log") + node.execute("sh -c 'tail -n +1 -f /tmp/lightscale-agent.log >/dev/console 2>&1 &'") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + "--endpoint-stale-after 5 --endpoint-max-rotations 1 " + "--stream-relay --backend boringtun" + ) + for endpoint in endpoints: + cmd += f" --endpoint {endpoint}" + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + start_agent(node1, ["10.0.0.1:51820"]) + start_agent(node2, ["10.0.0.2:51820"]) + + node1.succeed("sleep 10") + node2.succeed("sleep 10") + print(node1.succeed("lightscale-client --profile test --state-dir /tmp/ls-state status --wg --interface ls-test 
|| true")) + print(node2.succeed("lightscale-client --profile test --state-dir /tmp/ls-state status --wg --interface ls-test || true")) + + data1 = json.loads(node1.succeed("cat /tmp/ls-state/state.json")) + data2 = json.loads(node2.succeed("cat /tmp/ls-state/state.json")) + nodes = [node1, node2] + ips = [data1["ipv4"], data2["ipv4"]] + + node1.execute(f"ping -c 1 {ips[1]} || true") + node2.execute(f"ping -c 1 {ips[0]} || true") + node1.succeed("sleep 2") + print(node1.succeed("ip link show ls-test || true")) + print(node2.succeed("ip link show ls-test || true")) + print(node1.succeed("timeout 5 wg show ls-test || true")) + print(node2.succeed("timeout 5 wg show ls-test || true")) + print(node1.succeed("sysctl net.ipv4.conf.all.route_localnet net.ipv4.conf.lo.route_localnet || true")) + print(node2.succeed("sysctl net.ipv4.conf.all.route_localnet net.ipv4.conf.lo.route_localnet || true")) + print(node1.succeed("ss -u -lpn | grep lightscale-clie || true")) + print(node2.succeed("ss -u -lpn | grep lightscale-clie || true")) + print(node1.succeed(f"ip -4 route get {ips[1]} || true")) + print(node2.succeed(f"ip -4 route get {ips[0]} || true")) + print(node1.succeed("tail -n 200 /tmp/lightscale-agent.log || true")) + print(node2.succeed("tail -n 200 /tmp/lightscale-agent.log || true")) + + def full_mesh_ping(nodes, ips): + for i, src in enumerate(nodes): + for j, dst in enumerate(nodes): + if i == j: + continue + src.wait_until_succeeds(f"ping -c 3 {ips[j]}", timeout=180) + + full_mesh_ping(nodes, ips) + ''; +} diff --git a/lab/test-ipv6.nix b/lab/test-ipv6.nix new file mode 100644 index 0000000..4ca48ff --- /dev/null +++ b/lab/test-ipv6.nix @@ -0,0 +1,168 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-ipv6"; + nodes = { + node1 = { ... 
}: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 2 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.interfaces.eth2.useDHCP = false; + networking.interfaces.eth2.ipv4.addresses = [ + { address = "192.168.60.1"; prefixLength = 24; } + ]; + networking.interfaces.eth2.ipv6.addresses = [ + { address = "fd42:1::1"; prefixLength = 64; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + pkgs.nftables + ]; + }; + node3 = { ... 
}: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth2 | grep -q '192.168.60.1/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + + node2.succeed("ip -6 addr add fd42:1::10/64 dev eth2 || true") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"ipv6\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"ipv6\"]}'" + 
)) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip): + node.succeed( + "lightscale-client --profile ipv6 --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile ipv6 --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile ipv6 --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + enroll(node2, "router", "10.0.0.2") + enroll(node3, "client", "10.0.0.3") + + def start_agent(node, extra_args): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile ipv6 --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + "--endpoint-stale-after 5 --endpoint-max-rotations 1" + ) + for arg in extra_args: + cmd += f" {arg}" + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-ipv6", timeout=60) + + start_agent(node2, [ + "--endpoint 10.0.0.2:51820", + "--advertise-route fd42:1::/64", + ]) + start_agent(node3, [ + "--endpoint 10.0.0.3:51820", + "--apply-routes", + ]) + + node2.succeed( + "lightscale-client --profile ipv6 --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state router enable --interface ls-ipv6 " + "--out-interface eth2 --no-snat" + ) + + data2 = json.loads(node2.succeed("cat /tmp/ls-state/state.json")) + data3 = json.loads(node3.succeed("cat /tmp/ls-state/state.json")) + + node2.wait_until_succeeds(f"ping -6 -c 3 {data3['ipv6']}", timeout=120) + 
node3.wait_until_succeeds(f"ping -6 -c 3 {data2['ipv6']}", timeout=120) + + node3.wait_until_succeeds("ip -6 route get fd42:1::10 | grep -q 'ls-ipv6'", timeout=120) + node3.wait_until_succeeds("ping -6 -c 3 fd42:1::10", timeout=180) + ''; +} diff --git a/lab/test-keys.nix b/lab/test-keys.nix new file mode 100644 index 0000000..9ba61b9 --- /dev/null +++ b/lab/test-keys.nix @@ -0,0 +1,201 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-keys"; + nodes = { + node1 = { ... }: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node3 = { ... 
}: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + import json + import time + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":5,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = net["bootstrap_token"]["token"] + network_id = net["network"]["id"] + + def enroll(node, name, ip, 
state_dir): + node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} heartbeat --endpoint {ip}:51820" + ) + + enroll(node2, "node2", "10.0.0.2", "/tmp/ls-state-2") + enroll(node3, "node3", "10.0.0.3", "/tmp/ls-state-3") + + def start_agent(node, ip, state_dir): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + f"--endpoint {ip}:51820" + ) + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + start_agent(node2, "10.0.0.2", "/tmp/ls-state-2") + start_agent(node3, "10.0.0.3", "/tmp/ls-state-3") + + data2 = json.loads(node2.succeed("cat /tmp/ls-state-2/state.json")) + data3 = json.loads(node3.succeed("cat /tmp/ls-state-3/state.json")) + ip2 = data2["ipv4"] + ip3 = data3["ipv4"] + + node2.wait_until_succeeds(f"ping -c 3 {ip3}", timeout=120) + node3.wait_until_succeeds(f"ping -c 3 {ip2}", timeout=120) + + node1.succeed( + "lightscale-client --profile admin --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + + def admin_cmd(cmd): + return node1.succeed( + "lightscale-client --profile admin --config /tmp/ls-config.json " + cmd + ) + + admin_cmd(f"admin key-policy set 
{network_id} --max-age-seconds 5") + time.sleep(6) + + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-2 netmap | grep -q 'key_rotation_required: true'" + ) + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-2 netmap | grep -q 'peers: 0'" + ) + + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-2 rotate-keys" + ) + node3.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-3 rotate-keys" + ) + + node2.fail( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-2 netmap | grep -q 'key_rotation_required: true'" + ) + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-2 netmap | grep -q 'peers: 1'" + ) + + admin_cmd(f"admin key-policy set {network_id} --clear") + + node3_id = data3["node_id"] + admin_cmd(f"admin keys revoke {node3_id}") + + node3.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-3 netmap | grep -q 'revoked: true'" + ) + node2.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-2 netmap | grep -q 'peers: 0'" + ) + ''; +} diff --git a/lab/test-multi.nix b/lab/test-multi.nix new file mode 100644 index 0000000..53707ff --- /dev/null +++ b/lab/test-multi.nix @@ -0,0 +1,215 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-multi"; + nodes = { + node1 = { ... 
}: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 2 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.interfaces.eth2.useDHCP = false; + networking.interfaces.eth2.ipv4.addresses = [ + { address = "192.168.100.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + pkgs.nftables + ]; + }; + node3 = { ... }: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 3 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.interfaces.eth2.useDHCP = false; + networking.interfaces.eth2.ipv4.addresses = [ + { address = "192.168.100.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + pkgs.nftables + ]; + }; + node4 = { ... 
}: { + networking.hostName = "node4"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.4"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node4.wait_for_unit("multi-user.target") + + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth2 | grep -q '192.168.100.1/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth2 | grep -q '192.168.100.1/24'") + node4.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.4/24'") + + node2.succeed("ip addr add 192.168.100.10/24 dev eth2 || true") + node3.succeed("ip addr add 192.168.100.10/24 dev eth2 || true") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + 
import json + + net_a = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"net-a\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"a\"]}'" + )) + net_b = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"net-b\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"b\"]}'" + )) + token_a = net_a["bootstrap_token"]["token"] + token_b = net_b["bootstrap_token"]["token"] + + def enroll(node, profile, token, name, ip, state_dir): + node.succeed( + f"lightscale-client --profile {profile} --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile {profile} --config /tmp/ls-config.json " + f"--state-dir {state_dir} register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile {profile} --config /tmp/ls-config.json " + f"--state-dir {state_dir} heartbeat --endpoint {ip}:51820" + ) + + enroll(node2, "neta", token_a, "router-a", "10.0.0.2", "/tmp/ls-state-a") + enroll(node3, "netb", token_b, "router-b", "10.0.0.3", "/tmp/ls-state-b") + enroll(node4, "neta", token_a, "client-a", "10.0.0.4", "/tmp/ls-state-a") + enroll(node4, "netb", token_b, "client-b", "10.0.0.4", "/tmp/ls-state-b") + + def start_agent(node, profile, state_dir, listen_port, extra_args): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + f"lightscale-client --profile {profile} --config /tmp/ls-config.json " + f"--state-dir {state_dir} agent --listen-port {listen_port} " + "--heartbeat-interval 5 --longpoll-timeout 5 " + "--endpoint-stale-after 5 --endpoint-max-rotations 1" + ) + for arg in extra_args: + cmd += f" {arg}" + node.succeed( + f"systemd-run --no-block --unit=lightscale-agent-{profile} " + "--service-type=simple " + 
"--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit(f"lightscale-agent-{profile}.service") + node.wait_until_succeeds(f"ip link show ls-{profile}", timeout=60) + + start_agent(node2, "neta", "/tmp/ls-state-a", 51820, [ + "--endpoint 10.0.0.2:51820", + "--advertise-route 192.168.100.0/24", + "--advertise-map 192.168.100.0/24=10.250.0.0/24", + ]) + start_agent(node3, "netb", "/tmp/ls-state-b", 51820, [ + "--endpoint 10.0.0.3:51820", + "--advertise-route 192.168.100.0/24", + "--advertise-map 192.168.100.0/24=10.251.0.0/24", + ]) + start_agent(node4, "neta", "/tmp/ls-state-a", 51820, [ + "--endpoint 10.0.0.4:51820", + "--apply-routes", + ]) + start_agent(node4, "netb", "/tmp/ls-state-b", 51821, [ + "--endpoint 10.0.0.4:51821", + "--apply-routes", + ]) + + node2.succeed( + "lightscale-client --profile neta --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-a router enable --interface ls-neta " + "--out-interface eth2 --no-snat " + "--map 192.168.100.0/24=10.250.0.0/24" + ) + node3.succeed( + "lightscale-client --profile netb --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state-b router enable --interface ls-netb " + "--out-interface eth2 --no-snat " + "--map 192.168.100.0/24=10.251.0.0/24" + ) + + node4.wait_until_succeeds("ping -c 3 10.250.0.10", timeout=180) + node4.wait_until_succeeds("ping -c 3 10.251.0.10", timeout=180) + ''; +} diff --git a/lab/test-nat.nix b/lab/test-nat.nix new file mode 100644 index 0000000..22387b3 --- /dev/null +++ b/lab/test-nat.nix @@ -0,0 +1,163 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-nat"; + nodes = { + node1 = { ... 
}: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 2 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.interfaces.eth2.useDHCP = false; + networking.interfaces.eth2.ipv4.addresses = [ + { address = "192.168.50.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + pkgs.nftables + ]; + }; + node3 = { ... 
}: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth2 | grep -q '192.168.50.1/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + + node2.succeed("ip addr add 192.168.50.10/24 dev eth2 || true") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"nat\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"nat\"]}'" + )) 
+ token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip): + node.succeed( + "lightscale-client --profile nat --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile nat --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile nat --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + enroll(node2, "router", "10.0.0.2") + enroll(node3, "client", "10.0.0.3") + + def start_agent(node, extra_args): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile nat --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + "--endpoint-stale-after 5 --endpoint-max-rotations 1" + ) + for arg in extra_args: + cmd += f" {arg}" + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-nat", timeout=60) + + start_agent(node2, [ + "--endpoint 10.0.0.2:51820", + "--advertise-route 192.168.50.0/24", + "--advertise-map 192.168.50.0/24=10.250.0.0/24", + ]) + start_agent(node3, [ + "--endpoint 10.0.0.3:51820", + "--apply-routes", + ]) + + node2.wait_until_succeeds("ip link show ls-nat", timeout=60) + node2.succeed( + "lightscale-client --profile nat --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state router enable --interface ls-nat " + "--out-interface eth2 --no-snat " + "--map 192.168.50.0/24=10.250.0.0/24" + ) + + node3.wait_until_succeeds("ip route get 10.250.0.10 | grep -q 'ls-nat'", timeout=120) 
+    node3.wait_until_succeeds("ping -c 3 10.250.0.10", timeout=180)
+  '';
+}
diff --git a/lab/test-negative.nix b/lab/test-negative.nix
new file mode 100644
index 0000000..0a25390
--- /dev/null
+++ b/lab/test-negative.nix
@@ -0,0 +1,160 @@
+# NixOS test for enrollment failure paths: invalid, expired and revoked
+# tokens must be rejected, and a node on an approval-required network stays
+# unapproved until an admin approves it. node1 runs the server and the admin
+# client; node2 is the enrolling client.
+{ pkgs, serverPkg, clientPkg }:
+{
+  name = "lightscale-lab-negative";
+  nodes = {
+    node1 = { ... }: {
+      networking.hostName = "node1";
+      networking.usePredictableInterfaceNames = false;
+      virtualisation.vlans = [ 1 ];
+      networking.interfaces.eth1.useDHCP = false;
+      networking.interfaces.eth1.ipv4.addresses = [
+        { address = "10.0.0.1"; prefixLength = 24; }
+      ];
+      networking.firewall.enable = false;
+      boot.kernelModules = [ "wireguard" ];
+      environment.systemPackages = [
+        serverPkg
+        clientPkg
+        pkgs.wireguard-tools
+        pkgs.iproute2
+        pkgs.iputils
+        pkgs.netcat-openbsd
+        pkgs.curl
+      ];
+    };
+    node2 = { ... }: {
+      networking.hostName = "node2";
+      networking.usePredictableInterfaceNames = false;
+      virtualisation.vlans = [ 1 ];
+      networking.interfaces.eth1.useDHCP = false;
+      networking.interfaces.eth1.ipv4.addresses = [
+        { address = "10.0.0.2"; prefixLength = 24; }
+      ];
+      networking.firewall.enable = false;
+      boot.kernelModules = [ "wireguard" ];
+      environment.systemPackages = [
+        clientPkg
+        pkgs.wireguard-tools
+        pkgs.iproute2
+        pkgs.iputils
+        pkgs.netcat-openbsd
+        pkgs.curl
+      ];
+    };
+  };
+
+  testScript = ''
+    start_all()
+    node1.wait_for_unit("multi-user.target")
+    node2.wait_for_unit("multi-user.target")
+    node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'")
+    node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'")
+
+    # Run the control server as a transient systemd unit and mirror its log to
+    # the console.
+    node1.succeed("touch /tmp/lightscale-server.log")
+    node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'")
+    node1.succeed(
+        "systemd-run --no-block --unit=lightscale-server --service-type=simple "
+        "--property=Restart=on-failure --property=RestartSec=1 "
+        "--property=TimeoutStartSec=30 "
+        "--property=StandardOutput=append:/tmp/lightscale-server.log "
+        "--property=StandardError=append:/tmp/lightscale-server.log "
+        "--setenv=RUST_LOG=info -- "
+        "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json"
+    )
+    node1.wait_for_unit("lightscale-server.service")
+    node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120)
+
+    import json
+    import time
+
+    # Create a network with requires_approval set, plus a bootstrap token.
+    net = json.loads(node1.succeed(
+        "curl -sSf -X POST http://10.0.0.1:8080/v1/networks "
+        "-H 'content-type: application/json' "
+        "-d '{\"name\":\"lab\",\"requires_approval\":true," \
+        "\"bootstrap_token_ttl_seconds\":600," \
+        "\"bootstrap_token_uses\":5,\"bootstrap_token_tags\":[\"lab\"]}'"
+    ))
+    network_id = net["network"]["id"]
+    bootstrap_token = net["bootstrap_token"]["token"]
+
+    node1.succeed(
+        "lightscale-client --profile admin --config /tmp/ls-config.json "
+        "init http://10.0.0.1:8080"
+    )
+    node2.succeed(
+        "lightscale-client --profile test --config /tmp/ls-config.json "
+        "init http://10.0.0.1:8080"
+    )
+
+    # Run an admin-profile client command on node1 and return its output.
+    def admin_cmd(cmd):
+        return node1.succeed(
+            "lightscale-client --profile admin --config /tmp/ls-config.json " + cmd
+        )
+
+    # Mint an enrollment token and return the value from the "token:" line of
+    # the CLI output; raises if no such line is printed.
+    def create_token(ttl_seconds, uses):
+        out = admin_cmd(
+            f"admin token create {network_id} --ttl-seconds {ttl_seconds} --uses {uses}"
+        )
+        for line in out.splitlines():
+            if line.startswith("token:"):
+                return line.split(":", 1)[1].strip()
+        raise Exception("token not found in output")
+
+    # Invalid token should fail.
+    node2.fail(
+        "lightscale-client --profile test --config /tmp/ls-config.json "
+        "--state-dir /tmp/ls-state-invalid register --node-name bad -- not-a-token"
+    )
+
+    # Expired token should fail: mint a 1-second token and wait past its TTL.
+    expired_token = create_token(1, 1)
+    time.sleep(2)
+    node2.fail(
+        "lightscale-client --profile test --config /tmp/ls-config.json "
+        "--state-dir /tmp/ls-state-expired register --node-name expired -- "
+        + expired_token
+    )
+
+    # Revoked token should fail.
+    revoked_token = create_token(600, 1)
+    admin_cmd(f"admin token revoke {revoked_token}")
+    node2.fail(
+        "lightscale-client --profile test --config /tmp/ls-config.json "
+        "--state-dir /tmp/ls-state-revoked register --node-name revoked -- "
+        + revoked_token
+    )
+
+    # Approval-required flow: registration succeeds but the node reports
+    # "approved: false" until the admin approves it.
+    node2.succeed(
+        "lightscale-client --profile test --config /tmp/ls-config.json "
+        "--state-dir /tmp/ls-state-pending register --node-name pending -- "
+        + bootstrap_token
+    )
+    # Pass --config like every other invocation: profile "test" was
+    # initialised in /tmp/ls-config.json, so do not rely on a default path.
+    node2.succeed(
+        "lightscale-client --profile test --config /tmp/ls-config.json "
+        "--state-dir /tmp/ls-state-pending status | grep -q 'approved: false'"
+    )
+
+    data = json.loads(node2.succeed("cat /tmp/ls-state-pending/state.json"))
+    node_id = data["node_id"]
+
+    admin_cmd(f"admin approve {node_id}")
+    node2.succeed(
+        "lightscale-client --profile test --config /tmp/ls-config.json "
+        "--state-dir /tmp/ls-state-pending netmap | grep -q 'approved: true'"
+    )
+  '';
+}
diff --git a/lab/test-relay-failover.nix b/lab/test-relay-failover.nix
new file mode 100644
index 0000000..1867d21
--- /dev/null
+++ b/lab/test-relay-failover.nix
@@ -0,0 +1,204 @@
+{ pkgs, serverPkg, clientPkg }:
+{
+  name = "lightscale-lab-relay-failover";
+  nodes = {
+    node1 = { ... }: {
+      networking.hostName = "node1";
+      networking.usePredictableInterfaceNames = false;
+      virtualisation.vlans = [ 1 ];
+      networking.interfaces.eth1.useDHCP = false;
+      networking.interfaces.eth1.ipv4.addresses = [
+        { address = "10.0.0.1"; prefixLength = 24; }
+      ];
+      networking.firewall.enable = false;
+      boot.kernelModules = [ "wireguard" ];
+      environment.systemPackages = [
+        serverPkg
+        clientPkg
+        pkgs.wireguard-tools
+        pkgs.iproute2
+        pkgs.iputils
+        pkgs.netcat-openbsd
+        pkgs.curl
+      ];
+    };
+    node2 = { ...
}: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node3 = { ... }: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node4 = { ... }: { + networking.hostName = "node4"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.4"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node5 = { ... 
}: { + networking.hostName = "node5"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.5"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node4.wait_for_unit("multi-user.target") + node5.wait_for_unit("multi-user.target") + + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + node4.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.4/24'") + node5.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.5/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json " + "--stream-relay 10.0.0.2:8443,10.0.0.3:8443" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + node3.succeed("touch /tmp/lightscale-relay.log") + node3.execute("sh -c 'tail -n +1 -f /tmp/lightscale-relay.log 
>/dev/console 2>&1 &'") + node3.succeed( + "systemd-run --no-block --unit=lightscale-relay --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-relay.log " + "--property=StandardError=append:/tmp/lightscale-relay.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.3:8082 --state /tmp/lightscale-relay.json " + "--stream-relay-listen 10.0.0.3:8443" + ) + node3.wait_for_unit("lightscale-relay.service") + node3.wait_for_open_port(8443, addr="10.0.0.3", timeout=120) + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip, state_dir): + node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} heartbeat --endpoint {ip}:51820" + ) + + def start_agent(node, state_dir, endpoint): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + f"--endpoint {endpoint} --stream-relay" + ) + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + 
"--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + enroll(node4, "node4", "10.0.0.4", "/tmp/ls-state-4") + enroll(node5, "node5", "10.0.0.5", "/tmp/ls-state-5") + start_agent(node4, "/tmp/ls-state-4", "203.0.113.4:51820") + start_agent(node5, "/tmp/ls-state-5", "203.0.113.5:51820") + + data4 = json.loads(node4.succeed("cat /tmp/ls-state-4/state.json")) + data5 = json.loads(node5.succeed("cat /tmp/ls-state-5/state.json")) + ip4 = data4["ipv4"] + ip5 = data5["ipv4"] + + node4.wait_until_succeeds(f"ping -c 3 {ip5}", timeout=180) + node5.wait_until_succeeds(f"ping -c 3 {ip4}", timeout=180) + + node4.wait_until_succeeds("grep -q 'connected to 10.0.0.3:8443' /tmp/lightscale-agent.log", timeout=60) + node5.wait_until_succeeds("grep -q 'connected to 10.0.0.3:8443' /tmp/lightscale-agent.log", timeout=60) + ''; +} diff --git a/lab/test-restart.nix b/lab/test-restart.nix new file mode 100644 index 0000000..0d72f6f --- /dev/null +++ b/lab/test-restart.nix @@ -0,0 +1,154 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-restart"; + nodes = { + node1 = { ... }: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... 
}: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node3 = { ... }: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json" + ) + 
node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + + import json + import time + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":5,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip, state_dir): + node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} heartbeat --endpoint {ip}:51820" + ) + + enroll(node2, "node2", "10.0.0.2", "/tmp/ls-state-2") + enroll(node3, "node3", "10.0.0.3", "/tmp/ls-state-3") + + def start_agent(node, ip, state_dir): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir {state_dir} agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + "--backend boringtun " + f"--endpoint {ip}:51820" + ) + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + start_agent(node2, "10.0.0.2", "/tmp/ls-state-2") + start_agent(node3, "10.0.0.3", "/tmp/ls-state-3") + + data2 = json.loads(node2.succeed("cat /tmp/ls-state-2/state.json")) + data3 = 
json.loads(node3.succeed("cat /tmp/ls-state-3/state.json")) + ip2 = data2["ipv4"] + ip3 = data3["ipv4"] + + node2.wait_until_succeeds(f"ping -c 3 {ip3}", timeout=120) + node3.wait_until_succeeds(f"ping -c 3 {ip2}", timeout=120) + + node2.succeed("systemctl stop lightscale-agent.service") + time.sleep(2) + node3.fail(f"ping -c 3 {ip2}") + + start_agent(node2, "10.0.0.2", "/tmp/ls-state-2") + node3.wait_until_succeeds(f"ping -c 3 {ip2}", timeout=120) + ''; +} diff --git a/lab/test-scale.nix b/lab/test-scale.nix new file mode 100644 index 0000000..cb42d83 --- /dev/null +++ b/lab/test-scale.nix @@ -0,0 +1,269 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-scale"; + nodes = { + node1 = { ... }: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node3 = { ... 
}: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node4 = { ... }: { + networking.hostName = "node4"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.4"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node5 = { ... }: { + networking.hostName = "node5"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.5"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node6 = { ... 
}: { + networking.hostName = "node6"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.6"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node7 = { ... }: { + networking.hostName = "node7"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.7"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node8 = { ... 
}: { + networking.hostName = "node8"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.8"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + + nodes = [node1, node2, node3, node4, node5, node6, node7, node8] + for idx, node in enumerate(nodes, start=1): + node.wait_for_unit("multi-user.target") + node.wait_until_succeeds( + f"ip -4 addr show dev eth1 | grep -q '10.0.0.{idx}/24'" + ) + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json " + "--stream-relay 10.0.0.1:8443 --stream-relay-listen 10.0.0.1:8443" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + node1.wait_for_open_port(8443, addr="10.0.0.1", timeout=120) + node1.succeed("tail -n 50 /tmp/lightscale-server.log || true") + + import json + import time + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":20,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = 
net["bootstrap_token"]["token"] + + def enroll(node, name, ip): + node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + for idx, node in enumerate(nodes, start=1): + enroll(node, f"node{idx}", f"10.0.0.{idx}") + + def start_agent(node, endpoints, stream_relay): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5" + ) + for endpoint in endpoints: + cmd += f" --endpoint {endpoint}" + if stream_relay: + cmd += " --stream-relay" + node.succeed( + "systemd-run --no-block --unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + for idx, node in enumerate(nodes, start=1): + start_agent( + node, + [f"203.0.113.{idx}:51820", f"10.0.0.{idx}:51820"], + False, + ) + + time.sleep(1) + + def overlay_ipv4(node): + data = json.loads(node.succeed("cat /tmp/ls-state/state.json")) + return data["ipv4"] + + ips = [overlay_ipv4(node) for node in nodes] + + def full_mesh_ping(nodes, ips): + for i, src in enumerate(nodes): + for j, dst in enumerate(nodes): + if i == j: + continue + src.wait_until_succeeds(f"ping -c 3 {ips[j]}", timeout=120) + + full_mesh_ping(nodes, ips) + + for node in nodes: + 
node.succeed("systemctl stop lightscale-agent.service") + + for idx, node in enumerate(nodes, start=1): + start_agent(node, [f"203.0.113.{idx}:51820"], True) + + full_mesh_ping(nodes, ips) + ''; +} diff --git a/lab/test-userspace.nix b/lab/test-userspace.nix new file mode 100644 index 0000000..993de0a --- /dev/null +++ b/lab/test-userspace.nix @@ -0,0 +1,144 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab-userspace"; + nodes = { + node1 = { ... }: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure 
--property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json " + "--stream-relay 10.0.0.1:8443 --stream-relay-listen 10.0.0.1:8443" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + node1.wait_for_open_port(8443, addr="10.0.0.1", timeout=120) + + import json + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"userspace\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"userspace\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip): + node.succeed( + "lightscale-client --profile userspace --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile userspace --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile userspace --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + enroll(node1, "node1", "10.0.0.1") + enroll(node2, "node2", "10.0.0.2") + + def start_agent(node, endpoints, stream_relay): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile userspace --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5 " + "--backend boringtun --endpoint-stale-after 5 --endpoint-max-rotations 1" + ) + for endpoint in endpoints: + cmd += f" --endpoint {endpoint}" + if stream_relay: + cmd += " --stream-relay" + node.succeed( + "systemd-run --no-block --unit=lightscale-agent 
--service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-userspace", timeout=60) + + # Phase 1: userspace endpoint rotation (invalid -> valid) + start_agent(node1, ["203.0.113.1:51820", "10.0.0.1:51820"], False) + start_agent(node2, ["203.0.113.2:51820", "10.0.0.2:51820"], False) + + data1 = json.loads(node1.succeed("cat /tmp/ls-state/state.json")) + data2 = json.loads(node2.succeed("cat /tmp/ls-state/state.json")) + nodes = [node1, node2] + ips = [data1["ipv4"], data2["ipv4"]] + + def full_mesh_ping(nodes, ips): + for i, src in enumerate(nodes): + for j, dst in enumerate(nodes): + if i == j: + continue + src.wait_until_succeeds(f"ping -c 3 {ips[j]}", timeout=120) + + full_mesh_ping(nodes, ips) + + # Phase 2: userspace stream relay fallback + node1.succeed("systemctl stop lightscale-agent.service") + node2.succeed("systemctl stop lightscale-agent.service") + start_agent(node1, ["203.0.113.1:51820"], True) + start_agent(node2, ["203.0.113.2:51820"], True) + + full_mesh_ping(nodes, ips) + ''; +} diff --git a/lab/test.nix b/lab/test.nix new file mode 100644 index 0000000..a798ac5 --- /dev/null +++ b/lab/test.nix @@ -0,0 +1,230 @@ +{ pkgs, serverPkg, clientPkg }: +{ + name = "lightscale-lab"; + nodes = { + node1 = { ... 
}: { + networking.hostName = "node1"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.1"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + serverPkg + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node2 = { ... }: { + networking.hostName = "node2"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.2"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node3 = { ... }: { + networking.hostName = "node3"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.3"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node4 = { ... 
}: { + networking.hostName = "node4"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.4"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + node5 = { ... }: { + networking.hostName = "node5"; + networking.usePredictableInterfaceNames = false; + virtualisation.vlans = [ 1 ]; + networking.interfaces.eth1.useDHCP = false; + networking.interfaces.eth1.ipv4.addresses = [ + { address = "10.0.0.5"; prefixLength = 24; } + ]; + networking.firewall.enable = false; + boot.kernelModules = [ "wireguard" ]; + environment.systemPackages = [ + clientPkg + pkgs.wireguard-tools + pkgs.iproute2 + pkgs.iputils + pkgs.netcat-openbsd + pkgs.curl + ]; + }; + }; + + testScript = '' + start_all() + node1.wait_for_unit("multi-user.target") + node2.wait_for_unit("multi-user.target") + node3.wait_for_unit("multi-user.target") + node4.wait_for_unit("multi-user.target") + node5.wait_for_unit("multi-user.target") + + node1.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.1/24'") + node2.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.2/24'") + node3.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.3/24'") + node4.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.4/24'") + node5.wait_until_succeeds("ip -4 addr show dev eth1 | grep -q '10.0.0.5/24'") + + node1.succeed("touch /tmp/lightscale-server.log") + node1.execute("sh -c 'tail -n +1 -f /tmp/lightscale-server.log >/dev/console 2>&1 &'") + node1.succeed( + "systemd-run --no-block --unit=lightscale-server --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + 
"--property=StandardOutput=append:/tmp/lightscale-server.log " + "--property=StandardError=append:/tmp/lightscale-server.log " + "--setenv=RUST_LOG=info -- " + "lightscale-server --listen 10.0.0.1:8080 --state /tmp/lightscale-state.json " + "--stream-relay 10.0.0.1:8443 --stream-relay-listen 10.0.0.1:8443" + ) + node1.wait_for_unit("lightscale-server.service") + node1.wait_for_open_port(8080, addr="10.0.0.1", timeout=120) + node1.wait_for_open_port(8443, addr="10.0.0.1", timeout=120) + node1.succeed("tail -n 50 /tmp/lightscale-server.log || true") + + import json + import time + + net = json.loads(node1.succeed( + "curl -sSf -X POST http://10.0.0.1:8080/v1/networks " + "-H 'content-type: application/json' " + "-d '{\"name\":\"lab\",\"bootstrap_token_ttl_seconds\":600," \ + "\"bootstrap_token_uses\":10,\"bootstrap_token_tags\":[\"lab\"]}'" + )) + token = net["bootstrap_token"]["token"] + + def enroll(node, name, ip): + node.succeed( + "lightscale-client --profile test --config /tmp/ls-config.json " + "init http://10.0.0.1:8080" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state register --node-name {name} -- {token}" + ) + node.succeed( + f"lightscale-client --profile test --config /tmp/ls-config.json " + f"--state-dir /tmp/ls-state heartbeat --endpoint {ip}:51820" + ) + + enroll(node1, "node1", "10.0.0.1") + enroll(node2, "node2", "10.0.0.2") + enroll(node3, "node3", "10.0.0.3") + enroll(node4, "node4", "10.0.0.4") + enroll(node5, "node5", "10.0.0.5") + + def start_agent(node, endpoints, stream_relay): + node.succeed("touch /tmp/lightscale-agent.log") + cmd = ( + "lightscale-client --profile test --config /tmp/ls-config.json " + "--state-dir /tmp/ls-state agent --listen-port 51820 " + "--heartbeat-interval 5 --longpoll-timeout 5" + ) + for endpoint in endpoints: + cmd += f" --endpoint {endpoint}" + if stream_relay: + cmd += " --stream-relay" + node.succeed( + "systemd-run --no-block 
--unit=lightscale-agent --service-type=simple " + "--property=Restart=on-failure --property=RestartSec=1 " + "--property=TimeoutStartSec=30 " + "--property=StandardOutput=append:/tmp/lightscale-agent.log " + "--property=StandardError=append:/tmp/lightscale-agent.log -- " + + cmd + ) + node.wait_for_unit("lightscale-agent.service") + node.wait_until_succeeds("ip link show ls-test", timeout=60) + + # Phase 1: direct endpoint rotation (invalid -> valid) + start_agent(node1, ["203.0.113.1:51820", "10.0.0.1:51820"], False) + start_agent(node2, ["203.0.113.2:51820", "10.0.0.2:51820"], False) + start_agent(node3, ["203.0.113.3:51820", "10.0.0.3:51820"], False) + start_agent(node4, ["203.0.113.4:51820", "10.0.0.4:51820"], False) + start_agent(node5, ["203.0.113.5:51820", "10.0.0.5:51820"], False) + + time.sleep(1) + + def overlay_ipv4(node): + data = json.loads(node.succeed("cat /tmp/ls-state/state.json")) + return data["ipv4"] + + ip1 = overlay_ipv4(node1) + ip2 = overlay_ipv4(node2) + ip3 = overlay_ipv4(node3) + ip4 = overlay_ipv4(node4) + ip5 = overlay_ipv4(node5) + + nodes = [node1, node2, node3, node4, node5] + ips = [ip1, ip2, ip3, ip4, ip5] + + def full_mesh_ping(nodes, ips): + for i, src in enumerate(nodes): + for j, dst in enumerate(nodes): + if i == j: + continue + src.wait_until_succeeds(f"ping -c 3 {ips[j]}", timeout=120) + + full_mesh_ping(nodes, ips) + + # Phase 2: stream relay fallback (invalid only) + for node in (node1, node2, node3, node4, node5): + node.succeed("systemctl stop lightscale-agent.service") + + start_agent(node1, ["203.0.113.1:51820"], True) + start_agent(node2, ["203.0.113.2:51820"], True) + start_agent(node3, ["203.0.113.3:51820"], True) + start_agent(node4, ["203.0.113.4:51820"], True) + start_agent(node5, ["203.0.113.5:51820"], True) + + full_mesh_ping(nodes, ips) + ''; +} diff --git a/lightscale-client b/lightscale-client new file mode 160000 index 0000000..9a5d8ca --- /dev/null +++ b/lightscale-client @@ -0,0 +1 @@ +Subproject commit 
9a5d8ca8ba540f856ea87484742b22f6d4819d3c diff --git a/lightscale-server b/lightscale-server new file mode 160000 index 0000000..b4c72b4 --- /dev/null +++ b/lightscale-server @@ -0,0 +1 @@ +Subproject commit b4c72b4a11904b70646224d244834a5b4309363e diff --git a/soudan.md b/soudan.md new file mode 100644 index 0000000..5dbe43a --- /dev/null +++ b/soudan.md @@ -0,0 +1,613 @@ +# VPNセルフホスト設計相談 + +**User:** Soma Nakamura (namamurasub@gmail.com) +**Created:** 1/19/2026 21:48:40 +**Updated:** 1/20/2026 0:41:25 +**Exported:** 1/20/2026 0:51:17 +**Link:** [https://chatgpt.com/c/696e2828-1440-8330-b0b7-14ed22c5f05c](https://chatgpt.com/c/696e2828-1440-8330-b0b7-14ed22c5f05c) + +## Prompt: +簡単にセルフホスト可能なVPNソフトウェアを作るに当たり、以下のようなものを考えている。 +1. 普通のクライアント同士は、なるべく直接p2p接続を行う。 +2. グローバルIPがある場所に「サーバー」を置く。そいつが設定の管理や鍵の管理、TURNっぽいもの(NAT配下から抜けるために)やTURNS(ファイヤウォール突破のため。この場合のみこのサーバーがリレーを行う?)をやったりする。NAT46みたいなことができるようになれば、サーバーはv6グローバルだけあれば良くなるので、更に置ける場所が広がる。また、これをexit nodeにするような設定もできると良い。あと、このサーバー上の設定として、各クライアントに名前をつけてDNS広報などもできるようになると良い。 +こういうものを作るにあたり、どういうふうにしたらよいか以下の質問に答えてほしい。 +1. 鍵の管理などはtincみたいに分散させるべきか、集中していたほうが(グローバルがある場所にあったほうが)良いか。 +2. 新しいクライアントの参加時には、どのような手法で行うべきか +3. TURNとか、様々なNAT越えの手法があるが、どれを使うとよいか(どんな環境においてもHTTPSさえ通れば絶対に疎通できることが望ましい。) +4. 各クライアントがp2p接続する際のプロトコルはwireguardでよいか?適当にやると(フルメッシュにしたいわけで)無数のwireguardクライアントが生えて困りそうだが、多分カスタムのインターフェースを作ってそこに集約するのか?
+ +## Response: +Thought for 11m 26s + +あなたが考えている構成は、方向性としてはかなり「現代的なMesh VPN(= Control planeは集中、Data planeはP2P、無理ならHTTPSリレー)」で、実装方針も立てやすいです。結論から言うと、 + +* **鍵“そのもの”(秘密鍵)は各クライアントに保持させる(分散)** +* ただし **鍵配布・ACL・名前/DNS・到達性(NAT越えの候補交換)といった“調停”は中央サーバ(集中)** +* **NAT越えは ICE/STUN/TURN の考え方を借り、最終フォールバックは 443/TLS 上のリレー(DERP的)** +* **P2Pの暗号化・仮想L3は WireGuard が最適。ただし「WireGuard単体」ではなく、それを動的にオーケストレーションする常駐デーモンが要る** + +…という設計が、要件(セルフホスト容易・P2P優先・HTTPSさえ通れば極力なんとかする)に一番素直です。Tailscale/Headscale系がまさにこの分離で動いていて、「Control planeは鍵(公開鍵)とポリシーを配るだけ、Data planeはWireGuardのメッシュ」という整理が定番になっています。 [Tailscale+2GitHub+2](https://tailscale.com/blog/how-tailscale-works?utm_source=chatgpt.com) + +以下、質問に順に答えます。 + +* * * + +1\. 鍵の管理は分散(tinc的)か集中か +---------------------- + +### まず「何の鍵をどこで管理するか」を分解した方がよい + +Mesh VPNでは鍵が最低でも2種類あります。 + +1. **データプレーン鍵**(実際にトラフィックを暗号化する鍵。WireGuardなら各ノードの秘密鍵) +2. **コントロールプレーンの認証・承認**(誰がネットワークに参加できて、誰と通信できるか、という権限) + +このうち、 + +* \*\*(1) データプレーンの秘密鍵は“絶対にクライアントから出さない”\*\*のがベストです。 + そうすると中央サーバが侵害されても「盗聴」まではされにくくなります(ただし後述の“鍵すり替え”リスクは残る)。Tailscaleも「公開鍵だけ交換し、秘密鍵は端末から出ない」と明言しています。 [Tailscale+1](https://tailscale.com/kb/1093/can-tailscale-decrypt-my-traffic?utm_source=chatgpt.com) +* \*\*(2) 参加管理/ACL/DNS名/経路広告/到達性(候補交換)\*\*は、\*\*集中サーバ(coordination server)\*\*に置く方が「簡単にセルフホスト」要件に合います。 [Tailscale+1](https://tailscale.com/blog/how-tailscale-works?utm_source=chatgpt.com) + +### 「分散(tinc的)」のメリット/デメリット(あなたの要件に照らす) + +**分散のメリット** + +* 中央が落ちても(理想的には)参加/鍵配布が継続できる +* 中央を完全に信頼しない設計に寄せられる + +**分散の現実的デメリット** + +* そもそも **NAT越えの“候補交換/到達性観測”のために結局ランデブーが要る**(完全分散は難しい) +* オンボーディングが複雑化(誰が新規ノードを承認するのか) +* 「HTTPSさえ通れば」の最終フォールバックに **443リレーが必要** → 結局“中央サービス”が必要 + +### 推奨(結論) + +* \*\*鍵管理は「秘密鍵は分散(端末保持)」「公開鍵とポリシー配布は集中(サーバ)」\*\*が最適解です。 [Tailscale+1](https://tailscale.com/kb/1093/can-tailscale-decrypt-my-traffic?utm_source=chatgpt.com) +* さらに“中央を信頼しない”を少しでもやるなら、オプションで + **(a) 管理者署名によるノード鍵承認**、または **(b) 透明性ログ(Key Transparency的)** を入れると「サーバ侵害→鍵すり替えMITM」耐性が上がります(実装コストは上がります)。 + +* * * + +2\. 
新しいクライアント参加はどう設計すべきか +------------------------ + +要件が「簡単にセルフホスト」なので、**現実的に安全で運用しやすいブートストラップ**に寄せるのが良いです。 + +### 推奨フロー:短命の招待トークン(Enrollment / Auth Key)方式 + +1. 管理者がサーバで **招待トークン**を発行 + * TTL(例: 10分〜24時間) + * 使い捨て or N回まで + * 付与するタグ/グループ(例: `dev`, `admin`, `no-exit`) +2. クライアントは初回起動で + * \*\*デバイス識別鍵(例: Ed25519)\*\*を生成(長期) + * \*\*WireGuard鍵(X25519)\*\*を生成(ローテーション可能) +3. クライアントは `https://control.example.com` に接続し、招待トークンを提示して登録 +4. サーバは + * ノードID発行 + * VPN内IP(v4 /32 と v6 /128)割当 + * ACL適用 + * “通信してよい相手の公開鍵一覧 + ルート + DNS情報” を配布(以後は差分Push) + +この「中央は公開鍵と設定を配るだけ」の形は、Headscaleの説明(WireGuard公開鍵の交換点、IP割当、ルート広告)とも一致します。 [GitHub](https://github.com/juanfont/headscale?utm_source=chatgpt.com) + +### 追加で入れると強い安全策 + +* **承認待ちモード**:トークンで「仮登録」→ 管理者がUI/CLIで承認したら有効化 +* **トークンのスコープ**: + * 「このトークンで参加した端末は exit node になれない」 + * 「このトークンは特定ユーザ/特定タグにのみ」 +* **サーバ真正性**:TLS(Let’s Encrypt等)+ 可能なら **初回にサーバ公開鍵ピン留め(TOFUでも可)** + “社内Proxyで証明書差し替え”環境もあるので、その場合の扱い(明示的にCAを追加する等)は設計で決めておく。 + +* * * + +3\. TURNやNAT越えはどれを使うべきか(HTTPSさえ通れば絶対が理想) +---------------------------------------- + +### 結論:考え方は ICE(STUN+TURN)を採用、実装は「段階的フォールバック」が現実的 + +* \*\*ICE(RFC 8445)\*\*は「NAT越えのために候補(アドレス/経路)を集め、疎通チェックして最適経路を選ぶ」枠組みです。ICEはSTUNとTURNを利用する、とRFC自体が書いています。 [IETF Datatracker+2IETF Datatracker+2](https://datatracker.ietf.org/doc/html/rfc8445?utm_source=chatgpt.com) +* STUN(RFC 8489)は、NAT越しに「外から見える自分のIP:port」を知ったり、NATバインディング維持のkeepaliveにも使えます。 [IETF Datatracker+1](https://datatracker.ietf.org/doc/html/rfc8489?utm_source=chatgpt.com) +* TURN(RFC 5766 / 更新版RFC 8656)は、どうしても直接が無理なときに \*\*中継(リレー)\*\*を提供します。 [RFC Editor+1](https://www.rfc-editor.org/rfc/rfc5766.html?utm_source=chatgpt.com) + +### ただし「VPN用途」だと TURNフル実装が重い + +TURNは汎用中継として標準ですが、**VPNの“データプレーン”のためにTURNサーバ/クライアントを完全実装**するのは工数が増えがちです。 +この場合、現実解としては2つあります: + +#### A) 標準準拠でいく:STUN + TURN(必要ならTCP拡張) + +* UDPが通る: STUNで外部アドレス取得 → UDP hole punching +* UDPが塞がれる: TURN over TCP/TLS を検討 + TURNのTCP割当拡張はRFC 6062が定義しています。 [IETF 
Datatracker+1](https://datatracker.ietf.org/doc/html/rfc6062?utm_source=chatgpt.com) + +#### B) “HTTPSさえ通れば”を最優先:DERP的な「443/TLSリレー」を自作(おすすめ) + +「HTTPS(443/TCP)が通ればとにかく繋ぐ」を狙うなら、TURNより \*\*DERP(TCP/TLSで暗号化済みパケットを盲目的に転送するリレー)\*\*の方が設計が単純です。 + +* Tailscaleは「DERPはTLS上(多くは443)で動くTCPリレーで、直接繋がらないときに暗号化済みパケットを転送する」と説明しています。 [Tailscale+2Tailscale+2](https://tailscale.com/blog/nat-traversal-improvements-pt3-looking-ahead?utm_source=chatgpt.com) +* そしてファイアウォール観点でも「制御サーバ/DERPリレーはHTTPSで443を使う」とドキュメントにあります。 [Tailscale](https://tailscale.com/kb/1082/firewall-ports?utm_source=chatgpt.com) +* HeadscaleのDERP解説も、DERP relayが **HTTPS tcp/443**、STUNが **udp/3478** という“最低限ポート”の切り方になっています。 [headscale.net](https://headscale.net/stable/ref/derp/?utm_source=chatgpt.com) + +あなたの要件に寄せた “段階的フォールバック” のおすすめ順はこうです: + +1. **IPv6直**(両者がグローバルv6なら最強。NAT越え問題が激減) +2. **UDP直(v4/v6)** + * STUNで各自の外部アドレスを把握(server-reflexive candidate) + * rendezvous(あなたの中央サーバ)で候補交換 + * 同時送信でhole punching(WireGuardのハンドシェイク自体を“疎通パケット”にしても良い) +3. **(任意)UPnP/PCP/NAT-PMP**でポートマッピング要求(家ルータでは効くことがあるが期待しすぎない) +4. **最後の砦:443/TLSリレー(DERP的)** + * WebSocket/HTTP2ストリーミングなど“HTTPSっぽい見え方”にして通りやすくする + +### 「HTTPSさえ通れば絶対」についての注意(設計上の現実) + +“絶対”は難しいです。理由は、ネットワークによっては + +* HTTPプロキシ必須(CONNECTが要る、認証が要る) +* TLSインスペクション(証明書差し替え) +* SNI/宛先制限(特定ドメイン以外禁止) +* キャプティブポータル + などがあるためです。 + +ただし、\*\*「443/TCPで外向きTLSが通る」\*\*という前提が満たされる範囲では、DERP的リレーは非常に強いアプローチです。 [Tailscale+1](https://tailscale.com/blog/nat-traversal-improvements-pt3-looking-ahead?utm_source=chatgpt.com) + +* * * + +4\. P2PのプロトコルはWireGuardでよいか?フルメッシュで無数に増えないか? 
+-------------------------------------------- + +### 結論:WireGuardで良い。増えるのは「インターフェース」ではなく「peerエントリ」 + +WireGuardは「インターフェース(例: wg0)1つ」に対して、複数peerをぶら下げるモデルです。 +WireGuard公式のQuick Startでもインターフェースを作ってIPを付ける流れが説明されています。 [WireGuard](https://www.wireguard.com/quickstart/?utm_source=chatgpt.com) +またWireGuardはNoiseフレームワーク等の現代的暗号を使う、という説明とホワイトペーパーもあります。 [WireGuard+1](https://www.wireguard.com/?utm_source=chatgpt.com) + +### フルメッシュでの基本設計(典型) + +* 各ノードに **VPN内アドレスを1つだけ**割り当てる + * IPv4: /32(例: 100.64.0.10/32) + * IPv6: /128(例: fd7a:115c:a1e0::10/128) +* 各peerのAllowedIPsは「そのノードの/32(/128)だけ」にする + → **AllowedIPsが重ならない**のでルーティングが破綻しない +* WireGuardのAllowedIPsは「送信時の宛先IPからpeerを選ぶ/受信時のソースIPをフィルタする」用途を兼ねる、という説明がホワイトペーパーにあります。 [WireGuard](https://www.wireguard.com/papers/wireguard.pdf?utm_source=chatgpt.com) + +この形だと、フルメッシュでも「wg0 1つ + peerがN-1個」です。 + +### ただし WireGuard“単体”ではNAT越え・経路切替ができない + +WireGuard自体は「UDPで相手のEndpointへ投げる」だけで、 + +* STUNして外部アドレスを発見 +* 経路が変わったらEndpointを更新 +* UDPが無理なら443/TLSに逃がす + みたいなことは“外側のソフト”がやる必要があります。 + +なのであなたが言っている「カスタムのインターフェースを作って集約するのか?」はだいたい合っていて、実装の形は2択です: + +#### 方式A:カーネルWireGuard + 制御デーモン(netlinkでpeer更新) + +* パフォーマンス良い +* でも \*\*“TCP/TLSリレーでWireGuardパケットを運ぶ”\*\*の統合が難しくなる(WGは基本UDP) + +#### 方式B:userspace WireGuard(wireguard-go相当)を内蔵し、送受信ソケットを自前で差し替える(おすすめ) + +* 「UDP直」と「443/TLSリレー」を \*\*同一の“パケット送受信抽象”\*\*で切り替えやすい +* Tailscaleの `magicsock` はまさに「通信経路を動的に変えられるソケット」を提供する、という説明があります。 [Go Packages+1](https://pkg.go.dev/tailscale.com/wgengine/magicsock?utm_source=chatgpt.com) +* 443/TLSリレーは「WireGuardで暗号化済みのデータをリレーが盲目的に転送する」形にしやすいです。 [Tailscale+1](https://tailscale.com/kb/1232/derp-servers?utm_source=chatgpt.com) + +あなたの「HTTPSさえ通れば」要件が強いなら、**方式B寄り**が実装コストと到達性のバランスが良いと思います。 + +* * * + +追加:あなたの「v6グローバルだけでサーバを置きたい(NAT46的)」について +--------------------------------------- + +ここは誤解が起きやすいので先に釘を刺します。 + +* **IPv4しかないクライアント**は、**IPv6-onlyのサーバへは基本的に到達できません**(間に“IPv4を受けられる翻訳/プロキシ装置”が必要)。 +* 
NAT46は「IPv4クライアント→IPv6サーバ」を実現する翻訳の一種ですが、翻訳ゲートウェイが“受け口としてIPv4側”を持つ必要があり、サーバが純IPv6だけだとどこか別の場所に翻訳器が要ります。 [Palo Alto Networks TechDocs+1](https://docs.paloaltonetworks.com/content/techdocs/en_US/prisma-sd-wan/administration/nat-protocol-translation/nat-supported-protocol-translation-types/nat46-connect-ipv4-clients-to-ipv6-servers?utm_source=chatgpt.com) + +「IPv6だけで置ける場所を増やしたい」という狙いは理解できますが、**“誰でもどこでも(IPv4-only含む)”を満たすなら、結局どこかにIPv4の到達点が必要**です。 +一方で日本だとIPv6普及は高いので、実用上は「v6優先 + v4はリレーで吸う」でも十分成立するケースは多いです(ただし“絶対”は言えない)。 + +* * * + +まとめ(あなたの4問への短い指針) +----------------- + +1. **鍵管理**: + * **秘密鍵は端末保持(分散)** + * \*\*公開鍵配布・ACL・設定・名前は中央(集中)\*\*が最も簡単で強い。 [Tailscale+2Tailscale+2](https://tailscale.com/kb/1093/can-tailscale-decrypt-my-traffic?utm_source=chatgpt.com) +2. **参加手法**: + * **短命/制約付きの招待トークン** + **端末生成鍵** +(任意で)**管理者承認**が運用しやすい。 [GitHub](https://github.com/juanfont/headscale?utm_source=chatgpt.com) +3. **NAT越え**: + * 枠組みは **ICE(STUN+TURN)** が教科書。 [IETF Datatracker+2IETF Datatracker+2](https://datatracker.ietf.org/doc/html/rfc8445?utm_source=chatgpt.com) + * ただし「HTTPSさえ通れば」重視なら \*\*最終フォールバックは 443/TLSリレー(DERP的)\*\*が現実解。 [Tailscale+2Tailscale+2](https://tailscale.com/blog/nat-traversal-improvements-pt3-looking-ahead?utm_source=chatgpt.com) +4. 
**P2Pプロトコル**: + * **WireGuardでOK**。インターフェースは基本1つでpeerを増やす。 [WireGuard+1](https://www.wireguard.com/quickstart/?utm_source=chatgpt.com) + * “経路切替・NAT越え・リレー”のために **WireGuardの外側に常駐オーケストレータが必須**。 [Go Packages+1](https://pkg.go.dev/tailscale.com/wgengine/magicsock?utm_source=chatgpt.com) + +* * * + +必要なら次に、あなたの要件に合わせて \*\*最小MVP(サーバ1台・ポート最小・クライアント実装の部品分割)\*\*を、 + +* プロトコル境界(Control API、NetMap差分、Relay framing) +* データ構造(ノードID、鍵、ACL、DNS名、経路広告) +* 推奨ポート設計(443/tcp、3478/udp 等) + まで具体化して提案できます。 + +## Prompt: +あと、L2的にするか、L3的にするかという問題(tailscaleのSubnet Routingみたいなものをやるかどうか?)もあります。 + +## Response: +Thought for 13m 4s + +L2/L3は「見た目がそれっぽい」以上に、**アーキテクチャ全体(到達性・スケール・運用・セキュリティ)を決める分岐**なので、ここは最初に方針を固めた方がいいです。結論としては、あなたの要件(セルフホスト容易 / P2P優先 / 最悪はHTTPSで通す)だと、 + +* **基本は L3(TUN)で作る** +* **“LANをまるごと繋ぐ”は L3の「Subnet Routing(経路広告)」で実現** +* **L2(TAP/ブリッジ)は「どうしても必要なユースケースだけ」のオプション**(MVPでは避ける) + +が一番きれいです。 + +* * * + +1) まず整理:Subnet Routingは「L2にするか?」とは別問題(L3の拡張) +-------------------------------------------- + +TailscaleのSubnet routersは、やっていることは **L3の経路広告**で、L2ブリッジではありません。 + +* Subnet router側で **IP forwardingを有効化**して、`--advertise-routes=192.0.2.0/24,...` みたいに **到達させたいプレフィックスを広告**する(=Control planeに「このサブネットはこのノード経由で届く」と登録)。 [Tailscale](https://tailscale.com/kb/1019/subnets) +* 管理コンソール(またはautoApprovers)で **その経路を承認**して有効化する。 [Tailscale](https://tailscale.com/kb/1019/subnets) +* 複数広告がある場合は **Longest Prefix Matchで選ぶ**(/16より/24が優先、みたいな)。 [Tailscale+1](https://tailscale.com/kb/1019/subnets) +* デフォルトルート `0.0.0.0/0` / `::/0` を流すなら “exit node” にせよ、という整理。 [Tailscale+1](https://tailscale.com/kb/1019/subnets) + +つまり \*\*Subnet Routingを実装するかどうかは「L3でルーティング機能を足すか」\*\*の話で、**L2ブリッジ(同一ブロードキャストドメイン化)とは別**です。 + +* * * + +2) L3(TUN)で作る場合の性質:WireGuardと相性が良い +---------------------------------- + +あなたが前に挙げたWireGuardを採るなら、L3ベースは自然です。 + +* WireGuard自体が **Layer 3のトンネル**として設計されている(公式ペーパーのAbstractで明言)。 [WireGuard](https://www.wireguard.com/papers/wireguard.pdf) +* Tailscaleも「主としてネットワーク層のソリューションで、WireGuardを使う」と整理しています。 
[Tailscale](https://tailscale.com/kb/1456/osi) + +### L3(TUN)Mesh + Subnet Routingのメリット + +* **スケールが楽**:各ノード/32(/128)で、ルーティングは「宛先IP→次ホップpeer」になり、ブロードキャストの洪水が起きにくい +* **フルメッシュでも管理可能**:WireGuard peerは増えるが、基本は `wg0` 1つにpeerをぶら下げる設計で済む(あなたの前提とも整合) +* **セキュリティ境界が作りやすい**:ACLはL3/4(IP/Port)で表現しやすい +* **“HTTPSさえ通れば”の最終フォールバック**(443/TLSリレー)も、L3パケット単位で運びやすい + ※Tailscaleは「UDPが無理ならTCP(HTTPS)で暗号化トラフィックをリレーする」と説明しています。 [Tailscale](https://tailscale.com/compare/zerotier) + +### L3のデメリット(ここが「L2が欲しくなる」ポイント) + +* **L2ブロードキャスト/マルチキャストに依存する発見系が素直に動かない** + 例:mDNS/Bonjour、SSDP、NetBIOS Name Service、ゲームのLAN探索、など。 + これは(Tailscaleでも)課題として継続的に話題になります。 [GitHub+1](https://github.com/tailscale/tailscale/issues/11134?utm_source=chatgpt.com) + +* * * + +3) L2(TAP/仮想Ethernet)にすると何が起きるか +------------------------------- + +L2にする=**「全ノードが同一のEthernetセグメントに繋がっている」ように見せる**です。 + +### L2のメリット + +* DHCP・ARP・mDNS・SSDPなど、**L2に依存するプロトコルが“そのまま”動く**可能性が高い +* “VPN越しに同一LANっぽく”という体験は作りやすい + +### ただし、あなたの狙い(簡単にセルフホスト、P2P優先、HTTPSフォールバック)と衝突しやすい + +L2はインターネット越しにやると、現実のつらさが出ます。 + +* **ブロードキャスト/ARPの扱い** + フルメッシュで“同一セグメント”をやると、結局「ブロードキャストを全peerへ複製」する必要がある + → ノードが増えるほど雑に増える(O(N²)寄りになりがち) + → P2P優先のメリットを削りやすい +* **ループ/ストーム/MTU問題** + L2はSTP相当の設計や、フレームの洪水対策をちゃんとやらないと事故りやすい +* **アドレス空間の衝突が地雷** + “同一L2”を複数拠点の既存LANに伸ばすと、同じRFC1918が被って終わる、が典型 +* **セキュリティ境界が曖昧になりやすい**(L2は攻撃面が増えやすい) + +参考として、ZeroTierは「Virtual Layer 2をVXLAN的に作る」と説明していますが、これはそれ相応の設計領域を背負う、ということでもあります。 [Tailscale](https://tailscale.com/compare/zerotier) + +* * * + +4) あなたのプロダクトとしての推奨方針 +-------------------- + +### 推奨:L3をコアにして、Subnet Routingを“必須級のオプション”にする + +MVPとして一番強いのはこれです。 + +* 通常ノード:/32(/128)だけを持つ +* **Subnet routerノード**: + * 「このノードが `192.168.1.0/24` を背負う」みたいな経路広告 + * 管理者承認(勝手にLANが公開されないように) [Tailscale](https://tailscale.com/kb/1019/subnets) + * ルート衝突はLPMで選択 [Tailscale](https://tailscale.com/kb/1019/subnets) +* **Exit nodeノード**: + * `0.0.0.0/0`, `::/0` を背負う(全トラフィック経由) [Tailscale](https://tailscale.com/kb/1103/exit-nodes) + * 利用側/提供側/管理者の“オプトイン”を要求する(勝手に出口化しない) 
[Tailscale](https://tailscale.com/kb/1103/exit-nodes) + +### Subnet Routing実装でハマる要点(ここを仕様に落とすと良い) + +Tailscaleのドキュメントが現実に寄り添っていて、あなたの実装にもそのまま使えます: + +* ルータ側は **IP forwarding必須**、firewallは転送拒否がデフォルトの前提で説明されている [Tailscale](https://tailscale.com/kb/1019/subnets) +* **SNATの扱い**が超重要 + * デフォルト:SNAT(masquerading)=LAN側から見ると送信元がルータになる → **設定が簡単** [Tailscale](https://tailscale.com/kb/1019/subnets) + * SNAT無効:送信元がVPN内IPのまま → **戻り経路(return route)が必要**(例としてTailscaleは `100.64.0.0/10` をサブネットルータへ向ける静的経路が要る、と明記) [Tailscale](https://tailscale.com/kb/1019/subnets) + → あなたの実装でも同じで、\*\*「簡単モードはSNATあり」「正確モードはSNATなし+return route必須」\*\*が王道になります。 + +* * * + +5) 「L2っぽさ」が欲しい時の現実的な落としどころ +-------------------------- + +L2に振り切る前に、L3のまま“困りがちポイントだけ”解決する手があります。 + +### A. 発見系(mDNS/SSDPなど)だけを中継する + +* いわゆる **mDNSリフレクタ/ゲートウェイ**(224.0.0.251)や **SSDPプロキシ**(239.255.255.250)を、サブネットルータ上で動かす +* VPN全体をL2にせず、必要なプロトコルだけを中継 +* セキュリティポリシー(誰がどの発見情報を見ていいか)が書きやすい + +### B. “名前で解決”に寄せる(DNSの作り込み) + +あなたが元々言っていた「名前つけてDNS広報」は、L3モデルの弱点をかなり潰します。 +Subnet Routingの文脈でも「内部DNSへのルーティング」やsplit DNSの話が出ます。 [Tailscale+1](https://tailscale.com/kb/1019/subnets) + +* * * + +6) それでもL2が必要なら:どう設計するのがマシか +-------------------------- + +もし「L2がないと成立しない」ユースケースが明確にあるなら、僕ならこうします: + +* **コアはL3のまま(WireGuard/TUN)** +* L2は **“L2セグメント”という別機能**として提供(デフォルトOFF) + * 例:特定グループだけが参加する“仮想スイッチVNI”を作り、そのVNI参加ノード間だけEthernetフレームを運ぶ + * 実装は VXLAN/Geneve 的な発想(ただしメンバ管理・フラッディング制御・MTU・暗黙のブロードキャスト複製など、考えることは増える) +* さらに、HTTPSフォールバック上でL2を流すなら、**TCP-over-TCPの性能劣化**やフレーム断片化に注意が必要(ここは設計で割り切る) + +これを最初からやると重いので、MVPでは避けるのが無難です。 + +* * * + +まとめ:あなたのプロジェクトに対する実装優先度 +----------------------- + +1. **L3 Mesh(WireGuard)をコアにする**(最も自然・簡単・堅い) [WireGuard+1](https://www.wireguard.com/papers/wireguard.pdf) +2. **Subnet Routing(経路広告+承認+SNAT制御)を実装**(“VPNを実用にする”機能) [Tailscale+1](https://tailscale.com/kb/1019/subnets) +3. **Exit node(デフォルトルート)を実装**(必要なら) [Tailscale](https://tailscale.com/kb/1103/exit-nodes) +4. 
それでも足りないときだけ + * 発見系の中継(mDNS/SSDP) + * さらに必要なら L2セグメント(オプション) + +* * * + +必要なら次は、あなたのControl plane設計に合わせて、 + +* 「経路広告(subnet/exit)をnetmapにどう持つか」 +* 「クライアント側でAllowedIPsとOSルートをどう同期するか」 +* 「SNATあり/なしの両モードのユーザ体験(設定手順)をどう分岐させるか」 + +を、擬似コード/データ構造まで落として提案します。 + +## Prompt: +L3にしようと思います。もう一つの問題は、クライアントソフトウェアの設計です。個人的には、一つのクライアントのマシンがいくつかのネットワークに接続できたほうが望ましいと思っている(tailscaleはこれができない!)のですが、どうでしょうか。ただ、問題は適当にやるとIP帯がかぶるところですね。 + +## Response: +Thought for 14m 19s + +L3で行くなら、「1台の端末が複数ネットワークに同時参加できる」は**かなり価値のある機能**です(仕事用・個人用・検証用・顧客環境など)。実際、Tailscaleも“複数アカウントを素早く切り替える”機能はありますが、**同時に複数tailnetへパケット送信はできない**と明記しています。 [Tailscale](https://tailscale.com/kb/1225/fast-user-switching) +(なお組織が複数tailnetを持てる機能も出ていますが、これは主に管理側の話で、クライアントが同時接続できる話とは別です。 [Tailscale+1](https://tailscale.com/kb/1509/multiple-tailnets)) + +あなたが作るなら、**同時参加を最初から設計に入れる**のはアリです。ただし、言う通り「IP帯がかぶる」問題が出るので、そこを**仕様で潰す**のがポイントになります。 + +以下、設計のおすすめを「現実に運用できる線」でまとめます。 + +* * * + +1) まず分けて考えるべき「かぶるIP」の種類 +----------------------- + +### A. オーバレイ内(VPN端末同士)のIP + +これは**あなたのVPNが配るアドレス**なので、設計でほぼ完全に回避できます。 + +### B. 
Subnet Routingで流す“向こう側LAN”のIP + +こっちは現実世界のLANが `192.168.0.0/16` とかで乱立するので、**高確率で衝突**します。 +同時に2つ以上のネットワークへ参加するなら、衝突が起きたときの挙動が実質“仕様の核”になります。 + +* * * + +2) 「複数ネットワーク同時参加」を成立させるクライアント設計の基本方針 +------------------------------------ + +### 結論(おすすめ) + +* **ネットワークごとに“独立したルーティングドメイン”を持つ**発想で作る +* ただしOSの通常ルーティングテーブルは“宛先IPだけ”で決めるので、 + **衝突する経路を“そのまま”同居はできない** + → だから、衝突を避ける(オーバレイ)+衝突したら方針を決める(サブネット) が必要 + +* * * + +3) 実装アーキテクチャ:一番ラクなのは「マルチインスタンス」 +------------------------------- + +あなたのソフトを「同時に複数ネットワーク参加可能」にする最短ルートはこれです。 + +### 方式A:**1ネットワーク = 1デーモン(推奨)** + +* `clientd --profile netA` が `tun-netA` を作る +* `clientd --profile netB` が `tun-netB` を作る +* UI/CLIだけ “1つのクライアント” に見せれば良い + +**メリット** + +* 実装が単純(状態・鍵・netmap・ルート注入が分離) +* バグっても片方に閉じ込めやすい +* “このネットワークだけ落とす/再接続” が容易 + +**デメリット** + +* DNS統合(後述)が少し面倒 +* OSごとにTUN複数の扱いが違う(Windows/macOS周りは要検証) + +### 方式B:1デーモンで複数ネットワーク管理 + +* 中長期ではUXが良いけど、最初は事故りやすい(状態機械が重なる) + +**MVPは方式A**が堅いです。Tailscaleでも“複数同時接続したい”需要は昔からIssueになっていて、回避として複数デーモンを立てる話が出ます。 [GitHub](https://github.com/tailscale/tailscale/issues/183) + +* * * + +4) オーバレイIPの衝突を「設計で潰す」方法 +----------------------- + +ここはあなたが自由に決められるので、**衝突が起きない割当**にしてしまうのが最強です。 + +### 推奨:ネットワークIDからプレフィックスを決定する + +ネットワークには必ず一意な `NetworkID`(例:コントロールサーバの公開鍵、またはネットワーク作成時に生成するUUID相当)を持たせて、そこから + +* IPv6:ULA /48(RFC 4193の範囲 `fd00::/8`) [IETF Datatracker](https://datatracker.ietf.org/doc/html/rfc4193?utm_source=chatgpt.com) +* IPv4:Shared Address Space `100.64.0.0/10`(RFC 6598)から /24 など [IETF Datatracker](https://datatracker.ietf.org/doc/html/rfc6598?utm_source=chatgpt.com) + +を**決定的(deterministic)に生成**します。 + +#### 例(イメージ) + +* `overlay_v6_prefix = fd00::/8 + H(NetworkID)の先頭40bit → /48` +* `overlay_v4_prefix = 100.64.0.0/10 の中から H(NetworkID)で /24 を選ぶ` + +これにより、**別ネットワーク同士でもオーバレイがかぶりにくい**(実質“ほぼ起きない”)状態を作れます。 + +> 補足:IPv6 ULAは「ローカルで使うがグローバル一意性を期待できる」用途として定義されています。 [IETF Datatracker](https://datatracker.ietf.org/doc/html/rfc4193?utm_source=chatgpt.com) +> IPv4の `100.64/10` はCGN向けの共有アドレス空間としてIETFが予約したブロックです。 [IETF 
Datatracker](https://datatracker.ietf.org/doc/html/rfc6598?utm_source=chatgpt.com) + +### ルート注入も衝突しにくくする + +* 各ネットワークのオーバレイは **そのネットワーク固有のプレフィックスだけ**を `tun-netX` に向ける +* 端末同士は /32(/128)でもいいし、プレフィックスまとめても良い(規模次第) + +WireGuard自体もL3トンネルとして整理されているので、この方向は素直です。 [WireGuard+1](https://www.wireguard.com/papers/wireguard.pdf?utm_source=chatgpt.com) + +* * * + +5) 本題:Subnet Routingがあると“衝突が必ず起きる”問題 +------------------------------------ + +ここは「何を保証するか」を決めないといけません。おすすめは段階的です。 + +### レベル1(推奨デフォルト):**衝突したサブネット経路は“入れない”** + +クライアントが受け取った advertised routes を、OSのルートテーブルへ入れる前に + +* 既存ルート(ローカルLAN含む) +* 他のVPNネットワークから入っているルート + +と照合して、**プレフィックスが重なるものは拒否**します。 + +ユーザ体験としては: + +* “netB の `192.168.1.0/24` は netA(またはローカル)と衝突するので無効化しました” +* どのルートが衝突してるか、解決策(後述)を提示 + +これで \*\*「同時参加はできるが、衝突サブネットは使えない」\*\*という一貫した挙動になります。 +MVPとしては非常に現実的です。 + +### レベル2:手動優先度(メトリクス)でどちらか一方を採用 + +* 衝突した2つの `192.168.1.0/24` のうち、**片方だけ**を採用 +* UIで “この経路はnetA優先/netB優先” を選ばせる + +ただし、同時に両方は使えません(OSルーティングは宛先IPで決まるため)。 + +### レベル3(上級者向け):**サブネットのアドレス変換(route translation / netmap)** + +「同時に両方の `192.168.1.0/24` を使いたい」を満たす現実解はこれです。 + +#### 発想 + +* netBの `192.168.1.0/24` を、そのネットワーク内では **仮想プレフィックス `172.31.200.0/24` にマッピング**して提供する +* クライアントは `172.31.200.0/24` 宛に送る +* Subnet Router(netB側)が **1:1のプレフィックス変換**で `172.31.200.x ⇔ 192.168.1.x` を行う + +こうすると、 + +* ローカルLANの `192.168.1.0/24` +* netAの `192.168.1.0/24` +* netBの(変換後)`172.31.200.0/24` + +が**同時に共存**できます。 + +**ただし仕様の重さ** + +* 変換後のアドレスをユーザにどう見せるか(DNSで変換後を返す、など)まで決める必要がある +* ルータだけでなく、クライアント側も「この経路は変換モード」を理解する必要がある(実装は可能) + +MVPではレベル1(拒否)で十分、要望が出たらレベル3を足すのが良いです。 + +### レベル4(Linux限定の最終手段):VRF / network namespace + +* OS側でルーティングテーブルを分離して“同じ宛先プレフィックス”を共存させる +* ただし「そのネットワークを使うアプリ」を名前空間に入れる必要が出て、透明性が落ちます +* クロスプラットフォームでは厳しい + +* * * + +6) “同時に複数ネットワーク参加”を入れるなら、追加で決めるべき仕様 +----------------------------------- + +### (a) Exit node(デフォルトルート)は同時に1つ、が基本 + +複数ネットワークで同時に `0.0.0.0/0` を入れるのは混乱のもとです。 + +* 「どのネットワークのexitを使うか」を明示選択 +* もう一方はexit機能をOFFにする(またはアプリ単位の選択にする) + +### (b) DNS名の衝突 + 
+netAにもnetBにも `db` みたいな名前があると衝突します。 +解決策は素直に + +* ネットワークごとにDNSサフィックスを持つ(例:`db.netA.vpn`, `db.netB.vpn`) +* OSへはsplit DNS(or ローカルスタブDNSで集約) + が堅いです。 + +### (c) ネットワーク間の“意図しない中継”を防ぐ + +同じホストが2ネットワークに入ると、事故でルーティング(転送)してしまう可能性があります。 + +* デフォルトで **IP forwardを有効化しない** +* 「サブネットルータ/exit node」は明示的に有効化+承認制 + +* * * + +まとめ:あなたの状況へのおすすめ +---------------- + +あなたが「複数ネットワーク同時接続」を売りにしたいなら、こう決めると破綻しにくいです。 + +1. **クライアントはマルチインスタンス(1ネットワーク=1デーモン)でMVP** +2. **オーバレイIPは “NetworkIDから決定的に生成”** + * IPv6はULA /48(RFC 4193) [IETF Datatracker](https://datatracker.ietf.org/doc/html/rfc4193?utm_source=chatgpt.com) + * IPv4は100.64/10内(RFC 6598) [IETF Datatracker](https://datatracker.ietf.org/doc/html/rfc6598?utm_source=chatgpt.com) +3. **Subnet Routingは衝突検出して、衝突した経路はデフォルト拒否(レベル1)** +4. どうしても必要になったら \*\*“サブネット変換(レベル3)”\*\*を追加 + +* * * + +もしよければ、次は「衝突検出ロジック」と「ネットワークIDからのプレフィックス生成」の具体案(ビット割り当て、衝突時のリトライ規則、サーバ側のネットワーク作成API仕様、クライアントのルート適用順序)を、実装に落とせる粒度で提案できます。