# photoncloud-monorepo/nix/tests/fiberlb-native-bgp-interop-vm-smoke.nix
# centra ce979d8f26
# Some checks failed
# Nix CI / filter (push) Successful in 6s
# Nix CI / gate () (push) Failing after 1s
# Nix CI / gate (shared crates) (push) Has been skipped
# Nix CI / build () (push) Has been skipped
# Nix CI / ci-status (push) Failing after 1s
# fiberlb: add BGP interop, drain, and policy validation
# 2026-03-30 20:06:08 +09:00
#
# 737 lines
# 24 KiB
# Nix

{
pkgs,
photoncloudPackages,
photoncloudModule,
nixNosModule,
}:
let
# zebra configuration for the FRR peer: hostname plus debug logging to stdout.
# The body is indented uniformly; Nix strips that common indentation, so the
# generated file contains the two directives starting at column 0.
frrZebraConfig = pkgs.writeText "fiberlb-interop-frr-zebra.conf" ''
  hostname interop-zebra
  log stdout debugging
'';
# bgpd configuration for the FRR peer: local AS 65020 with router-id
# 192.168.100.1, one neighbor 192.168.100.2 in AS 65010 activated for IPv4
# unicast. The uniform indentation below is stripped by Nix.
frrBgpdConfig = pkgs.writeText "fiberlb-interop-frr-bgpd.conf" ''
  hostname interop-frr
  log stdout debugging
  router bgp 65020
  bgp router-id 192.168.100.1
  no bgp ebgp-requires-policy
  neighbor 192.168.100.2 remote-as 65010
  neighbor 192.168.100.2 description fiberlb-frr
  !
  address-family ipv4 unicast
  neighbor 192.168.100.2 activate
  exit-address-family
  !
'';
# BIRD configuration for the BIRD peer: router id 192.168.100.3, a kernel
# protocol that neither imports nor exports IPv4 routes, and one BGP session
# (local AS 65030, neighbor 192.168.100.2 in AS 65010) that imports everything
# and exports nothing. The uniform indentation below is stripped by Nix.
birdConfig = pkgs.writeText "fiberlb-interop-bird.conf" ''
  router id 192.168.100.3;
  protocol device {}
  protocol kernel {
  ipv4 {
  import none;
  export none;
  };
  }
  protocol bgp fiberlb_peer {
  local 192.168.100.3 as 65030;
  neighbor 192.168.100.2 as 65010;
  ipv4 {
  import all;
  export none;
  };
  }
'';
# GoBGP daemon configuration, rendered as JSON: local AS 65040 with router-id
# 192.168.100.4 and a single neighbor 192.168.100.2 in AS 65010.
gobgpdConfig =
  let
    # The only configured neighbor (the address used by the lb node below).
    fiberlbNeighbor.config = {
      neighbor-address = "192.168.100.2";
      peer-as = 65010;
      description = "fiberlb-gobgp";
    };
  in
  pkgs.writeText "fiberlb-interop-gobgpd.json" (
    builtins.toJSON {
      global.config = {
        as = 65040;
        router-id = "192.168.100.4";
      };
      neighbors = [ fiberlbNeighbor ];
    }
  );
# Protobuf sources handed to grpcurl by the test script via -import-path/-proto.
# The directories are paths relative to this file; the file names are resolved
# inside the corresponding directory.
iamProtoDir = ../../iam/proto;
iamProto = "iam.proto";
fiberlbProtoDir = ../../fiberlb/crates/fiberlb-api/proto;
fiberlbProto = "fiberlb.proto";
# Mock HTTP backend run on the lb node: answers every GET with a fixed
# plain-text body and status 200 on 127.0.0.1:18081, with request logging
# silenced. The Python below must keep its relative indentation (class and
# method bodies); only the common leading indentation is stripped by Nix.
backendScript = pkgs.writeText "fiberlb-interop-backend.py" ''
  from http.server import BaseHTTPRequestHandler, HTTPServer


  class Handler(BaseHTTPRequestHandler):
      def do_GET(self):
          body = b"fiberlb interop backend\n"
          self.send_response(200)
          self.send_header("Content-Type", "text/plain; charset=utf-8")
          self.send_header("Content-Length", str(len(body)))
          self.end_headers()
          self.wfile.write(body)

      def log_message(self, format, *args):
          # Suppress the default per-request stderr logging.
          return


  HTTPServer(("127.0.0.1", 18081), Handler).serve_forever()
'';
in
{
name = "fiberlb-native-bgp-interop-vm-smoke";
nodes = {
# FRR peer node (192.168.100.1 on VLAN 1). zebra and bgpd are started as two
# hand-written units that share /run/frr for pid files and the zserv socket.
frr =
  { ... }:
  {
    networking.hostName = "frr";
    networking.useDHCP = false;
    networking.firewall.enable = false;
    virtualisation.vlans = [ 1 ];
    networking.interfaces.eth1.ipv4.addresses = [
      {
        address = "192.168.100.1";
        prefixLength = 24;
      }
    ];
    environment.systemPackages = with pkgs; [
      curl
      frr
      jq
      iproute2
    ];
    users.groups.frr = { };
    users.groups.frrvty = { };
    users.users.frr = {
      isSystemUser = true;
      group = "frr";
      extraGroups = [ "frrvty" ];
    };
    users.users.root.extraGroups = [ "frrvty" ];
    systemd.services.frr-zebra = {
      description = "FRR zebra for FiberLB interop smoke";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];
      serviceConfig = {
        Type = "simple";
        RuntimeDirectory = "frr";
        RuntimeDirectoryMode = "0755";
        # /run/frr is shared with frr-bgpd. Without this, systemd removes the
        # directory whenever this unit stops or is auto-restarted, taking the
        # other daemon's pid file and sockets with it.
        RuntimeDirectoryPreserve = "yes";
        # Ensure the runtime directories exist and drop stale pid files.
        ExecStartPre = "${pkgs.runtimeShell} -lc '${pkgs.coreutils}/bin/install -d -o root -g root /run/frr /var/run/frr && ${pkgs.coreutils}/bin/rm -f /run/frr/zebra.pid /var/run/frr/zebra.pid'";
        ExecStart = "${pkgs.frr}/libexec/frr/zebra -f ${frrZebraConfig} -A 127.0.0.1 -P 2601 -i /run/frr/zebra.pid -z /run/frr/zserv.api -u root -g root --log stdout";
        Restart = "on-failure";
        RestartSec = "2s";
      };
    };
    systemd.services.frr-bgpd = {
      description = "FRR bgpd for FiberLB interop smoke";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" "frr-zebra.service" ];
      requires = [ "frr-zebra.service" ];
      serviceConfig = {
        Type = "simple";
        RuntimeDirectory = "frr";
        RuntimeDirectoryMode = "0755";
        # bgpd does not own the zserv socket. With the default policy a bgpd
        # stop/auto-restart would delete /run/frr (and zebra's zserv.api), after
        # which the ExecStartPre wait below could never succeed again.
        RuntimeDirectoryPreserve = "yes";
        # Drop stale pid files, then wait up to 30 seconds for zebra's zserv
        # socket to appear before starting bgpd.
        ExecStartPre = "${pkgs.runtimeShell} -lc '${pkgs.coreutils}/bin/install -d -o root -g root /run/frr /var/run/frr && ${pkgs.coreutils}/bin/rm -f /run/frr/bgpd.pid /var/run/frr/bgpd.pid && for _ in $(seq 1 30); do [ -S /run/frr/zserv.api ] && exit 0; sleep 1; done; echo zserv socket did not appear >&2; exit 1'";
        ExecStart = "${pkgs.frr}/libexec/frr/bgpd -f ${frrBgpdConfig} -A 127.0.0.1 -P 2605 -p 179 -i /run/frr/bgpd.pid -z /run/frr/zserv.api -S --log stdout";
        Restart = "on-failure";
        RestartSec = "2s";
      };
    };
    system.stateVersion = "24.11";
  };
# BIRD peer node (192.168.100.3 on VLAN 1) running bird in the foreground with
# its control socket at /run/bird.ctl, which the test script queries via birdc.
bird =
  { ... }:
  {
    system.stateVersion = "24.11";
    virtualisation.vlans = [ 1 ];

    networking = {
      hostName = "bird";
      useDHCP = false;
      firewall.enable = false;
      interfaces.eth1.ipv4.addresses = [
        {
          address = "192.168.100.3";
          prefixLength = 24;
        }
      ];
    };

    environment.systemPackages = [
      pkgs.bird2
      pkgs.jq
    ];

    systemd.services.bird-peer = {
      description = "BIRD peer for FiberLB interop smoke";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];
      serviceConfig = {
        Type = "simple";
        ExecStart = "${pkgs.bird2}/bin/bird -f -c ${birdConfig} -s /run/bird.ctl";
        Restart = "on-failure";
        RestartSec = "2s";
      };
    };
  };
# GoBGP peer node (192.168.100.4 on VLAN 1). gobgpd reads the JSON config and
# exposes its API on 127.0.0.1:50051, which the test script queries with the
# gobgp CLI.
gobgp =
  { ... }:
  {
    system.stateVersion = "24.11";
    virtualisation.vlans = [ 1 ];

    networking = {
      hostName = "gobgp";
      useDHCP = false;
      firewall.enable = false;
      interfaces.eth1.ipv4.addresses = [
        {
          address = "192.168.100.4";
          prefixLength = 24;
        }
      ];
    };

    environment.systemPackages = [
      pkgs.gobgp
      pkgs.gobgpd
      pkgs.jq
    ];

    systemd.services.gobgpd-peer = {
      description = "GoBGP peer for FiberLB interop smoke";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];
      serviceConfig = {
        Type = "simple";
        ExecStart = "${pkgs.gobgpd}/bin/gobgpd -t json -f ${gobgpdConfig} --api-hosts 127.0.0.1:50051 -p";
        Restart = "on-failure";
        RestartSec = "2s";
      };
    };
  };
# Load-balancer node (192.168.100.2 on VLAN 1): runs IAM, FiberLB with BGP
# enabled towards the three peer nodes, and the mock HTTP backend.
lb =
  { ... }:
  let
    # Build one BGP peer entry; every peer is reached on TCP port 179 and gets
    # exactly one community.
    mkPeer =
      {
        address,
        asn,
        description,
        med,
        community,
      }:
      {
        inherit address asn description med;
        port = 179;
        communities = [ community ];
      };
  in
  {
    imports = [
      nixNosModule
      photoncloudModule
    ];

    system.stateVersion = "24.11";
    virtualisation.vlans = [ 1 ];

    networking = {
      hostName = "lb";
      useDHCP = false;
      firewall.enable = false;
      interfaces.eth1.ipv4.addresses = [
        {
          address = "192.168.100.2";
          prefixLength = 24;
        }
      ];
    };

    environment.systemPackages = [
      pkgs.curl
      pkgs.grpcurl
      pkgs.jq
      pkgs.python3
    ];

    services.iam = {
      enable = true;
      package = photoncloudPackages.iam-server;
      port = 50080;
      httpPort = 8083;
      storeBackend = "memory";
    };
    # NOTE(review): presumably lets IAM start without a provisioned signing
    # key; confirm against the IAM server.
    systemd.services.iam.environment = {
      IAM_ALLOW_RANDOM_SIGNING_KEY = "1";
    };

    services.fiberlb = {
      enable = true;
      package = photoncloudPackages.fiberlb-server;
      port = 50085;
      iamAddr = "192.168.100.2:50080";
      metadataBackend = "sqlite";
      databaseUrl = "sqlite:/var/lib/fiberlb/metadata.db";
      singleNode = true;
      healthCheckIntervalSecs = 1;
      healthCheckTimeoutSecs = 1;
      vipCheckIntervalSecs = 1;
      # The test script later expects the VIP to show up as a /32 on lo.
      vipOwnership = {
        enable = true;
        interface = "lo";
      };
      bgp = {
        enable = true;
        localAs = 65010;
        routerId = "192.168.100.2";
        nextHop = "192.168.100.2";
        holdTimeSecs = 9;
        keepaliveSecs = 3;
        # Each peer gets a distinct MED and community so the test script can
        # check the per-peer values on the FRR and BIRD sides.
        peers = [
          (mkPeer {
            address = "192.168.100.1";
            asn = 65020;
            description = "frr-peer";
            med = 10;
            community = "65010:101";
          })
          (mkPeer {
            address = "192.168.100.3";
            asn = 65030;
            description = "bird-peer";
            med = 20;
            community = "65010:202";
          })
          (mkPeer {
            address = "192.168.100.4";
            asn = 65040;
            description = "gobgp-peer";
            med = 30;
            community = "65010:303";
          })
        ];
      };
    };

    systemd.services.mock-backend = {
      description = "FiberLB interop backend";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];
      serviceConfig = {
        Type = "simple";
        ExecStart = "${pkgs.python3}/bin/python ${backendScript}";
        Restart = "always";
        RestartSec = "1s";
      };
    };
  };
};
testScript = ''
import json
import re
import shlex
import time
# Proto locations passed to grpcurl; the values are substituted by Nix when the
# test script is generated.
IAM_PROTO_DIR = "${iamProtoDir}"
IAM_PROTO = "${iamProto}"
FIBERLB_PROTO_DIR = "${fiberlbProtoDir}"
FIBERLB_PROTO = "${fiberlbProto}"
# Matches one metrics sample line: group 1 is the metric name, group 2 the
# optional label text between braces, group 3 the numeric value.
METRIC_RE = re.compile(r"^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{([^}]*)\})?\s+([-+0-9.eE]+)$")
def grpcurl_json(machine, endpoint, import_path, proto, service, payload, headers=None):
    """Invoke a plaintext gRPC method with grpcurl on `machine` and return the decoded JSON reply.

    Args:
        machine: object with an `execute(command)` method returning `(status, output)`.
        endpoint: `host:port` of the gRPC server.
        import_path: directory passed to grpcurl as `-import-path`.
        proto: proto file name passed as `-proto`.
        service: fully qualified `package.Service/Method`.
        payload: JSON-serialisable request body.
        headers: optional list of `name: value` strings sent with `-H`.

    Raises:
        AssertionError: if grpcurl exits non-zero (or the 15 second timeout
            fires), or if it exits zero but its output is not valid JSON.
    """
    argv = ["grpcurl", "-plaintext"]
    for header in headers or []:
        argv += ["-H", header]
    argv += [
        "-import-path", import_path,
        "-proto", proto,
        "-d", json.dumps(payload),
        endpoint,
        service,
    ]
    command = " ".join(shlex.quote(arg) for arg in argv)
    # stderr is folded into stdout so that failures carry the grpcurl message.
    status, output = machine.execute(f"timeout 15 sh -lc {shlex.quote(command + ' 2>&1')}")
    if status != 0:
        raise AssertionError(
            "grpcurl failed"
            f" service={service}"
            f" status={status}"
            f" payload={json.dumps(payload, sort_keys=True)}"
            f" output={output}"
        )
    try:
        return json.loads(output)
    except ValueError as exc:
        # Surface the raw output instead of a bare JSON decoding error.
        raise AssertionError(
            "grpcurl returned non-JSON output"
            f" service={service}"
            f" output={output}"
        ) from exc
def issue_project_admin_token(machine, org_id, project_id):
    """Create a service-account principal, bind it to roles/ProjectAdmin and return a token for it.

    All three IAM calls go to 127.0.0.1:50080 on `machine` and share a single
    120 second deadline; a failing call is retried every 2 seconds.

    Raises:
        AssertionError: if the deadline passes before a call succeeds; the
            message carries the last error seen.
    """
    principal_id = f"fiberlb-interop-{int(time.time())}"
    deadline = time.time() + 120
    project_scope = {"project": {"id": project_id, "orgId": org_id}}

    def iam_call(method, payload):
        # Retry until the shared deadline expires.
        last_error = None
        while time.time() < deadline:
            try:
                return grpcurl_json(
                    machine,
                    "127.0.0.1:50080",
                    IAM_PROTO_DIR,
                    IAM_PROTO,
                    method,
                    payload,
                )
            except Exception as exc:
                last_error = exc
                time.sleep(2)
        raise AssertionError(f"IAM bootstrap timed out: {last_error}")

    iam_call(
        "iam.v1.IamAdmin/CreatePrincipal",
        {
            "id": principal_id,
            "kind": "PRINCIPAL_KIND_SERVICE_ACCOUNT",
            "name": principal_id,
            "orgId": org_id,
            "projectId": project_id,
        },
    )
    iam_call(
        "iam.v1.IamAdmin/CreateBinding",
        {
            "principal": {
                "kind": "PRINCIPAL_KIND_SERVICE_ACCOUNT",
                "id": principal_id,
            },
            "role": "roles/ProjectAdmin",
            "scope": project_scope,
        },
    )
    token_response = iam_call(
        "iam.v1.IamToken/IssueToken",
        {
            "principalId": principal_id,
            "principalKind": "PRINCIPAL_KIND_SERVICE_ACCOUNT",
            "scope": project_scope,
            "ttlSeconds": 3600,
        },
    )
    return token_response["token"]
def wait_for_backend_status(status, backend_id, token):
    """Poll BackendService/GetBackend on the lb node until `.backend.status` equals `status`.

    Args:
        status: expected status string, e.g. "BACKEND_STATUS_ONLINE".
        backend_id: id of the backend to query.
        token: bearer token sent in the authorization header.
    """
    # Build the jq filter outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12. json.dumps yields a
    # correctly quoted jq string literal.
    jq_filter = f".backend.status == {json.dumps(status)}"
    auth_header = "authorization: Bearer " + token
    request_body = json.dumps({"id": backend_id})
    lb.wait_until_succeeds(
        "grpcurl -plaintext "
        f"-H {shlex.quote(auth_header)} "
        f"-import-path {shlex.quote(FIBERLB_PROTO_DIR)} "
        f"-proto {shlex.quote(FIBERLB_PROTO)} "
        f"-d {shlex.quote(request_body)} "
        "127.0.0.1:50085 fiberlb.v1.BackendService/GetBackend "
        f"| jq -e {shlex.quote(jq_filter)}"
    )
def parse_labels(label_blob):
    """Parse the text between the braces of a metrics sample into a dict.

    Handles `key="value"` pairs separated by commas, including commas inside
    quoted values, whitespace around keys and separators, and a trailing comma.
    Unquoted values are accepted as well. Escape sequences inside values are
    kept verbatim. Fragments that are not `key=value` pairs are skipped.

    Returns:
        dict mapping label names to values; empty for None or an empty string.
    """
    if not label_blob:
        return {}
    # Group 1: label name. Group 2: a double-quoted value (backslash escapes
    # allowed) or, failing that, everything up to the next comma.
    pair_re = re.compile(r'\s*([^=,\s]+)\s*=\s*("(?:[^"\\]|\\.)*"|[^,]*)\s*(?:,|$)')
    labels = {}
    for match in pair_re.finditer(label_blob):
        key = match.group(1)
        raw = match.group(2).strip()
        if len(raw) >= 2 and raw[0] == raw[-1] == '"':
            labels[key] = raw[1:-1]
        else:
            labels[key] = raw.strip('"')
    return labels
def fiberlb_diagnostics():
    """Return the current FiberLB metrics scrape and the last 200 journal lines as one text blob.

    Both commands end in `|| true`, so collecting diagnostics does not itself
    fail when the metrics endpoint or the journal query errors out.
    """
    metrics = lb.succeed("curl -fsS http://127.0.0.1:9098/metrics || true")
    journal = lb.succeed("journalctl -u fiberlb.service -n 200 --no-pager || true")
    return (
        "fiberlb metrics:\n"
        f"{metrics}\n"
        "fiberlb journal:\n"
        f"{journal}"
    )
def wait_for_metric(metric_name, expected_value, labels=None):
    """Poll the FiberLB metrics endpoint for up to 60 seconds until a sample matches.

    A sample matches when its name equals `metric_name`, its parsed labels
    equal `labels` exactly (no labels when omitted) and its value is within
    0.0001 of `expected_value`.

    Raises:
        AssertionError: on timeout, including the last successful scrape and
            the output of `fiberlb_diagnostics()`.
    """
    expected_labels = labels or {}
    target = float(expected_value)
    deadline = time.time() + 60
    last_exposition = ""
    while time.time() < deadline:
        # A failed scrape is treated as "not there yet" rather than aborting
        # the wait, so the timeout path below can attach diagnostics.
        status, exposition = lb.execute("curl -fsS http://127.0.0.1:9098/metrics")
        if status == 0:
            last_exposition = exposition
            for line in exposition.splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                match = METRIC_RE.match(line)
                if not match:
                    continue
                name, label_blob, value = match.groups()
                if name != metric_name:
                    continue
                if parse_labels(label_blob) != expected_labels:
                    continue
                try:
                    sample = float(value)
                except ValueError:
                    # The value pattern is permissive; skip non-numeric text.
                    continue
                if abs(sample - target) < 0.0001:
                    return
        time.sleep(1)
    raise AssertionError(
        f"metric {metric_name} with labels={expected_labels} did not reach {expected_value}\n"
        f"last metrics scrape:\n{last_exposition}\n"
        f"{fiberlb_diagnostics()}"
    )
def wait_for_local_vip(vip):
    """Wait until `vip` is configured as a /32 IPv4 address on the lb node's loopback interface."""
    address_line = "inet " + vip + "/32"
    lb.wait_until_succeeds(f"ip -4 addr show dev lo | grep -F {shlex.quote(address_line)}")
def wait_for_gobgp_route(prefix, present):
    """Wait until `prefix` is present in, or absent from, the GoBGP global RIB.

    Args:
        prefix: route prefix to look for, e.g. "203.0.113.77/32".
        present: True to wait for the route to appear, False to wait (up to
            60 seconds) for it to disappear.

    Raises:
        AssertionError: if `present` is False and the route is still listed
            after 60 seconds.
    """
    rib_command = "gobgp -u 127.0.0.1 -p 50051 global rib"
    if present:
        # Pipe straight into grep. Appending "| grep" to "cmd || true" parses
        # as "cmd || (true | grep)", which succeeds whenever cmd exits 0 and
        # never inspects the output.
        gobgp.wait_until_succeeds(f"{rib_command} | grep -F {shlex.quote(prefix)}")
        return
    deadline = time.time() + 60
    while time.time() < deadline:
        # "|| true" keeps succeed() from raising when the CLI itself fails.
        output = gobgp.succeed(f"{rib_command} || true")
        if prefix not in output:
            return
        time.sleep(1)
    raise AssertionError(f"route {prefix} still present in GoBGP RIB")
def wait_for_bird_route(prefix):
    """Wait until birdc on the BIRD node lists `prefix` in its `show route for ... all` output."""
    quoted_prefix = shlex.quote(prefix)
    bird.wait_until_succeeds(
        f"birdc -s /run/bird.ctl show route for {quoted_prefix} all | grep -F {quoted_prefix}"
    )
def wait_for_frr_route(prefix):
    """Wait until vtysh on the FRR node lists `prefix` in its `show ip bgp <prefix>` output."""
    show_command = "show ip bgp " + prefix
    frr.wait_until_succeeds(
        f"vtysh -c {shlex.quote(show_command)} | grep -F {shlex.quote(prefix)}"
    )
def wait_for_unit_or_dump(machine, unit):
    """Wait up to 120 seconds for a systemd unit to become active, dumping diagnostics on failure.

    Args:
        machine: object with `execute(command) -> (status, output)` and
            `succeed(command) -> output`.
        unit: systemd unit name.

    Raises:
        AssertionError: as soon as the unit reports "failed", or when the
            timeout expires; the message includes `systemctl status` output and
            the last 200 journal lines of the unit.
    """
    quoted_unit = shlex.quote(unit)

    def fail(reason):
        # Collect diagnostics once for both failure paths.
        service_status = machine.succeed(
            f"systemctl status {quoted_unit} --no-pager || true"
        )
        journal = machine.succeed(
            f"journalctl -u {quoted_unit} -n 200 --no-pager || true"
        )
        raise AssertionError(
            f"unit {unit} {reason}\n"
            f"systemctl status:\n{service_status}\n"
            f"journal:\n{journal}"
        )

    deadline = time.time() + 120
    while time.time() < deadline:
        status, output = machine.execute(f"systemctl is-active {quoted_unit}")
        state = output.strip()
        if status == 0 and state == "active":
            return
        if state == "failed":
            fail("failed to start")
        time.sleep(1)
    fail("did not become active before timeout")
def wait_for_command_or_dump(machine, command, unit=None, timeout=120):
    """Retry a shell command once per second until it exits 0, dumping diagnostics on timeout.

    Args:
        machine: object with `execute(command) -> (status, output)` and
            `succeed(command) -> output`.
        command: shell command run through `sh -lc` with ` 2>&1` appended.
        unit: optional systemd unit whose status and journal are added to the
            failure message.
        timeout: seconds to keep retrying.

    Raises:
        AssertionError: if the command never succeeds in time; the message
            contains the last command output, the optional unit diagnostics
            and the listening-socket table from `ss -ltnp`.
    """
    deadline = time.time() + timeout
    last_output = ""
    while time.time() < deadline:
        status, output = machine.execute(f"sh -lc {shlex.quote(command + ' 2>&1')}")
        last_output = output
        if status == 0:
            return
        time.sleep(1)

    diagnostics = f"last command output:\n{last_output}\n"
    if unit is not None:
        quoted_unit = shlex.quote(unit)
        service_status = machine.succeed(f"systemctl status {quoted_unit} --no-pager || true")
        journal = machine.succeed(f"journalctl -u {quoted_unit} -n 200 --no-pager || true")
        diagnostics += (
            f"systemctl status:\n{service_status}\n"
            f"journal:\n{journal}\n"
        )
    socket_state = machine.succeed("ss -ltnp || true")
    diagnostics += f"socket state:\n{socket_state}\n"
    raise AssertionError(
        f"command did not succeed before timeout: {command}\n{diagnostics}"
    )
# Boot every node, then wait for each service (and the sockets it should open)
# in dependency order before checking that all three BGP sessions are up.
start_all()
serial_stdout_off()


def listening_on(port):
    # Shell test that succeeds once some socket is listening on the TCP port.
    return f"ss -ltnH '( sport = :{port} )' | grep -q LISTEN"


# (node, unit, readiness commands run after the unit is active)
startup_plan = [
    (frr, "frr-zebra.service", ["test -S /run/frr/zserv.api"]),
    (frr, "frr-bgpd.service", [listening_on(179)]),
    (bird, "bird-peer.service", []),
    (gobgp, "gobgpd-peer.service", [listening_on(179)]),
    (lb, "iam.service", [listening_on(50080)]),
    (lb, "mock-backend.service", []),
    (lb, "fiberlb.service", [listening_on(50085), listening_on(9098)]),
]
for node, unit_name, readiness_commands in startup_plan:
    wait_for_unit_or_dump(node, unit_name)
    for readiness_command in readiness_commands:
        wait_for_command_or_dump(node, readiness_command, unit_name)

# Session state as seen from each peer.
frr.wait_until_succeeds("vtysh -c 'show ip bgp neighbor 192.168.100.2' | grep -F 'BGP state = Established'")
bird.wait_until_succeeds("birdc -s /run/bird.ctl show protocols all fiberlb_peer | grep -F Established")
gobgp.wait_until_succeeds("gobgp -u 127.0.0.1 -p 50051 neighbor | grep -F 192.168.100.2")

# Session state as reported by the FiberLB metrics.
wait_for_metric("fiberlb_bgp_configured_peers", 3)
for peer_endpoint in ("192.168.100.1:179", "192.168.100.3:179", "192.168.100.4:179"):
    wait_for_metric("fiberlb_bgp_peer_session_up", 1, {"peer": peer_endpoint})
wait_for_metric("fiberlb_bgp_connected_peers", 3)
# Create the load balancer, pool, backend, health check and listener through
# the FiberLB gRPC API, authenticated with a freshly issued project-admin token.
token = issue_project_admin_token(lb, "bgp-interop-org", "bgp-interop-project")
auth_headers = [f"authorization: Bearer {token}"]


def fiberlb_call(method, payload):
    # One authenticated call against the FiberLB API on the lb node.
    return grpcurl_json(
        lb,
        "127.0.0.1:50085",
        FIBERLB_PROTO_DIR,
        FIBERLB_PROTO,
        f"fiberlb.v1.{method}",
        payload,
        headers=auth_headers,
    )


lb_response = fiberlb_call(
    "LoadBalancerService/CreateLoadBalancer",
    {
        "name": "bgp-interop-lb",
        "orgId": "bgp-interop-org",
        "projectId": "bgp-interop-project",
        "description": "native bgp interop smoke",
        "vipAddress": "203.0.113.77",
    },
)
loadbalancer = lb_response["loadbalancer"]
lb_id = loadbalancer["id"]
vip = loadbalancer["vipAddress"]
vip_prefix = f"{vip}/32"

pool_response = fiberlb_call(
    "PoolService/CreatePool",
    {
        "name": "bgp-interop-pool",
        "loadbalancerId": lb_id,
        "algorithm": "POOL_ALGORITHM_ROUND_ROBIN",
        "protocol": "POOL_PROTOCOL_TCP",
    },
)
pool_id = pool_response["pool"]["id"]

backend_response = fiberlb_call(
    "BackendService/CreateBackend",
    {
        "name": "bgp-interop-backend",
        "poolId": pool_id,
        "address": "127.0.0.1",
        "port": 18081,
        "weight": 1,
    },
)
backend_id = backend_response["backend"]["id"]

fiberlb_call(
    "HealthCheckService/CreateHealthCheck",
    {
        "name": "bgp-interop-health",
        "poolId": pool_id,
        "type": "HEALTH_CHECK_TYPE_HTTP",
        "intervalSeconds": 1,
        "timeoutSeconds": 1,
        "healthyThreshold": 1,
        "unhealthyThreshold": 1,
        "httpConfig": {
            "method": "GET",
            "path": "/",
            "expectedCodes": [200],
        },
    },
)
fiberlb_call(
    "ListenerService/CreateListener",
    {
        "name": "bgp-interop-listener",
        "loadbalancerId": lb_id,
        "protocol": "LISTENER_PROTOCOL_TCP",
        "port": 18080,
        "defaultPoolId": pool_id,
    },
)
# Once the backend is healthy the VIP should be configured locally and
# advertised to all three peers.
wait_for_backend_status("BACKEND_STATUS_ONLINE", backend_id, token)
wait_for_local_vip(vip)
wait_for_metric("fiberlb_bgp_desired_routes", 1)
wait_for_frr_route(vip_prefix)
wait_for_bird_route(vip_prefix)
wait_for_gobgp_route(vip_prefix, True)

# Per-peer MED and community checks. The prefix comes from the API response
# (vip_prefix) instead of being repeated as a literal, so these assertions
# follow the address the load balancer actually reported.
frr_show_route = "vtysh -c " + shlex.quote("show ip bgp " + vip_prefix)
bird_show_route = f"birdc -s /run/bird.ctl show route for {shlex.quote(vip_prefix)} all"
frr.wait_until_succeeds(f"{frr_show_route} | grep -F 'metric 10'")
frr.wait_until_succeeds(f"{frr_show_route} | grep -F 'Community: 65010:101'")
bird.wait_until_succeeds(f"{bird_show_route} | grep -F 'BGP.med: 20'")
bird.wait_until_succeeds(f"{bird_show_route} | grep -F 'BGP.community: (65010,202)'")
# Stop the GoBGP peer: FiberLB should report one session down while FRR and
# BIRD still hold the route. Then start it again and expect the session and
# the advertised route to come back.
gobgp_unit = "gobgpd-peer.service"
gobgp_peer_labels = {"peer": "192.168.100.4:179"}

gobgp.succeed(f"systemctl stop {gobgp_unit}")
wait_for_metric("fiberlb_bgp_connected_peers", 2)
wait_for_metric("fiberlb_bgp_peer_session_up", 0, gobgp_peer_labels)
for wait_for_route in (wait_for_frr_route, wait_for_bird_route):
    wait_for_route(vip_prefix)

gobgp.succeed(f"systemctl start {gobgp_unit}")
wait_for_unit_or_dump(gobgp, gobgp_unit)
gobgp.wait_until_succeeds("gobgp -u 127.0.0.1 -p 50051 neighbor | grep -F 192.168.100.2")
wait_for_metric("fiberlb_bgp_connected_peers", 3)
wait_for_metric("fiberlb_bgp_peer_session_up", 1, gobgp_peer_labels)
wait_for_gobgp_route(vip_prefix, True)
'';
}