Move native runtime seed state into declarative Nix
Some checks failed
Nix CI / filter (push) Successful in 6s
Nix CI / gate () (push) Failing after 1s
Nix CI / gate (shared crates) (push) Has been skipped
Nix CI / build () (push) Has been skipped
Nix CI / ci-status (push) Failing after 1s

This commit is contained in:
centra 2026-03-28 07:38:45 +09:00
parent 9d21e2da95
commit d6d96b8c37
Signed by: centra
GPG key ID: 0C09689D20B25ACA
6 changed files with 868 additions and 410 deletions

View file

@ -13,6 +13,7 @@ This flow:
- builds all six VM images on the host
- boots the cluster in dependency order
- validates control-plane, worker, gateway, storage, and fault-injection behavior
- proves that `deployer` seeds scheduler-managed native services directly from declarative Nix cluster state
## Publishable Checks

View file

@ -184,6 +184,481 @@ let
};
};
# Submodule describing a service's logical ports. Both entries are
# optional; null means the service does not expose that port.
mkServicePortsType = types: types.submodule {
  options = {
    http = mkOption {
      type = types.nullOr types.port;
      default = null;
      description = "Optional HTTP port exposed by the service";
    };
    grpc = mkOption {
      type = types.nullOr types.port;
      default = null;
      description = "Optional gRPC port exposed by the service";
    };
  };
};
# Submodule for a process-based runtime: the command node-agent spawns,
# plus optional args, working directory, and environment variables.
mkProcessType = types: types.submodule {
  options = {
    command = mkOption {
      type = types.str;
      description = "Executable invoked by node-agent";
    };
    args = mkOption {
      type = types.listOf types.str;
      default = [ ];
      description = "Command-line arguments passed to the process";
    };
    workingDir = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional working directory used when spawning the process";
    };
    env = mkOption {
      type = types.attrsOf types.str;
      default = { };
      description = "Environment variables injected into the process";
    };
  };
};
# Submodule for one published container port: the container-side port is
# required; host port and protocol are optional.
mkContainerPortType = types: types.submodule {
  options = {
    containerPort = mkOption {
      type = types.port;
      description = "Port exposed inside the container";
    };
    hostPort = mkOption {
      type = types.nullOr types.port;
      default = null;
      description = "Optional fixed host port published for this container port";
    };
    protocol = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional transport protocol for the published port";
    };
  };
};
# Submodule for one container volume mount: host source and container
# target are required; mounts default to read-write.
mkContainerVolumeType = types: types.submodule {
  options = {
    source = mkOption {
      type = types.str;
      description = "Host-side volume source path";
    };
    target = mkOption {
      type = types.str;
      description = "Container mount target path";
    };
    readOnly = mkOption {
      type = types.bool;
      default = false;
      description = "Whether the volume should be mounted read-only";
    };
  };
};
# Submodule for a container-based runtime. The nested port and volume
# submodule types are built from the same `types` handle so the whole
# tree is constructed consistently.
mkContainerType = types:
  let
    containerPortType = mkContainerPortType types;
    containerVolumeType = mkContainerVolumeType types;
  in types.submodule {
    options = {
      image = mkOption {
        type = types.str;
        description = "Container image reference";
      };
      runtime = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Container runtime invoked by node-agent";
      };
      command = mkOption {
        type = types.listOf types.str;
        default = [ ];
        description = "Optional entrypoint override";
      };
      args = mkOption {
        type = types.listOf types.str;
        default = [ ];
        description = "Container arguments appended after the image";
      };
      env = mkOption {
        type = types.attrsOf types.str;
        default = { };
        description = "Environment variables passed to the container runtime";
      };
      ports = mkOption {
        type = types.listOf containerPortType;
        default = [ ];
        description = "Published container ports";
      };
      volumes = mkOption {
        type = types.listOf containerVolumeType;
        default = [ ];
        description = "Host volume mounts passed to the container runtime";
      };
      networkMode = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Optional container network mode";
      };
      pullPolicy = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Container image pull policy";
      };
      workingDir = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Optional container working directory";
      };
    };
  };
# Submodule describing the health check node-agent runs against a
# service instance; everything except the check type is optional.
mkHealthCheckType = types: types.submodule {
  options = {
    type = mkOption {
      type = types.str;
      description = "Health check type executed by node-agent";
    };
    path = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional path used by HTTP health checks";
    };
    intervalSecs = mkOption {
      type = types.nullOr types.ints.positive;
      default = null;
      description = "Health check interval in seconds";
    };
    timeoutSecs = mkOption {
      type = types.nullOr types.ints.positive;
      default = null;
      description = "Health check timeout in seconds";
    };
    startupGraceSecs = mkOption {
      type = types.nullOr types.ints.positive;
      default = null;
      description = "Startup grace period before a service is considered unhealthy";
    };
  };
};
# Submodule for scheduler placement filters (roles, pools, node classes,
# labels) plus a spread key and a per-node instance cap.
mkPlacementPolicyType = types: types.submodule {
  options = {
    roles = mkOption {
      type = types.listOf types.str;
      default = [ ];
      description = "Roles matched by the scheduler placement filter";
    };
    pools = mkOption {
      type = types.listOf types.str;
      default = [ ];
      description = "Pools matched by the scheduler placement filter";
    };
    nodeClasses = mkOption {
      type = types.listOf types.str;
      default = [ ];
      description = "Node classes matched by the scheduler placement filter";
    };
    matchLabels = mkOption {
      type = types.attrsOf types.str;
      default = { };
      description = "Additional label selectors matched by the scheduler";
    };
    spreadByLabel = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional spread key used when balancing replicas";
    };
    maxInstancesPerNode = mkOption {
      type = types.ints.positive;
      default = 1;
      description = "Maximum number of replicas the scheduler may place on one node";
    };
  };
};
# Submodule for rollout budgets; both limits default to 1.
mkRolloutStrategyType = types: types.submodule {
  options = {
    maxUnavailable = mkOption {
      type = types.ints.unsigned;
      default = 1;
      description = "Maximum unavailable instances allowed during a rollout";
    };
    maxSurge = mkOption {
      type = types.ints.unsigned;
      default = 1;
      description = "Maximum extra instances allowed during a rollout";
    };
  };
};
# Submodule for a service's scheduler-managed runtime intent: replica
# count, placement rules, rollout budget, instance/mesh ports, an
# optional process or container runtime spec, and an optional health
# check. Nothing here enforces that process and container are mutually
# exclusive — NOTE(review): confirm whether that is validated elsewhere.
mkServiceScheduleType = types:
  let
    placementPolicyType = mkPlacementPolicyType types;
    rolloutStrategyType = mkRolloutStrategyType types;
    processType = mkProcessType types;
    containerType = mkContainerType types;
    healthCheckType = mkHealthCheckType types;
  in types.submodule {
    options = {
      replicas = mkOption {
        type = types.ints.positive;
        default = 1;
        description = "Desired number of scheduler-managed replicas";
      };
      placement = mkOption {
        type = placementPolicyType;
        default = { };
        description = "Scheduler placement rules for the service";
      };
      rollout = mkOption {
        type = rolloutStrategyType;
        default = { };
        description = "Rollout budget used by the scheduler";
      };
      instancePort = mkOption {
        type = types.nullOr types.port;
        default = null;
        description = "Host port used when creating service instances";
      };
      meshPort = mkOption {
        type = types.nullOr types.port;
        default = null;
        description = "Optional service mesh port for the managed instances";
      };
      process = mkOption {
        type = types.nullOr processType;
        default = null;
        description = "Process-based runtime specification";
      };
      container = mkOption {
        type = types.nullOr containerType;
        default = null;
        description = "Container-based runtime specification";
      };
      healthCheck = mkOption {
        type = types.nullOr healthCheckType;
        default = null;
        description = "Health check performed by node-agent";
      };
    };
  };
# Submodule for FlashDNS publication: the zone is required; record name
# is optional; TTL defaults to 30; mode selects between publishing the
# load balancer VIP or a direct instance address.
mkDnsPublicationType = types: types.submodule {
  options = {
    zone = mkOption {
      type = types.str;
      description = "FlashDNS zone used for service publication";
    };
    name = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional record name inside the published zone";
    };
    ttl = mkOption {
      type = types.ints.positive;
      default = 30;
      description = "DNS TTL for the published record";
    };
    mode = mkOption {
      type = types.enum [ "load_balancer" "direct" ];
      default = "load_balancer";
      description = "Whether DNS publishes the load balancer VIP or a direct instance address";
    };
  };
};
# Submodule for FiberLB publication; every field is optional.
mkLoadBalancerPublicationType = types: types.submodule {
  options = {
    orgId = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional organization used when provisioning FiberLB resources";
    };
    projectId = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional project used when provisioning FiberLB resources";
    };
    name = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional explicit load balancer name";
    };
    listenerPort = mkOption {
      type = types.nullOr types.port;
      default = null;
      description = "Listener port exposed by the load balancer";
    };
    protocol = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Listener protocol for the published load balancer";
    };
    poolProtocol = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Backend pool protocol for the published load balancer";
    };
  };
};
# Submodule bundling a service's publication targets: shared org/project
# defaults plus optional DNS and load-balancer sections.
mkServicePublicationType = types:
  let
    dnsPublicationType = mkDnsPublicationType types;
    loadBalancerPublicationType = mkLoadBalancerPublicationType types;
  in types.submodule {
    options = {
      orgId = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Default organization used for service publication";
      };
      projectId = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Default project used for service publication";
      };
      dns = mkOption {
        type = types.nullOr dnsPublicationType;
        default = null;
        description = "Optional FlashDNS publication target";
      };
      loadBalancer = mkOption {
        type = types.nullOr loadBalancerPublicationType;
        default = null;
        description = "Optional FiberLB publication target";
      };
    };
  };
# Top-level service submodule: logical ports, protocol/mTLS/mesh flags,
# optional scheduler intent, and optional publication targets. Every
# option is nullable so a service may declare only what it needs.
mkServiceType = types:
  let
    servicePortsType = mkServicePortsType types;
    serviceScheduleType = mkServiceScheduleType types;
    servicePublicationType = mkServicePublicationType types;
  in types.submodule {
    options = {
      ports = mkOption {
        type = types.nullOr servicePortsType;
        default = null;
        description = "Optional logical service ports";
      };
      protocol = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Optional service protocol";
      };
      mtlsRequired = mkOption {
        type = types.nullOr types.bool;
        default = null;
        description = "Whether service-to-service traffic requires mTLS";
      };
      meshMode = mkOption {
        type = types.nullOr types.str;
        default = null;
        description = "Optional mesh publication mode";
      };
      schedule = mkOption {
        type = types.nullOr serviceScheduleType;
        default = null;
        description = "Scheduler-managed runtime intent";
      };
      publish = mkOption {
        type = types.nullOr servicePublicationType;
        default = null;
        description = "Optional publication targets for the service";
      };
    };
  };
# Submodule for one mTLS policy: source and target service matches are
# required; environment scope, enforcement flag, and mode are optional.
mkMtlsPolicyType = types: types.submodule {
  options = {
    environment = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional environment scope for the policy";
    };
    sourceService = mkOption {
      type = types.str;
      description = "Source service matched by the policy";
    };
    targetService = mkOption {
      type = types.str;
      description = "Target service matched by the policy";
    };
    mtlsRequired = mkOption {
      type = types.nullOr types.bool;
      default = null;
      description = "Whether the policy enforces mTLS";
    };
    mode = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional policy mode";
    };
  };
};
mkNodeType = types:
let
installPlanType = mkInstallPlanType types;
@ -664,6 +1139,205 @@ let
node_id_prefix = rule.nodeIdPrefix;
};
# Map the optional service-ports submodule value to the deployer spec,
# emitting only ports that are set. `ports` itself may be null, hence
# the null guard on each field.
mkServicePorts = ports:
  optionalAttrs (ports != null && ports.http != null) {
    http = ports.http;
  }
  // optionalAttrs (ports != null && ports.grpc != null) {
    grpc = ports.grpc;
  };
# Map a process runtime definition to the deployer's snake_case spec;
# working_dir is included only when configured.
mkProcessSpec = process:
  {
    command = process.command;
    args = process.args;
    env = process.env;
  }
  // optionalAttrs (process.workingDir != null) {
    working_dir = process.workingDir;
  };
# Map one container port entry; host_port and protocol only when set.
mkContainerPortSpec = port:
  {
    container_port = port.containerPort;
  }
  // optionalAttrs (port.hostPort != null) {
    host_port = port.hostPort;
  }
  // optionalAttrs (port.protocol != null) {
    protocol = port.protocol;
  };
# Map one volume mount; read_only is emitted only when true, so the
# consumer's read-write default applies otherwise.
mkContainerVolumeSpec = volume:
  {
    source = volume.source;
    target = volume.target;
  }
  // optionalAttrs volume.readOnly {
    read_only = true;
  };
# Map a container runtime definition to the deployer spec; list/attr
# fields are always emitted (possibly empty), nullable scalars only
# when set.
mkContainerSpec = container:
  {
    image = container.image;
    command = container.command;
    args = container.args;
    env = container.env;
    ports = map mkContainerPortSpec container.ports;
    volumes = map mkContainerVolumeSpec container.volumes;
  }
  // optionalAttrs (container.runtime != null) {
    runtime = container.runtime;
  }
  // optionalAttrs (container.networkMode != null) {
    network_mode = container.networkMode;
  }
  // optionalAttrs (container.pullPolicy != null) {
    pull_policy = container.pullPolicy;
  }
  // optionalAttrs (container.workingDir != null) {
    working_dir = container.workingDir;
  };
# Map a health check definition; only `type` is mandatory.
mkHealthCheckSpec = healthCheck:
  {
    type = healthCheck.type;
  }
  // optionalAttrs (healthCheck.path != null) {
    path = healthCheck.path;
  }
  // optionalAttrs (healthCheck.intervalSecs != null) {
    interval_secs = healthCheck.intervalSecs;
  }
  // optionalAttrs (healthCheck.timeoutSecs != null) {
    timeout_secs = healthCheck.timeoutSecs;
  }
  // optionalAttrs (healthCheck.startupGraceSecs != null) {
    startup_grace_secs = healthCheck.startupGraceSecs;
  };
# Map placement rules; spread_by_label only when configured.
mkPlacementPolicySpec = placement:
  {
    roles = placement.roles;
    pools = placement.pools;
    node_classes = placement.nodeClasses;
    match_labels = placement.matchLabels;
    max_instances_per_node = placement.maxInstancesPerNode;
  }
  // optionalAttrs (placement.spreadByLabel != null) {
    spread_by_label = placement.spreadByLabel;
  };
# Map the rollout budget; both fields are always present.
mkRolloutStrategySpec = rollout: {
  max_unavailable = rollout.maxUnavailable;
  max_surge = rollout.maxSurge;
};
# Map a schedule definition to the deployer spec. If both `process` and
# `container` are non-null, both are emitted — no mutual exclusion is
# enforced here.
mkServiceScheduleSpec = schedule:
  {
    replicas = schedule.replicas;
    placement = mkPlacementPolicySpec schedule.placement;
    rollout = mkRolloutStrategySpec schedule.rollout;
  }
  // optionalAttrs (schedule.instancePort != null) {
    instance_port = schedule.instancePort;
  }
  // optionalAttrs (schedule.meshPort != null) {
    mesh_port = schedule.meshPort;
  }
  // optionalAttrs (schedule.process != null) {
    process = mkProcessSpec schedule.process;
  }
  // optionalAttrs (schedule.container != null) {
    container = mkContainerSpec schedule.container;
  }
  // optionalAttrs (schedule.healthCheck != null) {
    health_check = mkHealthCheckSpec schedule.healthCheck;
  };
# Map a DNS publication; zone/ttl/mode are always present, name only
# when set.
mkDnsPublicationSpec = dns:
  {
    zone = dns.zone;
    ttl = dns.ttl;
    mode = dns.mode;
  }
  // optionalAttrs (dns.name != null) {
    name = dns.name;
  };
# Map a load balancer publication; every field is optional, so the
# result may be an empty attrset.
mkLoadBalancerPublicationSpec = loadBalancer:
  optionalAttrs (loadBalancer.orgId != null) {
    org_id = loadBalancer.orgId;
  }
  // optionalAttrs (loadBalancer.projectId != null) {
    project_id = loadBalancer.projectId;
  }
  // optionalAttrs (loadBalancer.name != null) {
    name = loadBalancer.name;
  }
  // optionalAttrs (loadBalancer.listenerPort != null) {
    listener_port = loadBalancer.listenerPort;
  }
  // optionalAttrs (loadBalancer.protocol != null) {
    protocol = loadBalancer.protocol;
  }
  // optionalAttrs (loadBalancer.poolProtocol != null) {
    pool_protocol = loadBalancer.poolProtocol;
  };
# Map the publication bundle, recursing into the dns/load_balancer
# sections only when they are present.
mkServicePublicationSpec = publish:
  optionalAttrs (publish.orgId != null) {
    org_id = publish.orgId;
  }
  // optionalAttrs (publish.projectId != null) {
    project_id = publish.projectId;
  }
  // optionalAttrs (publish.dns != null) {
    dns = mkDnsPublicationSpec publish.dns;
  }
  // optionalAttrs (publish.loadBalancer != null) {
    load_balancer = mkLoadBalancerPublicationSpec publish.loadBalancer;
  };
# Convert one declarative service (keyed by attribute name) into the
# deployer's snake_case service spec. Optional fields are emitted only
# when set so the generated document stays minimal.
mkDeployerServiceSpec = name: service:
  let
    # Evaluate the ports mapping once. The previous version called
    # mkServicePorts twice: once inside the optionalAttrs guard and
    # again to build the attribute value.
    servicePorts =
      if service.ports != null then mkServicePorts service.ports else { };
  in
  {
    inherit name;
  }
  // optionalAttrs (servicePorts != { }) {
    ports = servicePorts;
  }
  // optionalAttrs (service.protocol != null) {
    protocol = service.protocol;
  }
  // optionalAttrs (service.mtlsRequired != null) {
    mtls_required = service.mtlsRequired;
  }
  // optionalAttrs (service.meshMode != null) {
    mesh_mode = service.meshMode;
  }
  // optionalAttrs (service.schedule != null) {
    schedule = mkServiceScheduleSpec service.schedule;
  }
  // optionalAttrs (service.publish != null) {
    publish = mkServicePublicationSpec service.publish;
  };
# Convert one declarative mTLS policy into the deployer spec; the
# attribute name becomes the policy_id. Optional fields are emitted
# only when set.
mkDeployerMtlsPolicySpec = name: policy:
  {
    policy_id = name;
    source_service = policy.sourceService;
    target_service = policy.targetService;
  }
  // optionalAttrs (policy.environment != null) {
    environment = policy.environment;
  }
  // optionalAttrs (policy.mtlsRequired != null) {
    mtls_required = policy.mtlsRequired;
  }
  // optionalAttrs (policy.mode != null) {
    mode = policy.mode;
  };
mkClusterConfig = {
cluster,
hostname,
@ -729,6 +1403,8 @@ let
pools = deployer.pools or { };
enrollmentRules = deployer.enrollmentRules or { };
hostDeployments = deployer.hostDeployments or { };
services = deployer.services or { };
mtlsPolicies = deployer.mtlsPolicies or { };
in {
cluster = {
cluster_id = clusterId;
@ -740,9 +1416,9 @@ let
pools = map (name: mkDeployerPoolSpec name pools.${name}) (attrNames pools);
enrollment_rules = map (name: mkDeployerEnrollmentRuleSpec name enrollmentRules.${name}) (attrNames enrollmentRules);
host_deployments = map (name: mkDeployerHostDeploymentSpec name hostDeployments.${name}) (attrNames hostDeployments);
services = [ ];
services = map (name: mkDeployerServiceSpec name services.${name}) (attrNames services);
instances = [ ];
mtls_policies = [ ];
mtls_policies = map (name: mkDeployerMtlsPolicySpec name mtlsPolicies.${name}) (attrNames mtlsPolicies);
};
in
{
@ -751,6 +1427,20 @@ in
mkDesiredSystemType
mkHostDeploymentSelectorType
mkHostDeploymentType
mkServicePortsType
mkProcessType
mkContainerPortType
mkContainerVolumeType
mkContainerType
mkHealthCheckType
mkPlacementPolicyType
mkRolloutStrategyType
mkServiceScheduleType
mkDnsPublicationType
mkLoadBalancerPublicationType
mkServicePublicationType
mkServiceType
mkMtlsPolicyType
mkNodeType
mkNodeClassType
mkNodePoolType

View file

@ -10,6 +10,8 @@ let
nodePoolType = clusterConfigLib.mkNodePoolType types;
enrollmentRuleType = clusterConfigLib.mkEnrollmentRuleType types;
hostDeploymentType = clusterConfigLib.mkHostDeploymentType types;
serviceType = clusterConfigLib.mkServiceType types;
mtlsPolicyType = clusterConfigLib.mkMtlsPolicyType types;
jsonFormat = pkgs.formats.json { };
# Generate cluster-config.json for the current node
@ -105,6 +107,18 @@ in {
default = { };
description = "Declarative host rollout objects derived from Nix";
};
# Scheduler-managed service definitions, keyed by service name.
services = mkOption {
  type = types.attrsOf serviceType;
  default = { };
  description = "Scheduler-managed service definitions derived from Nix";
};
# Declarative mTLS policies, keyed by the attribute name (used as the
# policy id downstream).
mtlsPolicies = mkOption {
  type = types.attrsOf mtlsPolicyType;
  default = { };
  description = "Declarative mTLS policies derived from Nix";
};
};
generated = {

View file

@ -14,6 +14,7 @@ All VM images are built on the host in a single Nix invocation and then booted a
- gateway-node `apigateway`, `nightlight`, and minimal `creditservice` startup
- host-forwarded access to the API gateway and NightLight HTTP surfaces
- cross-node data replication smoke tests for `chainfire` and `flaredb`
- deployer-seeded native runtime scheduling from declarative Nix service definitions, including drain/failover recovery
## Validation layers

View file

@ -201,6 +201,7 @@ in
pool = "general";
nodeClass = "worker-linux";
failureDomain = "zone-b";
state = "provisioning";
raftPort = 2380;
apiPort = 2379;
};
@ -214,6 +215,7 @@ in
pool = "general";
nodeClass = "worker-linux";
failureDomain = "zone-c";
state = "provisioning";
raftPort = 2380;
apiPort = 2379;
};
@ -273,6 +275,95 @@ in
};
};
};
services = {
  # Process-based test service: two replicas of a Python HTTP server,
  # spread across failure domains and published via DNS + load balancer.
  native-web = {
    protocol = "http";
    ports.http = 18190;
    schedule = {
      replicas = 2;
      placement = {
        roles = [ "worker" ];
        pools = [ "general" ];
        nodeClasses = [ "worker-linux" ];
        matchLabels = {
          runtime = "native";
        };
        spreadByLabel = "failure_domain";
        maxInstancesPerNode = 1;
      };
      instancePort = 18190;
      process = {
        command = "python3";
        # The ${...} placeholders are escaped so Nix passes them through
        # literally — presumably node-agent substitutes them at spawn
        # time; confirm against the agent's process handling.
        args = [
          "-m"
          "http.server"
          "\${INSTANCE_PORT}"
          "--bind"
          "\${INSTANCE_IP}"
        ];
      };
      healthCheck = {
        type = "http";
        path = "/";
        intervalSecs = 5;
        timeoutSecs = 3;
      };
    };
    publish = {
      dns = {
        zone = "native.cluster.test";
        name = "web";
        ttl = 30;
        mode = "load_balancer";
      };
      loadBalancer = {
        orgId = "native-services";
        projectId = "test-cluster";
        listenerPort = 18191;
        protocol = "http";
        poolProtocol = "http";
      };
    };
  };
  # Container-based test service: a single nginx replica run via podman.
  native-container = {
    protocol = "http";
    ports.http = 18192;
    schedule = {
      replicas = 1;
      placement = {
        roles = [ "worker" ];
        pools = [ "general" ];
        nodeClasses = [ "worker-linux" ];
        matchLabels = {
          runtime = "native";
        };
        maxInstancesPerNode = 1;
      };
      instancePort = 18192;
      container = {
        image = "docker.io/library/nginx:1.27-alpine";
        runtime = "podman";
        pullPolicy = "if-not-present";
        ports = [
          {
            containerPort = 80;
            hostPort = 18192;
            protocol = "tcp";
          }
        ];
      };
      healthCheck = {
        type = "http";
        path = "/";
        intervalSecs = 5;
        timeoutSecs = 5;
        # NOTE(review): 120s grace presumably covers the first-time
        # image pull on test VMs — confirm.
        startupGraceSecs = 120;
      };
    };
  };
};
};
bootstrap.initialPeers = [ "node01" "node02" "node03" ];

View file

@ -4805,28 +4805,23 @@ validate_deployer_flow() {
}
validate_native_runtime_flow() {
log "Validating native deployer + scheduler runtime orchestration"
log "Validating native deployer + scheduler runtime orchestration from declarative Nix seed"
wait_for_unit node04 node-agent
wait_for_unit node05 node-agent
wait_for_unit node06 fleet-scheduler
wait_for_http node06 "http://127.0.0.1:8088/health"
local tmp_dir native_config drained_config restored_config
local chainfire_tunnel_node01="" chainfire_tunnel_node02="" chainfire_tunnel_node03=""
local chainfire_endpoint="http://127.0.0.1:12379,http://127.0.0.1:12380,http://127.0.0.1:12381"
local iam_tunnel="" lb_tunnel="" token lb_name
local native_fresh_healthy_map_expr native_fresh_healthy_count_expr
tmp_dir="$(mktemp -d -p "${TMPDIR:-/tmp}" photon-native-runtime-XXXXXX)"
native_config="${tmp_dir}/native-runtime.yaml"
drained_config="${tmp_dir}/native-runtime-drained.yaml"
restored_config="${tmp_dir}/native-runtime-restored.yaml"
native_fresh_healthy_map_expr='map(select(.state == "healthy" and (((((.last_heartbeat // .observed_at) // "") | sub("\\.[0-9]+"; "") | sub("\\+00:00$"; "Z") | fromdateiso8601?) // 0) >= (now - 300))))'
native_fresh_healthy_count_expr="${native_fresh_healthy_map_expr} | length"
chainfire_tunnel_node01="$(start_ssh_tunnel node01 12379 2379 "${NODE_IPS[node01]}")"
chainfire_tunnel_node02="$(start_ssh_tunnel node02 12380 2379 "${NODE_IPS[node02]}")"
chainfire_tunnel_node03="$(start_ssh_tunnel node03 12381 2379 "${NODE_IPS[node03]}")"
trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"; stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"; stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"; stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"; rm -rf "${tmp_dir}"' RETURN
trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"; stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"; stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"; stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"' RETURN
native_dump_values() {
local prefix="$1"
@ -4879,7 +4874,13 @@ validate_native_runtime_flow() {
local instance_value="" node_id=""
while true; do
instance_value="$(native_first_healthy_instance "${service}")"
instance_value="$(
native_dump_values "photoncloud/clusters/test-cluster/instances/${service}/" \
| sed '/^$/d' \
| jq -sr \
--arg node "${expected_node}" \
"${native_fresh_healthy_map_expr} | map(select(.node_id == \$node)) | sort_by(.instance_id) | first"
)"
node_id="$(printf '%s' "${instance_value}" | jq -r '.node_id // empty')"
if [[ "${node_id}" == "${expected_node}" ]]; then
printf '%s' "${instance_value}"
@ -4955,373 +4956,22 @@ validate_native_runtime_flow() {
done
}
cat >"${native_config}" <<'EOF'
cluster:
cluster_id: test-cluster
environment: test
node_classes:
- name: worker-linux
description: Native runtime worker
roles:
- worker
labels:
tier: general
runtime: native
pools:
- name: general
description: General-purpose native worker pool
node_class: worker-linux
labels:
pool.photoncloud.io/name: general
nodes:
- node_id: node04
hostname: node04
ip: 10.100.0.21
roles:
- worker
labels:
runtime: native
pool: general
node_class: worker-linux
failure_domain: zone-b
state: provisioning
- node_id: node05
hostname: node05
ip: 10.100.0.22
roles:
- worker
labels:
runtime: native
pool: general
node_class: worker-linux
failure_domain: zone-c
state: provisioning
services:
- name: native-web
protocol: http
ports:
http: 18190
schedule:
replicas: 2
placement:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
runtime: native
spread_by_label: failure_domain
max_instances_per_node: 1
instance_port: 18190
process:
command: python3
args:
- -m
- http.server
- ${INSTANCE_PORT}
- --bind
- ${INSTANCE_IP}
health_check:
type: http
path: /
interval_secs: 5
timeout_secs: 3
publish:
dns:
zone: native.cluster.test
name: web
ttl: 30
mode: load_balancer
load_balancer:
org_id: native-services
project_id: test-cluster
listener_port: 18191
protocol: http
pool_protocol: http
- name: native-container
protocol: http
ports:
http: 18192
schedule:
replicas: 1
placement:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
runtime: native
max_instances_per_node: 1
instance_port: 18192
container:
image: docker.io/library/nginx:1.27-alpine
runtime: podman
pull_policy: if-not-present
ports:
- container_port: 80
host_port: 18192
protocol: tcp
health_check:
type: http
path: /
interval_secs: 5
timeout_secs: 5
startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
cat >"${drained_config}" <<'EOF'
cluster:
cluster_id: test-cluster
environment: test
node_classes:
- name: worker-linux
description: Native runtime worker
roles:
- worker
labels:
tier: general
runtime: native
pools:
- name: general
description: General-purpose native worker pool
node_class: worker-linux
labels:
pool.photoncloud.io/name: general
nodes:
- node_id: node04
hostname: node04
ip: 10.100.0.21
roles:
- worker
labels:
runtime: native
pool: general
node_class: worker-linux
failure_domain: zone-b
state: draining
- node_id: node05
hostname: node05
ip: 10.100.0.22
roles:
- worker
labels:
runtime: native
pool: general
node_class: worker-linux
failure_domain: zone-c
state: active
services:
- name: native-web
protocol: http
ports:
http: 18190
schedule:
replicas: 1
placement:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
runtime: native
spread_by_label: failure_domain
max_instances_per_node: 1
instance_port: 18190
process:
command: python3
args:
- -m
- http.server
- ${INSTANCE_PORT}
- --bind
- ${INSTANCE_IP}
health_check:
type: http
path: /
interval_secs: 5
timeout_secs: 3
publish:
dns:
zone: native.cluster.test
name: web
ttl: 30
mode: load_balancer
load_balancer:
org_id: native-services
project_id: test-cluster
listener_port: 18191
protocol: http
pool_protocol: http
- name: native-container
protocol: http
ports:
http: 18192
schedule:
replicas: 1
placement:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
runtime: native
max_instances_per_node: 1
instance_port: 18192
container:
image: docker.io/library/nginx:1.27-alpine
runtime: podman
pull_policy: if-not-present
ports:
- container_port: 80
host_port: 18192
protocol: tcp
health_check:
type: http
path: /
interval_secs: 5
timeout_secs: 5
startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
cat >"${restored_config}" <<'EOF'
cluster:
cluster_id: test-cluster
environment: test
node_classes:
- name: worker-linux
description: Native runtime worker
roles:
- worker
labels:
tier: general
runtime: native
pools:
- name: general
description: General-purpose native worker pool
node_class: worker-linux
labels:
pool.photoncloud.io/name: general
nodes:
- node_id: node04
hostname: node04
ip: 10.100.0.21
roles:
- worker
labels:
runtime: native
pool: general
node_class: worker-linux
failure_domain: zone-b
state: active
- node_id: node05
hostname: node05
ip: 10.100.0.22
roles:
- worker
labels:
runtime: native
pool: general
node_class: worker-linux
failure_domain: zone-c
state: active
services:
- name: native-web
protocol: http
ports:
http: 18190
schedule:
replicas: 1
placement:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
runtime: native
spread_by_label: failure_domain
max_instances_per_node: 1
instance_port: 18190
process:
command: python3
args:
- -m
- http.server
- ${INSTANCE_PORT}
- --bind
- ${INSTANCE_IP}
health_check:
type: http
path: /
interval_secs: 5
timeout_secs: 3
publish:
dns:
zone: native.cluster.test
name: web
ttl: 30
mode: load_balancer
load_balancer:
org_id: native-services
project_id: test-cluster
listener_port: 18191
protocol: http
pool_protocol: http
- name: native-container
protocol: http
ports:
http: 18192
schedule:
replicas: 1
placement:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
runtime: native
max_instances_per_node: 1
instance_port: 18192
container:
image: docker.io/library/nginx:1.27-alpine
runtime: podman
pull_policy: if-not-present
ports:
- container_port: 80
host_port: 18192
protocol: tcp
health_check:
type: http
path: /
interval_secs: 5
timeout_secs: 5
startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
set_native_node_state() {
local node_id="$1"
local state="$2"
run_deployer_ctl \
--chainfire-endpoint "${chainfire_endpoint}" \
--cluster-id "test-cluster" \
--cluster-namespace "photoncloud" \
--deployer-namespace "deployer" \
apply --config "${native_config}"
node set-state --node-id "${node_id}" --state "${state}"
}
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/services/" \
'map(select(.name == "native-web" or .name == "native-container")) | length' \
"2" \
180
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/nodes/" \
'map(select(.labels.runtime == "native" and .state == "active")) | length' \
@ -5390,13 +5040,13 @@ EOF
wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22
run_deployer_ctl \
--chainfire-endpoint "${chainfire_endpoint}" \
--cluster-id "test-cluster" \
--cluster-namespace "photoncloud" \
--deployer-namespace "deployer" \
apply --config "${drained_config}"
log "Draining node04 through deployer lifecycle state"
set_native_node_state "node04" "draining"
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/nodes/" \
'map(select(.node_id == "node04" and .state == "draining")) | length' \
"1" \
120
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-web/" \
'length' \
@ -5433,44 +5083,38 @@ EOF
wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.22
wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
run_deployer_ctl \
--chainfire-endpoint "${chainfire_endpoint}" \
--cluster-id "test-cluster" \
--cluster-namespace "photoncloud" \
--deployer-namespace "deployer" \
apply --config "${restored_config}"
log "Restoring node04 and ensuring capacity returns without moving healthy singleton work"
set_native_node_state "node04" "active"
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/nodes/" \
'map(select(.node_id == "node04" and .state == "active")) | length' \
"1" \
120
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-web/" \
'length' \
"1" \
"2" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-web/" \
"${native_fresh_healthy_count_expr}" \
"1" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-container/" \
'length' \
"1" \
"2" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-container/" \
"${native_fresh_healthy_count_expr}" \
"1" \
240
local restored_web_value restored_web_node restored_container_value restored_container_node
restored_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)"
restored_web_node="$(printf '%s' "${restored_web_value}" | jq -r '.node_id')"
[[ "${restored_web_node}" == "node05" ]] || die "native-web unexpectedly moved after node04 returned to service"
wait_for_native_instance_node "native-web" "node04" 240 >/dev/null
wait_for_native_instance_node "native-web" "node05" 240 >/dev/null
local restored_container_value restored_container_node
restored_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
restored_container_node="$(printf '%s' "${restored_container_value}" | jq -r '.node_id')"
[[ "${restored_container_node}" == "node05" ]] || die "native-container unexpectedly moved after node04 returned to service"
publication_value="$(native_publication_state)"
publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.22
wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22
wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
wait_for_http node01 "http://127.0.0.1:18191/" 240
@ -5505,24 +5149,42 @@ EOF
wait_for_http node04 "http://10.100.0.21:18192/" 240
wait_for_http node01 "http://127.0.0.1:18191/" 240
log "Restarting native worker and ensuring placement stays stable"
log "Restarting native worker and ensuring declarative replica count is restored"
start_vm node05
wait_for_ssh node05
wait_for_unit node05 plasmavmc
wait_for_unit node05 lightningstor
wait_for_unit node05 node-agent
local recovered_web_value recovered_web_node recovered_container_value recovered_container_node
recovered_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)"
recovered_web_node="$(printf '%s' "${recovered_web_value}" | jq -r '.node_id')"
[[ "${recovered_web_node}" == "node04" ]] || die "native-web unexpectedly churned after node05 recovered"
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/nodes/" \
'map(select(.labels.runtime == "native" and .state == "active")) | length' \
"2" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-web/" \
'length' \
"2" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-web/" \
"${native_fresh_healthy_count_expr}" \
"2" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-container/" \
"${native_fresh_healthy_count_expr}" \
"1" \
240
wait_for_native_instance_node "native-web" "node04" 240 >/dev/null
wait_for_native_instance_node "native-web" "node05" 240 >/dev/null
local recovered_container_value recovered_container_node
recovered_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
recovered_container_node="$(printf '%s' "${recovered_container_value}" | jq -r '.node_id')"
[[ "${recovered_container_node}" == "node04" ]] || die "native-container unexpectedly churned after node05 recovered"
publication_value="$(native_publication_state)"
publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.21
wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22
wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
wait_for_http node01 "http://127.0.0.1:18191/" 240
@ -5532,7 +5194,6 @@ EOF
stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"
stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"
stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"
rm -rf "${tmp_dir}"
}
validate_network_provider_matrix() {