photoncloud-monorepo/nix/modules/nix-agent.nix

111 lines
4.1 KiB
Nix

{ config, lib, pkgs, ... }:
let
  # Evaluated option values for this module.
  cfg = config.services.nix-agent;

  # Quote a single argv item for a systemd Exec* line.
  #
  # systemd does not run ExecStart through a shell: it performs its own
  # word splitting, expands %-specifiers and substitutes $VAR / ${VAR} even
  # inside quotes. Shell escaping therefore does not protect arbitrary text.
  # JSON string quoting yields a double-quoted, C-escaped word that systemd
  # unquotes back to the original value; "%" and "$" are doubled so they are
  # passed through literally. This is the same scheme nixpkgs uses in
  # utils.escapeSystemdExecArg.
  systemdQuote = arg: lib.replaceStrings [ "%" "$" ] [ "%%" "$$" ] (builtins.toJSON arg);

  # Optional command-line flags derived from the option values:
  #   - one --health-check-command flag per healthCheckCommand entry,
  #   - --rollback-on-failure when cfg.rollbackOnFailure is set,
  #   - --apply when cfg.apply is set.
  extraArgs =
    map (arg: "--health-check-command ${systemdQuote arg}") cfg.healthCheckCommand
    ++ lib.optionals cfg.rollbackOnFailure [ "--rollback-on-failure" ]
    ++ lib.optionals cfg.apply [ "--apply" ];

  # The flags joined with backslash-newline continuations so they can be
  # appended to the multi-line ExecStart command; empty when no flag applies.
  renderedExtraArgs = lib.concatStringsSep " \\\n " extraArgs;
in
{
# Option declarations for the nix-agent service. Every option below is
# rendered into the agent's command line by the systemd unit in `config`.
options.services.nix-agent = {
  enable = lib.mkEnableOption "UltraCloud nix-agent service (host OS convergence only, consuming desired-system state published by deployer)";

  chainfireEndpoint = lib.mkOption {
    type = lib.types.str;
    default = "http://127.0.0.1:2379";
    description = "ChainFire endpoint consumed by nix-agent for desired-system data; nix-agent does not schedule native services.";
  };

  clusterNamespace = lib.mkOption {
    type = lib.types.str;
    default = "ultracloud";
    description = "Cluster namespace prefix";
  };

  # No default: enabling the service without setting clusterId is an
  # evaluation error.
  clusterId = lib.mkOption {
    type = lib.types.str;
    description = "Cluster ID reconciled by nix-agent";
  };

  nodeId = lib.mkOption {
    type = lib.types.str;
    default = config.networking.hostName;
    # The default depends on `config`, so give the documentation a literal
    # expression to print instead of forcing it to evaluate the value.
    defaultText = lib.literalExpression "config.networking.hostName";
    description = "Node ID represented by this agent";
  };

  flakeRoot = lib.mkOption {
    type = lib.types.str;
    default = "/etc/nixos";
    description = "Flake root used to build target nixosConfigurations";
  };

  intervalSecs = lib.mkOption {
    # A negative interval is meaningless and would be rendered as
    # `--interval-secs -N` on the command line; reject it at evaluation time.
    type = lib.types.ints.unsigned;
    default = 30;
    description = "Polling interval in seconds";
  };

  switchAction = lib.mkOption {
    type = lib.types.enum [ "switch" "test" "boot" "dry-activate" ];
    default = "switch";
    description = "switch-to-configuration action executed after building the target system";
  };

  healthCheckCommand = lib.mkOption {
    type = lib.types.listOf lib.types.str;
    default = [ ];
    description = "Command vector executed after activation to verify node health. Entries are argv items, not a shell snippet; a non-zero exit triggers rollback when rollbackOnFailure is enabled.";
  };

  rollbackOnFailure = lib.mkOption {
    type = lib.types.bool;
    default = true;
    description = "Roll back to the previous system if the post-activation health check fails, leaving observed status as rolled-back instead of keeping the rejected target active.";
  };

  apply = lib.mkOption {
    type = lib.types.bool;
    default = true;
    description = "Apply desired NixOS system state on the node; runtime process placement remains the node-agent and fleet-scheduler path.";
  };

  package = lib.mkOption {
    type = lib.types.package;
    default = pkgs.nix-agent or (throw "nix-agent package not found");
    # The default depends on `pkgs` and may throw when the package is not in
    # the package set; defaultText keeps documentation builds from forcing it.
    defaultText = lib.literalExpression "pkgs.nix-agent";
    description = "Package to use for nix-agent";
  };
};
# Service definition; only emitted when services.nix-agent.enable is set.
config = lib.mkIf cfg.enable {
  systemd.services.nix-agent = {
    description = "UltraCloud Nix Agent (host OS reconcile only)";
    wantedBy = [ "multi-user.target" ];
    # Order after, and pull in, network-online.target.
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    # Put config.system.path on the unit's PATH.
    path = [ config.system.path ];
    serviceConfig = {
      Type = "simple";
      Restart = "on-failure";
      RestartSec = "5s";
      # The agent can invoke switch-to-configuration on its own host. Keep
      # that child process alive when systemd stops the agent during the
      # switch transaction so activation can finish and restart services in
      # the new generation.
      KillMode = "process";
      ExecStart =
        let
          # Quote one argv item for systemd rather than for a shell. systemd
          # parses ExecStart itself and expands %-specifiers and $VAR /
          # ${VAR} even inside quotes, so shell escaping is not sufficient
          # (e.g. a percent-encoded endpoint URL). JSON string quoting plus
          # doubling "%" and "$" round-trips the value unchanged; this is
          # the scheme nixpkgs uses in utils.escapeSystemdExecArg.
          quote = value: lib.replaceStrings [ "%" "$" ] [ "%%" "$$" ] (builtins.toJSON value);
        in
        ''
          ${cfg.package}/bin/nix-agent \
            --chainfire-endpoint ${quote cfg.chainfireEndpoint} \
            --cluster-namespace ${quote cfg.clusterNamespace} \
            --cluster-id ${quote cfg.clusterId} \
            --node-id ${quote cfg.nodeId} \
            --flake-root ${quote cfg.flakeRoot} \
            --interval-secs ${toString cfg.intervalSecs} \
            --switch-action ${quote cfg.switchAction}${lib.optionalString (renderedExtraArgs != "") " \\\n ${renderedExtraArgs}"}
        '';
    };
  };
};
}