- Remove queue_config.max_retries option from observability.nix - Option deprecated/removed in recent NixOS/Prometheus versions - Found by nix eval audit (T039.S3 pre-deployment validation) Error: `services.prometheus.remoteWrite."[...]".queue_config.max_retries' does not exist 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
338 lines
8.2 KiB
Nix
338 lines
8.2 KiB
Nix
# NixOS module: cloud platform observability stack (Prometheus, Grafana, Loki).
# Everything under `config` is gated on `services.cloud-observability.enable`.
#
# `pkgs` is not used anywhere in this module, so it is not named here; the
# trailing `...` still accepts it (and any other argument) from the module system.
{ config, lib, ... }:

let
  # Shorthand for this module's own option values.
  cfg = config.services.cloud-observability;
in
|
|
{
|
|
# Option declarations for the observability stack.
options.services.cloud-observability =
  let
    inherit (lib) mkEnableOption mkOption types;

    # The three listener options differ only in default value and description.
    mkPortOption = default: description: mkOption {
      type = types.port;
      inherit default description;
    };
  in
  {
    enable = mkEnableOption "cloud platform observability stack (Prometheus, Grafana, Loki)";

    prometheusPort = mkPortOption 9090 "Port for Prometheus web interface";

    grafanaPort = mkPortOption 3000 "Port for Grafana web interface";

    lokiPort = mkPortOption 3100 "Port for Loki API";

    # Duration string such as "15s".
    scrapeInterval = mkOption {
      type = types.str;
      default = "15s";
      description = "Default Prometheus scrape interval";
    };

    enableAllTargets = mkOption {
      type = types.bool;
      default = true;
      description = "Enable scraping all cloud platform services";
    };
  };
|
|
|
|
config = lib.mkIf cfg.enable {
|
|
# Prometheus configuration: server, remote_write forwarding, scrape jobs and
# the node exporter.
services.prometheus =
  let
    # Build a scrape job with a single static target on this host.
    #   name: value used for job_name
    #   port: TCP port of the metrics endpoint on localhost
    mkLocalJob = name: port: {
      job_name = name;
      static_configs = [{
        targets = [ "localhost:${toString port}" ];
      }];
    };

    # Shared by the exporter definition and its scrape job so they cannot drift.
    nodeExporterPort = 9100;
  in
  {
    enable = true;
    port = cfg.prometheusPort;

    # Rule evaluation runs at the same cadence as scraping.
    globalConfig = {
      scrape_interval = cfg.scrapeInterval;
      evaluation_interval = cfg.scrapeInterval;
    };

    remoteWrite = [
      {
        url = "http://localhost:9101/api/v1/write"; # Nightlight's remote_write endpoint
        queue_config = {
          capacity = 2500; # Increased capacity for better handling of metric bursts
          max_shards = 20;
          min_backoff = "30ms";
          max_backoff = "10s";
          # max_retries is intentionally absent: the NixOS module rejects it
          # ("option does not exist") on recent Prometheus/NixOS versions.
        };
      }
    ];

    # With enableAllTargets = false no scrape job is defined at all, including
    # Prometheus self-monitoring.
    scrapeConfigs = lib.mkIf cfg.enableAllTargets [
      # Prometheus self-monitoring
      (mkLocalJob "prometheus" cfg.prometheusPort)

      # Cloud platform services, one fixed metrics port each
      (mkLocalJob "chainfire" 9091)
      (mkLocalJob "flaredb" 9092)
      (mkLocalJob "iam" 9093)
      (mkLocalJob "k8shost" 9094)
      (mkLocalJob "plasmavmc" 9095)
      (mkLocalJob "prismnet" 9096)
      (mkLocalJob "flashdns" 9097)
      (mkLocalJob "fiberlb" 9098)
      (mkLocalJob "lightningstor" 9099)

      # Host metrics from the node exporter enabled below; without this job the
      # exporter runs but nothing collects from it.
      (mkLocalJob "node" nodeExporterPort)
    ];

    exporters.node = {
      enable = true;
      enabledCollectors = [ "systemd" ];
      port = nodeExporterPort;
    };
  };
|
|
|
|
# Loki configuration: single-instance setup with an in-memory ring and
# filesystem-backed TSDB storage.
services.loki =
  let
    # Follow the Loki module's own data directory instead of hard-coding it, so
    # the storage paths stay correct if a host overrides services.loki.dataDir.
    # The module default is /var/lib/loki, which matches the previous literals.
    dataDir = config.services.loki.dataDir;
  in
  {
    enable = true;

    configuration = {
      server.http_listen_port = cfg.lokiPort;

      # NOTE(review): authentication is disabled and this module also opens
      # cfg.lokiPort in the firewall - confirm that exposure is intended.
      auth_enabled = false;

      ingester = {
        lifecycler = {
          address = "127.0.0.1";
          ring = {
            kvstore.store = "inmemory";
            replication_factor = 1;
          };
          final_sleep = "0s";
        };
        chunk_idle_period = "5m";
        chunk_retain_period = "30s";
      };

      schema_config = {
        configs = [{
          from = "2024-01-01";
          store = "tsdb";
          object_store = "filesystem";
          schema = "v13";
          index = {
            prefix = "index_";
            period = "24h";
          };
        }];
      };

      storage_config = {
        tsdb_shipper = {
          active_index_directory = "${dataDir}/tsdb-index";
          cache_location = "${dataDir}/tsdb-cache";
        };
        filesystem = {
          directory = "${dataDir}/chunks";
        };
      };

      limits_config = {
        reject_old_samples = true;
        reject_old_samples_max_age = "168h"; # 7 days
      };

      compactor = {
        working_directory = "${dataDir}/compactor";
        compaction_interval = "10m";
      };
    };
  };
|
|
|
|
# Promtail for shipping logs to Loki
services.promtail =
  let
    thisHost = config.networking.hostName;

    # Systemd journal scraping
    journalJob = {
      job_name = "journal";
      journal = {
        max_age = "12h";
        labels = {
          job = "systemd-journal";
          host = thisHost;
        };
      };
      # Copy the journal's unit and hostname fields into queryable labels.
      relabel_configs = [
        { source_labels = [ "__journal__systemd_unit" ]; target_label = "unit"; }
        { source_labels = [ "__journal__hostname" ]; target_label = "hostname"; }
      ];
    };

    # Application logs (if services write to files)
    cloudServicesJob = {
      job_name = "cloud-services";
      static_configs = [
        {
          targets = [ "localhost" ];
          labels = {
            job = "cloud-services";
            host = thisHost;
            __path__ = "/var/log/cloud/*.log";
          };
        }
      ];
    };
  in
  {
    enable = true;

    configuration = {
      server = {
        http_listen_port = 9080;
        grpc_listen_port = 0;
      };

      positions = {
        filename = "/var/lib/promtail/positions.yaml";
      };

      # Push to the Loki instance configured by this module.
      clients = [
        { url = "http://localhost:${toString cfg.lokiPort}/loki/api/v1/push"; }
      ];

      scrape_configs = [
        journalJob
        cloudServicesJob
      ];
    };
  };
|
|
|
|
# Grafana configuration: web server, bootstrap admin account, and provisioned
# Prometheus/Loki datasources plus a file-based dashboard provider.
services.grafana = {
  enable = true;

  settings = {
    server = {
      http_port = cfg.grafanaPort;
      # mkDefault so a host can restrict the bind address without mkForce.
      http_addr = lib.mkDefault "0.0.0.0";
    };

    analytics.reporting_enabled = false;

    security = {
      # Same defaults as before, but set with mkDefault so that a deployment can
      # override them with an ordinary definition instead of hitting a
      # conflicting-definition error.
      #
      # SECURITY: "admin"/"admin" is a well-known default, and this module binds
      # Grafana to 0.0.0.0 and opens cfg.grafanaPort in the firewall. Real
      # deployments should override the password, preferably with Grafana's
      # file provider so the secret stays out of the Nix store, e.g.
      #   services.grafana.settings.security.admin_password =
      #     "$__file{/run/secrets/grafana-admin-password}";
      admin_user = lib.mkDefault "admin";
      admin_password = lib.mkDefault "admin";
    };
  };

  provision = {
    enable = true;

    datasources.settings.datasources = [
      {
        name = "Prometheus";
        type = "prometheus";
        access = "proxy";
        url = "http://localhost:${toString cfg.prometheusPort}";
        isDefault = true;
        jsonData = {
          # Keep Grafana's minimum step aligned with the scrape interval.
          timeInterval = cfg.scrapeInterval;
        };
      }
      {
        name = "Loki";
        type = "loki";
        access = "proxy";
        url = "http://localhost:${toString cfg.lokiPort}";
        jsonData = {
          maxLines = 1000;
        };
      }
    ];

    dashboards.settings.providers = [
      {
        name = "Cloud Platform";
        type = "file";
        # Dashboards are loaded from this directory on disk.
        options.path = "/var/lib/grafana/dashboards";
      }
    ];
  };
};
|
|
|
|
# Ensure directories exist
systemd.tmpfiles.rules =
  let
    # Render one tmpfiles "d" (create directory) rule; user and group are the same name.
    dirRule = mode: owner: path: "d ${path} ${mode} ${owner} ${owner} -";

    # Loki's data directory and the storage subdirectories its configuration points at.
    lokiRules = map (dirRule "0750" "loki") [
      "/var/lib/loki"
      "/var/lib/loki/chunks"
      "/var/lib/loki/tsdb-index"
      "/var/lib/loki/tsdb-cache"
      "/var/lib/loki/compactor"
    ];
  in
  lokiRules ++ [
    (dirRule "0750" "promtail" "/var/lib/promtail")
    (dirRule "0755" "grafana" "/var/lib/grafana/dashboards")
    (dirRule "0755" "root" "/var/log/cloud")
  ];
|
|
|
|
# Open the Prometheus, Grafana and Loki ports in the host firewall.
# No extra `lib.mkIf cfg.enable` is needed here: the enclosing
# `config = lib.mkIf cfg.enable { ... }` already gates this definition.
# NOTE(review): the ports are opened whenever the stack is enabled; there is
# no separate toggle - confirm all three should be reachable from the network.
networking.firewall.allowedTCPPorts = [
  cfg.prometheusPort
  cfg.grafanaPort
  cfg.lokiPort
];
|
|
};
|
|
}
|