// photoncloud-monorepo/nightlight/crates/nightlight-server/src/ingestion.rs

//! Prometheus remote_write ingestion endpoint
//!
//! Implements the Prometheus remote_write protocol v1.0 for push-based
//! metric ingestion with snappy compression and protobuf encoding.
use axum::{
body::Bytes,
extract::State,
http::StatusCode,
response::{IntoResponse, Response},
routing::post,
Router,
};
use nightlight_api::prometheus::{Label, WriteRequest};
use nightlight_types::Error;
use prost::Message;
use snap::raw::Decoder as SnappyDecoder;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use tracing::{debug, error, info, warn};
use crate::storage::Storage;
/// Maximum accepted write request size in bytes (10 MiB).
///
/// NOTE(review): `handle_remote_write` applies this limit to the
/// compressed request body as received on the wire, even though the
/// original comment said "uncompressed" — confirm whether a separate
/// bound on the decompressed size is also intended.
const MAX_REQUEST_SIZE: usize = 10 * 1024 * 1024;
/// Ingestion service state shared across request handlers.
#[derive(Clone)]
pub struct IngestionService {
    /// Shared time-series store that accepted samples are appended to.
    storage: Arc<Storage>,
    /// Ingestion counters, also exposed to callers via `metrics()`.
    metrics: Arc<IngestionMetrics>,
}
/// Internal ingestion counters, updated lock-free from the hot path.
#[derive(Debug)]
pub struct IngestionMetrics {
    /// Valid samples accepted and handed to storage.
    samples_received: AtomicU64,
    /// Samples dropped for non-finite values (NaN / ±infinity).
    samples_invalid: AtomicU64,
    /// Total remote_write HTTP requests seen.
    requests_total: AtomicU64,
    /// Write requests that ended in an error response (see
    /// `handle_remote_write` for which paths increment this).
    requests_failed: AtomicU64,
    /// Construction time; used to derive samples/sec in `snapshot()`.
    started_at: Instant,
}
/// Point-in-time, copyable view of [`IngestionMetrics`] for reporting.
#[derive(Debug, Clone, Copy, Default)]
pub struct IngestionMetricsSnapshot {
    /// Total valid samples accepted since startup.
    pub samples_ingested_total: u64,
    /// Total write requests received.
    pub write_requests_total: u64,
    /// Write requests that failed.
    pub write_requests_failed: u64,
    /// Average ingest rate over process uptime (samples per second).
    pub samples_per_second: f64,
    /// Samples buffered awaiting flush; currently always reported as 0
    /// (no buffering layer is wired into `snapshot()` yet).
    pub buffer_samples: u64,
}
impl IngestionService {
    /// Build a new ingestion service backed by the given storage handle.
    pub fn new(storage: Arc<Storage>) -> Self {
        let metrics = Arc::new(IngestionMetrics::new());
        Self { storage, metrics }
    }

    /// Create the Axum router exposing the ingestion endpoints.
    pub fn router(self) -> Router {
        Router::new()
            .route("/api/v1/write", post(handle_remote_write))
            .with_state(self)
    }

    /// Shared handle to the ingestion counters.
    pub fn metrics(&self) -> Arc<IngestionMetrics> {
        Arc::clone(&self.metrics)
    }

    /// Validate a decoded `WriteRequest`, convert it to internal types,
    /// and append the resulting series batch to shared storage.
    ///
    /// Returns the number of samples accepted. Non-finite sample values
    /// are dropped (and counted in `samples_invalid`) without failing
    /// the request; invalid labels fail the whole request.
    pub(crate) async fn process_write_request(&self, request: WriteRequest) -> Result<u64, Error> {
        let mut accepted: u64 = 0;
        let mut batch = Vec::new();
        for series in request.timeseries {
            // Labels are validated and normalized (sorted) up front so
            // the fingerprint below is deterministic.
            let labels: Vec<nightlight_types::Label> = validate_labels(series.labels)?
                .into_iter()
                .map(|label| nightlight_types::Label {
                    name: label.name,
                    value: label.value,
                })
                .collect();
            // Keep only finite samples; reject nothing, just drop+count.
            let mut kept = Vec::with_capacity(series.samples.len());
            for sample in series.samples {
                if sample.value.is_finite() {
                    kept.push(nightlight_types::Sample {
                        timestamp: sample.timestamp,
                        value: sample.value,
                    });
                    accepted += 1;
                } else {
                    warn!("Invalid sample value: {}", sample.value);
                    self.metrics.samples_invalid.fetch_add(1, Ordering::Relaxed);
                }
            }
            // A series whose samples were all dropped contributes nothing.
            if kept.is_empty() {
                continue;
            }
            let id = nightlight_types::SeriesId(compute_series_fingerprint(&labels));
            batch.push(nightlight_types::TimeSeries {
                id,
                labels,
                samples: kept,
            });
        }
        self.storage
            .append(batch)
            .await
            .map_err(|error| Error::Storage(error.to_string()))?;
        // Only count samples once the append has actually succeeded.
        self.metrics
            .samples_received
            .fetch_add(accepted, Ordering::Relaxed);
        Ok(accepted)
    }

    /// Get current storage statistics as (total_samples, active_series).
    #[cfg(test)]
    pub async fn storage_stats(&self) -> Result<(usize, usize), Error> {
        let stats = self
            .storage
            .stats()
            .await
            .map_err(|error| Error::Storage(error.to_string()))?;
        Ok((stats.total_samples as usize, stats.active_series as usize))
    }
}
impl IngestionMetrics {
    /// Fresh, zeroed counter set; uptime is measured from construction.
    fn new() -> Self {
        Self {
            samples_received: AtomicU64::new(0),
            samples_invalid: AtomicU64::new(0),
            requests_total: AtomicU64::new(0),
            requests_failed: AtomicU64::new(0),
            started_at: Instant::now(),
        }
    }

    /// Point-in-time copy of the counters with a derived ingest rate.
    pub fn snapshot(&self) -> IngestionMetricsSnapshot {
        let ingested = self.samples_received.load(Ordering::Relaxed);
        let elapsed = self.started_at.elapsed().as_secs_f64();
        // Guard against a zero-length uptime window on the first call.
        let rate = if elapsed > 0.0 {
            ingested as f64 / elapsed
        } else {
            0.0
        };
        IngestionMetricsSnapshot {
            samples_ingested_total: ingested,
            write_requests_total: self.requests_total.load(Ordering::Relaxed),
            write_requests_failed: self.requests_failed.load(Ordering::Relaxed),
            samples_per_second: rate,
            // No buffering layer is wired in yet; reported as zero.
            buffer_samples: 0,
        }
    }
}
/// Axum handler for /api/v1/write endpoint
///
/// Flow: count the request, bound its size, snappy-decompress,
/// protobuf-decode, then hand the `WriteRequest` to the service.
/// Responds 204 on success, 4xx for client errors, 429 when the
/// storage layer signals backpressure, 500 for storage failures.
async fn handle_remote_write(
    State(service): State<IngestionService>,
    body: Bytes,
) -> Response {
    service.metrics.requests_total.fetch_add(1, Ordering::Relaxed);
    debug!("Received remote_write request, size: {} bytes", body.len());
    // Bound the on-wire body size before doing any decompression work.
    if body.len() > MAX_REQUEST_SIZE {
        warn!("Request too large: {} bytes", body.len());
        // Fix: early rejections now increment requests_failed too, so
        // the failure counter is consistent across all error paths.
        service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
        return IngestionError::PayloadTooLarge.into_response();
    }
    // Decompress snappy-encoded payload
    let decompressed = match decompress_snappy(&body) {
        Ok(data) => data,
        Err(e) => {
            error!("Snappy decompression failed: {}", e);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            return IngestionError::DecompressionFailed.into_response();
        }
    };
    debug!("Decompressed payload: {} bytes", decompressed.len());
    // Decode protobuf WriteRequest
    let write_request = match WriteRequest::decode(&decompressed[..]) {
        Ok(req) => req,
        Err(e) => {
            error!("Protobuf decode failed: {}", e);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            return IngestionError::InvalidProtobuf.into_response();
        }
    };
    info!(
        "Decoded WriteRequest with {} time series",
        write_request.timeseries.len()
    );
    // Process the request
    match service.process_write_request(write_request).await {
        Ok(samples_count) => {
            info!("Successfully ingested {} samples", samples_count);
            // remote_write expects a 2xx with no body on success.
            (StatusCode::NO_CONTENT, "").into_response()
        }
        // The storage layer signals backpressure via a stringly-typed
        // message; map it to 429 so well-behaved clients retry later.
        Err(Error::Storage(msg)) if msg.contains("buffer full") => {
            warn!("Write buffer full, returning 429");
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            IngestionError::Backpressure.into_response()
        }
        Err(Error::InvalidLabel(msg)) => {
            warn!("Invalid labels: {}", msg);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            IngestionError::InvalidLabels.into_response()
        }
        Err(e) => {
            error!("Failed to process write request: {}", e);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            IngestionError::StorageError.into_response()
        }
    }
}
/// Decompress snappy-encoded data
fn decompress_snappy(compressed: &[u8]) -> Result<Vec<u8>, Error> {
let mut decoder = SnappyDecoder::new();
let decompressed_len = snap::raw::decompress_len(compressed)
.map_err(|e| Error::InvalidMetric(format!("Invalid snappy data: {}", e)))?;
let mut decompressed = vec![0u8; decompressed_len];
decoder
.decompress(compressed, &mut decompressed)
.map_err(|e| Error::InvalidMetric(format!("Snappy decompression failed: {}", e)))?;
Ok(decompressed)
}
/// Validate and normalize Prometheus labels.
///
/// Checks that the set is non-empty, contains a `__name__` label, and
/// that every label name matches `[a-zA-Z_][a-zA-Z0-9_]*`. Labels are
/// returned sorted by name so fingerprinting is deterministic.
///
/// Fix: duplicate label names are now rejected. The Prometheus data
/// model requires unique label names per series; previously duplicates
/// were silently accepted, producing ambiguous fingerprints.
///
/// # Errors
/// Returns `Error::InvalidLabel` describing the first violation found.
fn validate_labels(labels: Vec<Label>) -> Result<Vec<Label>, Error> {
    if labels.is_empty() {
        return Err(Error::InvalidLabel("Empty label set".into()));
    }
    // Check for __name__ label (metric name)
    let has_name = labels.iter().any(|l| l.name == "__name__");
    if !has_name {
        return Err(Error::InvalidLabel("Missing __name__ label".into()));
    }
    let mut validated = Vec::with_capacity(labels.len());
    for label in labels {
        // Validate label name
        if label.name.is_empty() {
            return Err(Error::InvalidLabel("Empty label name".into()));
        }
        // Label names must start with [a-zA-Z_]
        let first_char = label.name.chars().next().unwrap();
        if !first_char.is_ascii_alphabetic() && first_char != '_' {
            return Err(Error::InvalidLabel(format!(
                "Invalid label name '{}': must start with [a-zA-Z_]",
                label.name
            )));
        }
        // Label names must contain only [a-zA-Z0-9_]
        if !label.name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
            return Err(Error::InvalidLabel(format!(
                "Invalid label name '{}': must contain only [a-zA-Z0-9_]",
                label.name
            )));
        }
        validated.push(label);
    }
    // Sort labels for consistent fingerprinting
    validated.sort_by(|a, b| a.name.cmp(&b.name));
    // After sorting, duplicates are adjacent; reject them.
    if let Some(pair) = validated.windows(2).find(|w| w[0].name == w[1].name) {
        return Err(Error::InvalidLabel(format!(
            "Duplicate label name '{}'",
            pair[0].name
        )));
    }
    Ok(validated)
}
/// Compute a stable fingerprint for a (sorted) label set.
///
/// Fix: the previous implementation used `DefaultHasher`, whose
/// algorithm the standard library explicitly does not guarantee to be
/// stable across Rust releases — yet the resulting `SeriesId` is
/// persisted to storage, so a toolchain upgrade could silently change
/// every series identity. This version uses FNV-1a (64-bit), which is
/// fully specified and deterministic forever.
///
/// NOTE(review): fingerprints produced by the old hasher will not
/// match ones produced here; confirm whether existing on-disk data
/// needs a migration before deploying.
fn compute_series_fingerprint(labels: &[nightlight_types::Label]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    // FNV-1a over one field, then a 0xFF terminator. 0xFF never occurs
    // in valid UTF-8, so field boundaries are unambiguous (e.g. labels
    // {"ab": "c"} and {"a": "bc"} hash differently).
    fn fold(mut hash: u64, bytes: &[u8]) -> u64 {
        const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
        for &byte in bytes {
            hash ^= u64::from(byte);
            hash = hash.wrapping_mul(FNV_PRIME);
        }
        (hash ^ 0xFF).wrapping_mul(FNV_PRIME)
    }
    let mut hash = FNV_OFFSET_BASIS;
    for label in labels {
        hash = fold(hash, label.name.as_bytes());
        hash = fold(hash, label.value.as_bytes());
    }
    let _ = FNV_PRIME; // documented here; used inside `fold`
    hash
}
/// Ingestion error types for HTTP responses.
///
/// Each variant maps to a status code and client-facing message in its
/// [`IntoResponse`] implementation.
#[derive(Debug)]
enum IngestionError {
    /// Request body exceeded `MAX_REQUEST_SIZE` (413).
    PayloadTooLarge,
    /// Body was not valid snappy data (400).
    DecompressionFailed,
    /// Decompressed bytes were not a valid protobuf `WriteRequest` (400).
    InvalidProtobuf,
    /// Label validation rejected a series (400).
    InvalidLabels,
    /// Storage backend reported an error (500).
    StorageError,
    /// Write buffer is full; client should retry later (429).
    Backpressure,
}
impl IntoResponse for IngestionError {
fn into_response(self) -> Response {
let (status, message) = match self {
IngestionError::PayloadTooLarge => {
(StatusCode::PAYLOAD_TOO_LARGE, "Request payload too large")
}
IngestionError::DecompressionFailed => {
(StatusCode::BAD_REQUEST, "Snappy decompression failed")
}
IngestionError::InvalidProtobuf => {
(StatusCode::BAD_REQUEST, "Invalid protobuf encoding")
}
IngestionError::InvalidLabels => {
(StatusCode::BAD_REQUEST, "Invalid metric labels")
}
IngestionError::StorageError => {
(StatusCode::INTERNAL_SERVER_ERROR, "Storage error")
}
IngestionError::Backpressure => {
(StatusCode::TOO_MANY_REQUESTS, "Write buffer full, retry later")
}
};
(status, message).into_response()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::Storage;

    /// Fixture helper: build a protocol `Label` from string slices.
    fn label(name: &str, value: &str) -> Label {
        Label {
            name: name.to_string(),
            value: value.to_string(),
        }
    }

    #[test]
    fn test_validate_labels_success() {
        let input = vec![
            label("__name__", "http_requests_total"),
            label("method", "GET"),
        ];
        let validated = validate_labels(input).expect("valid labels should pass");
        assert_eq!(validated.len(), 2);
        // Output is sorted by label name, so __name__ comes first.
        assert_eq!(validated[0].name, "__name__");
    }

    #[test]
    fn test_validate_labels_missing_name() {
        // A label set without __name__ has no metric name and is invalid.
        assert!(validate_labels(vec![label("method", "GET")]).is_err());
    }

    #[test]
    fn test_validate_labels_invalid_name() {
        // "123invalid" starts with a digit, which Prometheus forbids.
        let input = vec![
            label("__name__", "metric"),
            label("123invalid", "value"),
        ];
        assert!(validate_labels(input).is_err());
    }

    #[test]
    fn test_compute_fingerprint_stable() {
        let labels = vec![
            nightlight_types::Label {
                name: "__name__".to_string(),
                value: "metric".to_string(),
            },
            nightlight_types::Label {
                name: "label1".to_string(),
                value: "value1".to_string(),
            },
        ];
        // The same label set must always yield the same fingerprint.
        assert_eq!(
            compute_series_fingerprint(&labels),
            compute_series_fingerprint(&labels)
        );
    }

    #[tokio::test]
    async fn test_ingestion_service_storage() {
        let dir = tempfile::tempdir().unwrap();
        let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
        let service = IngestionService::new(storage);
        // A brand-new store holds no samples and no series.
        let (samples, series) = service.storage_stats().await.unwrap();
        assert_eq!(samples, 0);
        assert_eq!(series, 0);
    }

    #[tokio::test]
    async fn test_process_write_request_persists_samples() {
        let dir = tempfile::tempdir().unwrap();
        let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
        let service = IngestionService::new(Arc::clone(&storage));
        let request = WriteRequest {
            timeseries: vec![nightlight_api::prometheus::TimeSeries {
                labels: vec![
                    label("__name__", "ingest_metric"),
                    label("job", "test"),
                ],
                samples: vec![nightlight_api::prometheus::Sample {
                    value: 42.0,
                    timestamp: 1_000,
                }],
            }],
        };
        assert_eq!(service.process_write_request(request).await.unwrap(), 1);
        storage.flush().await.unwrap();
        // Re-open the directory with a fresh Storage to prove the data
        // actually persisted, not just sat in memory.
        let reloaded = Storage::new(dir.path().to_str().unwrap()).unwrap();
        let ids = reloaded
            .find_series(vec![
                "__name__=ingest_metric".to_string(),
                "job=test".to_string(),
            ])
            .await
            .unwrap();
        assert_eq!(ids.len(), 1);
        let series = reloaded
            .query_series(ids[0], 0, 10_000)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(series.samples.len(), 1);
        assert_eq!(series.samples[0].value, 42.0);
    }
}