// photoncloud-monorepo/nightlight/crates/nightlight-server/src/ingestion.rs

//! Prometheus remote_write ingestion endpoint
//!
//! Implements the Prometheus remote_write protocol v1.0 for push-based
//! metric ingestion with snappy compression and protobuf encoding.
use axum::{
body::Bytes,
extract::State,
http::StatusCode,
response::{IntoResponse, Response},
routing::post,
Router,
};
use nightlight_api::prometheus::{Label, WriteRequest};
use nightlight_types::Error;
use prost::Message;
use snap::raw::Decoder as SnappyDecoder;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use tracing::{debug, error, info, warn};
use crate::storage::Storage;
/// Maximum accepted write request size in bytes (10 MiB).
///
/// NOTE(review): `handle_remote_write` applies this limit to the
/// compressed request body as received on the wire, even though the
/// original comment said "uncompressed" — confirm whether a separate
/// bound on the decompressed size is also intended.
const MAX_REQUEST_SIZE: usize = 10 * 1024 * 1024;
/// Ingestion service state shared across request handlers.
#[derive(Clone)]
pub struct IngestionService {
    /// Shared time-series store that accepted samples are appended to.
    storage: Arc<Storage>,
    /// Ingestion counters, also exposed to callers via `metrics()`.
    metrics: Arc<IngestionMetrics>,
}
/// Internal ingestion counters, updated lock-free from the hot path.
#[derive(Debug)]
pub struct IngestionMetrics {
    /// Valid samples accepted and handed to storage.
    samples_received: AtomicU64,
    /// Samples dropped for non-finite values (NaN / ±infinity).
    samples_invalid: AtomicU64,
    /// Total remote_write HTTP requests seen.
    requests_total: AtomicU64,
    /// Write requests that ended in an error response (see
    /// `handle_remote_write` for which paths increment this).
    requests_failed: AtomicU64,
    /// Construction time; used to derive samples/sec in `snapshot()`.
    started_at: Instant,
}
/// Point-in-time, copyable view of [`IngestionMetrics`] for reporting.
#[derive(Debug, Clone, Copy, Default)]
pub struct IngestionMetricsSnapshot {
    /// Total valid samples accepted since startup.
    pub samples_ingested_total: u64,
    /// Total write requests received.
    pub write_requests_total: u64,
    /// Write requests that failed.
    pub write_requests_failed: u64,
    /// Average ingest rate over process uptime (samples per second).
    pub samples_per_second: f64,
    /// Samples buffered awaiting flush; currently always reported as 0
    /// (no buffering layer is wired into `snapshot()` yet).
    pub buffer_samples: u64,
}
impl IngestionService {
    /// Build a new ingestion service backed by the given storage handle.
    pub fn new(storage: Arc<Storage>) -> Self {
        let metrics = Arc::new(IngestionMetrics::new());
        Self { storage, metrics }
    }

    /// Create the Axum router exposing the ingestion endpoints.
    pub fn router(self) -> Router {
        Router::new()
            .route("/api/v1/write", post(handle_remote_write))
            .with_state(self)
    }

    /// Shared handle to the ingestion counters.
    pub fn metrics(&self) -> Arc<IngestionMetrics> {
        Arc::clone(&self.metrics)
    }

    /// Validate a decoded `WriteRequest`, convert it to internal types,
    /// and append the resulting series batch to shared storage.
    ///
    /// Returns the number of samples accepted. Non-finite sample values
    /// are dropped (and counted in `samples_invalid`) without failing
    /// the request; invalid labels fail the whole request.
    pub(crate) async fn process_write_request(&self, request: WriteRequest) -> Result<u64, Error> {
        let mut accepted: u64 = 0;
        let mut batch = Vec::new();
        for series in request.timeseries {
            // Labels are validated and normalized (sorted) up front so
            // the fingerprint below is deterministic.
            let labels: Vec<nightlight_types::Label> = validate_labels(series.labels)?
                .into_iter()
                .map(|label| nightlight_types::Label {
                    name: label.name,
                    value: label.value,
                })
                .collect();
            // Keep only finite samples; reject nothing, just drop+count.
            let mut kept = Vec::with_capacity(series.samples.len());
            for sample in series.samples {
                if sample.value.is_finite() {
                    kept.push(nightlight_types::Sample {
                        timestamp: sample.timestamp,
                        value: sample.value,
                    });
                    accepted += 1;
                } else {
                    warn!("Invalid sample value: {}", sample.value);
                    self.metrics.samples_invalid.fetch_add(1, Ordering::Relaxed);
                }
            }
            // A series whose samples were all dropped contributes nothing.
            if kept.is_empty() {
                continue;
            }
            let id = nightlight_types::SeriesId(compute_series_fingerprint(&labels));
            batch.push(nightlight_types::TimeSeries {
                id,
                labels,
                samples: kept,
            });
        }
        self.storage
            .append(batch)
            .await
            .map_err(|error| Error::Storage(error.to_string()))?;
        // Only count samples once the append has actually succeeded.
        self.metrics
            .samples_received
            .fetch_add(accepted, Ordering::Relaxed);
        Ok(accepted)
    }

    /// Get current storage statistics as (total_samples, active_series).
    #[cfg(test)]
    pub async fn storage_stats(&self) -> Result<(usize, usize), Error> {
        let stats = self
            .storage
            .stats()
            .await
            .map_err(|error| Error::Storage(error.to_string()))?;
        Ok((stats.total_samples as usize, stats.active_series as usize))
    }
}
impl IngestionMetrics {
    /// Fresh, zeroed counter set; uptime is measured from construction.
    fn new() -> Self {
        Self {
            samples_received: AtomicU64::new(0),
            samples_invalid: AtomicU64::new(0),
            requests_total: AtomicU64::new(0),
            requests_failed: AtomicU64::new(0),
            started_at: Instant::now(),
        }
    }

    /// Point-in-time copy of the counters with a derived ingest rate.
    pub fn snapshot(&self) -> IngestionMetricsSnapshot {
        let ingested = self.samples_received.load(Ordering::Relaxed);
        let elapsed = self.started_at.elapsed().as_secs_f64();
        // Guard against a zero-length uptime window on the first call.
        let rate = if elapsed > 0.0 {
            ingested as f64 / elapsed
        } else {
            0.0
        };
        IngestionMetricsSnapshot {
            samples_ingested_total: ingested,
            write_requests_total: self.requests_total.load(Ordering::Relaxed),
            write_requests_failed: self.requests_failed.load(Ordering::Relaxed),
            samples_per_second: rate,
            // No buffering layer is wired in yet; reported as zero.
            buffer_samples: 0,
        }
    }
}
/// Axum handler for /api/v1/write endpoint
///
/// Flow: count the request, bound its size, snappy-decompress,
/// protobuf-decode, then hand the `WriteRequest` to the service.
/// Responds 204 on success, 4xx for client errors, 429 when the
/// storage layer signals backpressure, 500 for storage failures.
async fn handle_remote_write(
    State(service): State<IngestionService>,
    body: Bytes,
) -> Response {
    service.metrics.requests_total.fetch_add(1, Ordering::Relaxed);
    debug!("Received remote_write request, size: {} bytes", body.len());
    // Bound the on-wire body size before doing any decompression work.
    if body.len() > MAX_REQUEST_SIZE {
        warn!("Request too large: {} bytes", body.len());
        // Fix: early rejections now increment requests_failed too, so
        // the failure counter is consistent across all error paths.
        service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
        return IngestionError::PayloadTooLarge.into_response();
    }
    // Decompress snappy-encoded payload
    let decompressed = match decompress_snappy(&body) {
        Ok(data) => data,
        Err(e) => {
            error!("Snappy decompression failed: {}", e);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            return IngestionError::DecompressionFailed.into_response();
        }
    };
    debug!("Decompressed payload: {} bytes", decompressed.len());
    // Decode protobuf WriteRequest
    let write_request = match WriteRequest::decode(&decompressed[..]) {
        Ok(req) => req,
        Err(e) => {
            error!("Protobuf decode failed: {}", e);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            return IngestionError::InvalidProtobuf.into_response();
        }
    };
    info!(
        "Decoded WriteRequest with {} time series",
        write_request.timeseries.len()
    );
    // Process the request
    match service.process_write_request(write_request).await {
        Ok(samples_count) => {
            info!("Successfully ingested {} samples", samples_count);
            // remote_write expects a 2xx with no body on success.
            (StatusCode::NO_CONTENT, "").into_response()
        }
        // The storage layer signals backpressure via a stringly-typed
        // message; map it to 429 so well-behaved clients retry later.
        Err(Error::Storage(msg)) if msg.contains("buffer full") => {
            warn!("Write buffer full, returning 429");
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            IngestionError::Backpressure.into_response()
        }
        Err(Error::InvalidLabel(msg)) => {
            warn!("Invalid labels: {}", msg);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            IngestionError::InvalidLabels.into_response()
        }
        Err(e) => {
            error!("Failed to process write request: {}", e);
            service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
            IngestionError::StorageError.into_response()
        }
    }
}
/// Decompress snappy-encoded data
fn decompress_snappy(compressed: &[u8]) -> Result<Vec<u8>, Error> {
let mut decoder = SnappyDecoder::new();
let decompressed_len = snap::raw::decompress_len(compressed)
.map_err(|e| Error::InvalidMetric(format!("Invalid snappy data: {}", e)))?;
let mut decompressed = vec![0u8; decompressed_len];
decoder
.decompress(compressed, &mut decompressed)
.map_err(|e| Error::InvalidMetric(format!("Snappy decompression failed: {}", e)))?;
Ok(decompressed)
}
/// Validate and normalize Prometheus labels.
///
/// Checks that the set is non-empty, contains a `__name__` label, and
/// that every label name matches `[a-zA-Z_][a-zA-Z0-9_]*`. Labels are
/// returned sorted by name so fingerprinting is deterministic.
///
/// Fix: duplicate label names are now rejected. The Prometheus data
/// model requires unique label names per series; previously duplicates
/// were silently accepted, producing ambiguous fingerprints.
///
/// # Errors
/// Returns `Error::InvalidLabel` describing the first violation found.
fn validate_labels(labels: Vec<Label>) -> Result<Vec<Label>, Error> {
    if labels.is_empty() {
        return Err(Error::InvalidLabel("Empty label set".into()));
    }
    // Check for __name__ label (metric name)
    let has_name = labels.iter().any(|l| l.name == "__name__");
    if !has_name {
        return Err(Error::InvalidLabel("Missing __name__ label".into()));
    }
    let mut validated = Vec::with_capacity(labels.len());
    for label in labels {
        // Validate label name
        if label.name.is_empty() {
            return Err(Error::InvalidLabel("Empty label name".into()));
        }
        // Label names must start with [a-zA-Z_]
        let first_char = label.name.chars().next().unwrap();
        if !first_char.is_ascii_alphabetic() && first_char != '_' {
            return Err(Error::InvalidLabel(format!(
                "Invalid label name '{}': must start with [a-zA-Z_]",
                label.name
            )));
        }
        // Label names must contain only [a-zA-Z0-9_]
        if !label.name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
            return Err(Error::InvalidLabel(format!(
                "Invalid label name '{}': must contain only [a-zA-Z0-9_]",
                label.name
            )));
        }
        validated.push(label);
    }
    // Sort labels for consistent fingerprinting
    validated.sort_by(|a, b| a.name.cmp(&b.name));
    // After sorting, duplicates are adjacent; reject them.
    if let Some(pair) = validated.windows(2).find(|w| w[0].name == w[1].name) {
        return Err(Error::InvalidLabel(format!(
            "Duplicate label name '{}'",
            pair[0].name
        )));
    }
    Ok(validated)
}
/// Compute a stable fingerprint for a (sorted) label set.
///
/// Fix: the previous implementation used `DefaultHasher`, whose
/// algorithm the standard library explicitly does not guarantee to be
/// stable across Rust releases — yet the resulting `SeriesId` is
/// persisted to storage, so a toolchain upgrade could silently change
/// every series identity. This version uses FNV-1a (64-bit), which is
/// fully specified and deterministic forever.
///
/// NOTE(review): fingerprints produced by the old hasher will not
/// match ones produced here; confirm whether existing on-disk data
/// needs a migration before deploying.
fn compute_series_fingerprint(labels: &[nightlight_types::Label]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    // FNV-1a over one field, then a 0xFF terminator. 0xFF never occurs
    // in valid UTF-8, so field boundaries are unambiguous (e.g. labels
    // {"ab": "c"} and {"a": "bc"} hash differently).
    fn fold(mut hash: u64, bytes: &[u8]) -> u64 {
        const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
        for &byte in bytes {
            hash ^= u64::from(byte);
            hash = hash.wrapping_mul(FNV_PRIME);
        }
        (hash ^ 0xFF).wrapping_mul(FNV_PRIME)
    }
    let mut hash = FNV_OFFSET_BASIS;
    for label in labels {
        hash = fold(hash, label.name.as_bytes());
        hash = fold(hash, label.value.as_bytes());
    }
    let _ = FNV_PRIME; // documented here; used inside `fold`
    hash
}
/// Ingestion error types for HTTP responses.
///
/// Each variant maps to a status code and client-facing message in its
/// [`IntoResponse`] implementation.
#[derive(Debug)]
enum IngestionError {
    /// Request body exceeded `MAX_REQUEST_SIZE` (413).
    PayloadTooLarge,
    /// Body was not valid snappy data (400).
    DecompressionFailed,
    /// Decompressed bytes were not a valid protobuf `WriteRequest` (400).
    InvalidProtobuf,
    /// Label validation rejected a series (400).
    InvalidLabels,
    /// Storage backend reported an error (500).
    StorageError,
    /// Write buffer is full; client should retry later (429).
    Backpressure,
}
impl IntoResponse for IngestionError {
fn into_response(self) -> Response {
let (status, message) = match self {
IngestionError::PayloadTooLarge => {
(StatusCode::PAYLOAD_TOO_LARGE, "Request payload too large")
}
IngestionError::DecompressionFailed => {
(StatusCode::BAD_REQUEST, "Snappy decompression failed")
}
IngestionError::InvalidProtobuf => {
(StatusCode::BAD_REQUEST, "Invalid protobuf encoding")
}
IngestionError::InvalidLabels => {
(StatusCode::BAD_REQUEST, "Invalid metric labels")
}
IngestionError::StorageError => {
(StatusCode::INTERNAL_SERVER_ERROR, "Storage error")
}
IngestionError::Backpressure => {
(StatusCode::TOO_MANY_REQUESTS, "Write buffer full, retry later")
}
};
(status, message).into_response()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::Storage;

    /// Fixture helper: build a protocol `Label` from string slices.
    fn label(name: &str, value: &str) -> Label {
        Label {
            name: name.to_string(),
            value: value.to_string(),
        }
    }

    #[test]
    fn test_validate_labels_success() {
        let input = vec![
            label("__name__", "http_requests_total"),
            label("method", "GET"),
        ];
        let validated = validate_labels(input).expect("valid labels should pass");
        assert_eq!(validated.len(), 2);
        // Output is sorted by label name, so __name__ comes first.
        assert_eq!(validated[0].name, "__name__");
    }

    #[test]
    fn test_validate_labels_missing_name() {
        // A label set without __name__ has no metric name and is invalid.
        assert!(validate_labels(vec![label("method", "GET")]).is_err());
    }

    #[test]
    fn test_validate_labels_invalid_name() {
        // "123invalid" starts with a digit, which Prometheus forbids.
        let input = vec![
            label("__name__", "metric"),
            label("123invalid", "value"),
        ];
        assert!(validate_labels(input).is_err());
    }

    #[test]
    fn test_compute_fingerprint_stable() {
        let labels = vec![
            nightlight_types::Label {
                name: "__name__".to_string(),
                value: "metric".to_string(),
            },
            nightlight_types::Label {
                name: "label1".to_string(),
                value: "value1".to_string(),
            },
        ];
        // The same label set must always yield the same fingerprint.
        assert_eq!(
            compute_series_fingerprint(&labels),
            compute_series_fingerprint(&labels)
        );
    }

    #[tokio::test]
    async fn test_ingestion_service_storage() {
        let dir = tempfile::tempdir().unwrap();
        let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
        let service = IngestionService::new(storage);
        // A brand-new store holds no samples and no series.
        let (samples, series) = service.storage_stats().await.unwrap();
        assert_eq!(samples, 0);
        assert_eq!(series, 0);
    }

    #[tokio::test]
    async fn test_process_write_request_persists_samples() {
        let dir = tempfile::tempdir().unwrap();
        let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
        let service = IngestionService::new(Arc::clone(&storage));
        let request = WriteRequest {
            timeseries: vec![nightlight_api::prometheus::TimeSeries {
                labels: vec![
                    label("__name__", "ingest_metric"),
                    label("job", "test"),
                ],
                samples: vec![nightlight_api::prometheus::Sample {
                    value: 42.0,
                    timestamp: 1_000,
                }],
            }],
        };
        assert_eq!(service.process_write_request(request).await.unwrap(), 1);
        storage.flush().await.unwrap();
        // Re-open the directory with a fresh Storage to prove the data
        // actually persisted, not just sat in memory.
        let reloaded = Storage::new(dir.path().to_str().unwrap()).unwrap();
        let ids = reloaded
            .find_series(vec![
                "__name__=ingest_metric".to_string(),
                "job=test".to_string(),
            ])
            .await
            .unwrap();
        assert_eq!(ids.len(), 1);
        let series = reloaded
            .query_series(ids[0], 0, 10_000)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(series.samples.len(), 1);
        assert_eq!(series.samples[0].value, 42.0);
    }
}