//! Prometheus remote_write ingestion endpoint
//!
//! Implements the Prometheus remote_write protocol v1.0 for push-based
//! metric ingestion with snappy compression and protobuf encoding.
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

use axum::{
    body::Bytes,
    extract::State,
    http::StatusCode,
    response::{IntoResponse, Response},
    routing::post,
    Router,
};
use nightlight_api::prometheus::{Label, WriteRequest};
use nightlight_types::Error;
use prost::Message;
use snap::raw::Decoder as SnappyDecoder;
use tracing::{debug, error, info, warn};

use crate::storage::Storage;
/// Maximum accepted write request size in bytes (10 MiB).
/// The write handler checks this against the incoming request body as
/// received on the wire.
const MAX_REQUEST_SIZE: usize = 10 * 1024 * 1024;
/// Shared state for the remote_write ingestion endpoint.
///
/// Cloned per request by Axum; both fields are `Arc`s, so clones are cheap
/// and every handler invocation observes the same storage and metrics.
#[derive(Clone)]
pub struct IngestionService {
    // Shared time-series storage that accepted samples are appended to.
    storage: Arc<Storage>,
    // Lock-free counters, exposed via `metrics()` / `IngestionMetrics::snapshot`.
    metrics: Arc<IngestionMetrics>,
}
/// Lock-free ingestion counters shared between the write handler and any
/// metrics exporter. All counters use relaxed atomics: they are statistics,
/// not synchronization.
#[derive(Debug)]
pub struct IngestionMetrics {
    // Samples accepted and handed to storage.
    samples_received: AtomicU64,
    // Samples dropped because their value was not finite (NaN / ±Inf).
    samples_invalid: AtomicU64,
    // Total write requests received, successful or not.
    requests_total: AtomicU64,
    // Write requests that ended in an error.
    requests_failed: AtomicU64,
    // Construction time; used to derive samples/sec in `snapshot`.
    started_at: Instant,
}
/// Plain-data copy of `IngestionMetrics` taken at a single point in time.
#[derive(Debug, Clone, Copy, Default)]
pub struct IngestionMetricsSnapshot {
    /// Total samples accepted since the service started.
    pub samples_ingested_total: u64,
    /// Total write requests received.
    pub write_requests_total: u64,
    /// Write requests that ended in an error.
    pub write_requests_failed: u64,
    /// Average rate since service start (total / uptime), not a windowed rate.
    pub samples_per_second: f64,
    /// Currently always 0 — buffer depth is not tracked by this snapshot.
    pub buffer_samples: u64,
}
impl IngestionService {
|
|
pub fn new(storage: Arc<Storage>) -> Self {
|
|
Self {
|
|
storage,
|
|
metrics: Arc::new(IngestionMetrics::new()),
|
|
}
|
|
}
|
|
|
|
/// Create Axum router for ingestion endpoints
|
|
pub fn router(self) -> Router {
|
|
Router::new()
|
|
.route("/api/v1/write", post(handle_remote_write))
|
|
.with_state(self)
|
|
}
|
|
|
|
pub fn metrics(&self) -> Arc<IngestionMetrics> {
|
|
Arc::clone(&self.metrics)
|
|
}
|
|
|
|
/// Process a WriteRequest and write to shared storage
|
|
pub(crate) async fn process_write_request(&self, request: WriteRequest) -> Result<u64, Error> {
|
|
let mut samples_processed = 0;
|
|
let mut series_to_append = Vec::new();
|
|
|
|
for ts in request.timeseries {
|
|
// Validate and normalize labels
|
|
let labels = validate_labels(ts.labels)?;
|
|
|
|
// Convert to internal types
|
|
let internal_labels: Vec<nightlight_types::Label> = labels
|
|
.into_iter()
|
|
.map(|l| nightlight_types::Label {
|
|
name: l.name,
|
|
value: l.value,
|
|
})
|
|
.collect();
|
|
|
|
// Process samples
|
|
let mut internal_samples = Vec::new();
|
|
for sample in ts.samples {
|
|
// Validate sample
|
|
if !sample.value.is_finite() {
|
|
warn!("Invalid sample value: {}", sample.value);
|
|
self.metrics.samples_invalid.fetch_add(1, Ordering::Relaxed);
|
|
continue;
|
|
}
|
|
|
|
// Convert to internal type
|
|
let internal_sample = nightlight_types::Sample {
|
|
timestamp: sample.timestamp,
|
|
value: sample.value,
|
|
};
|
|
|
|
internal_samples.push(internal_sample);
|
|
samples_processed += 1;
|
|
}
|
|
|
|
// Skip if no valid samples
|
|
if internal_samples.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
// Store series with samples in shared storage
|
|
let series_id = nightlight_types::SeriesId(
|
|
compute_series_fingerprint(&internal_labels)
|
|
);
|
|
|
|
let time_series = nightlight_types::TimeSeries {
|
|
id: series_id,
|
|
labels: internal_labels,
|
|
samples: internal_samples,
|
|
};
|
|
|
|
series_to_append.push(time_series);
|
|
}
|
|
|
|
self.storage
|
|
.append(series_to_append)
|
|
.await
|
|
.map_err(|error| Error::Storage(error.to_string()))?;
|
|
self.metrics
|
|
.samples_received
|
|
.fetch_add(samples_processed, Ordering::Relaxed);
|
|
|
|
Ok(samples_processed)
|
|
}
|
|
|
|
/// Get current storage statistics
|
|
#[cfg(test)]
|
|
pub async fn storage_stats(&self) -> Result<(usize, usize), Error> {
|
|
let stats = self
|
|
.storage
|
|
.stats()
|
|
.await
|
|
.map_err(|error| Error::Storage(error.to_string()))?;
|
|
Ok((stats.total_samples as usize, stats.active_series as usize))
|
|
}
|
|
}
|
|
|
|
impl IngestionMetrics {
|
|
fn new() -> Self {
|
|
Self {
|
|
samples_received: AtomicU64::new(0),
|
|
samples_invalid: AtomicU64::new(0),
|
|
requests_total: AtomicU64::new(0),
|
|
requests_failed: AtomicU64::new(0),
|
|
started_at: Instant::now(),
|
|
}
|
|
}
|
|
|
|
pub fn snapshot(&self) -> IngestionMetricsSnapshot {
|
|
let uptime = self.started_at.elapsed().as_secs_f64();
|
|
let samples_ingested_total = self.samples_received.load(Ordering::Relaxed);
|
|
IngestionMetricsSnapshot {
|
|
samples_ingested_total,
|
|
write_requests_total: self.requests_total.load(Ordering::Relaxed),
|
|
write_requests_failed: self.requests_failed.load(Ordering::Relaxed),
|
|
samples_per_second: if uptime > 0.0 {
|
|
samples_ingested_total as f64 / uptime
|
|
} else {
|
|
0.0
|
|
},
|
|
buffer_samples: 0,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Axum handler for /api/v1/write endpoint
|
|
async fn handle_remote_write(
|
|
State(service): State<IngestionService>,
|
|
body: Bytes,
|
|
) -> Response {
|
|
service.metrics.requests_total.fetch_add(1, Ordering::Relaxed);
|
|
|
|
debug!("Received remote_write request, size: {} bytes", body.len());
|
|
|
|
// Check request size
|
|
if body.len() > MAX_REQUEST_SIZE {
|
|
warn!("Request too large: {} bytes", body.len());
|
|
return IngestionError::PayloadTooLarge.into_response();
|
|
}
|
|
|
|
// Decompress snappy-encoded payload
|
|
let decompressed = match decompress_snappy(&body) {
|
|
Ok(data) => data,
|
|
Err(e) => {
|
|
error!("Snappy decompression failed: {}", e);
|
|
return IngestionError::DecompressionFailed.into_response();
|
|
}
|
|
};
|
|
|
|
debug!("Decompressed payload: {} bytes", decompressed.len());
|
|
|
|
// Decode protobuf WriteRequest
|
|
let write_request = match WriteRequest::decode(&decompressed[..]) {
|
|
Ok(req) => req,
|
|
Err(e) => {
|
|
error!("Protobuf decode failed: {}", e);
|
|
return IngestionError::InvalidProtobuf.into_response();
|
|
}
|
|
};
|
|
|
|
info!(
|
|
"Decoded WriteRequest with {} time series",
|
|
write_request.timeseries.len()
|
|
);
|
|
|
|
// Process the request
|
|
match service.process_write_request(write_request).await {
|
|
Ok(samples_count) => {
|
|
info!("Successfully ingested {} samples", samples_count);
|
|
(StatusCode::NO_CONTENT, "").into_response()
|
|
}
|
|
Err(Error::Storage(msg)) if msg.contains("buffer full") => {
|
|
warn!("Write buffer full, returning 429");
|
|
service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
|
|
IngestionError::Backpressure.into_response()
|
|
}
|
|
Err(Error::InvalidLabel(msg)) => {
|
|
warn!("Invalid labels: {}", msg);
|
|
service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
|
|
IngestionError::InvalidLabels.into_response()
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to process write request: {}", e);
|
|
service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
|
|
IngestionError::StorageError.into_response()
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Decompress snappy-encoded data
|
|
fn decompress_snappy(compressed: &[u8]) -> Result<Vec<u8>, Error> {
|
|
let mut decoder = SnappyDecoder::new();
|
|
let decompressed_len = snap::raw::decompress_len(compressed)
|
|
.map_err(|e| Error::InvalidMetric(format!("Invalid snappy data: {}", e)))?;
|
|
|
|
let mut decompressed = vec![0u8; decompressed_len];
|
|
decoder
|
|
.decompress(compressed, &mut decompressed)
|
|
.map_err(|e| Error::InvalidMetric(format!("Snappy decompression failed: {}", e)))?;
|
|
|
|
Ok(decompressed)
|
|
}
|
|
|
|
/// Validate and normalize Prometheus labels
|
|
fn validate_labels(labels: Vec<Label>) -> Result<Vec<Label>, Error> {
|
|
if labels.is_empty() {
|
|
return Err(Error::InvalidLabel("Empty label set".into()));
|
|
}
|
|
|
|
// Check for __name__ label (metric name)
|
|
let has_name = labels.iter().any(|l| l.name == "__name__");
|
|
if !has_name {
|
|
return Err(Error::InvalidLabel("Missing __name__ label".into()));
|
|
}
|
|
|
|
let mut validated = Vec::with_capacity(labels.len());
|
|
|
|
for label in labels {
|
|
// Validate label name
|
|
if label.name.is_empty() {
|
|
return Err(Error::InvalidLabel("Empty label name".into()));
|
|
}
|
|
|
|
// Label names must start with [a-zA-Z_]
|
|
let first_char = label.name.chars().next().unwrap();
|
|
if !first_char.is_ascii_alphabetic() && first_char != '_' {
|
|
return Err(Error::InvalidLabel(format!(
|
|
"Invalid label name '{}': must start with [a-zA-Z_]",
|
|
label.name
|
|
)));
|
|
}
|
|
|
|
// Label names must contain only [a-zA-Z0-9_]
|
|
if !label.name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
|
|
return Err(Error::InvalidLabel(format!(
|
|
"Invalid label name '{}': must contain only [a-zA-Z0-9_]",
|
|
label.name
|
|
)));
|
|
}
|
|
|
|
validated.push(label);
|
|
}
|
|
|
|
// Sort labels for consistent fingerprinting
|
|
validated.sort_by(|a, b| a.name.cmp(&b.name));
|
|
|
|
Ok(validated)
|
|
}
|
|
|
|
/// Compute stable fingerprint for a label set
|
|
fn compute_series_fingerprint(labels: &[nightlight_types::Label]) -> u64 {
|
|
use std::collections::hash_map::DefaultHasher;
|
|
use std::hash::{Hash, Hasher};
|
|
|
|
let mut hasher = DefaultHasher::new();
|
|
for label in labels {
|
|
label.name.hash(&mut hasher);
|
|
label.value.hash(&mut hasher);
|
|
}
|
|
hasher.finish()
|
|
}
|
|
|
|
/// Ingestion error types for HTTP responses.
///
/// Each variant maps to a fixed status code and message in its
/// `IntoResponse` implementation.
#[derive(Debug)]
enum IngestionError {
    /// Request body exceeded `MAX_REQUEST_SIZE` → 413.
    PayloadTooLarge,
    /// Snappy payload failed to decompress → 400.
    DecompressionFailed,
    /// Decompressed bytes were not a valid protobuf `WriteRequest` → 400.
    InvalidProtobuf,
    /// Label validation failed → 400.
    InvalidLabels,
    /// Storage append or other processing failure → 500.
    StorageError,
    /// Storage write buffer full → 429; clients should retry later.
    Backpressure,
}
impl IntoResponse for IngestionError {
|
|
fn into_response(self) -> Response {
|
|
let (status, message) = match self {
|
|
IngestionError::PayloadTooLarge => {
|
|
(StatusCode::PAYLOAD_TOO_LARGE, "Request payload too large")
|
|
}
|
|
IngestionError::DecompressionFailed => {
|
|
(StatusCode::BAD_REQUEST, "Snappy decompression failed")
|
|
}
|
|
IngestionError::InvalidProtobuf => {
|
|
(StatusCode::BAD_REQUEST, "Invalid protobuf encoding")
|
|
}
|
|
IngestionError::InvalidLabels => {
|
|
(StatusCode::BAD_REQUEST, "Invalid metric labels")
|
|
}
|
|
IngestionError::StorageError => {
|
|
(StatusCode::INTERNAL_SERVER_ERROR, "Storage error")
|
|
}
|
|
IngestionError::Backpressure => {
|
|
(StatusCode::TOO_MANY_REQUESTS, "Write buffer full, retry later")
|
|
}
|
|
};
|
|
|
|
(status, message).into_response()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::Storage;

    // Well-formed labels pass validation and come back sorted by name.
    #[test]
    fn test_validate_labels_success() {
        let labels = vec![
            Label {
                name: "__name__".to_string(),
                value: "http_requests_total".to_string(),
            },
            Label {
                name: "method".to_string(),
                value: "GET".to_string(),
            },
        ];

        let result = validate_labels(labels);
        assert!(result.is_ok());
        let validated = result.unwrap();
        assert_eq!(validated.len(), 2);
        // Should be sorted: "__name__" < "method" lexicographically.
        assert_eq!(validated[0].name, "__name__");
    }

    // A label set without the mandatory __name__ label is rejected.
    #[test]
    fn test_validate_labels_missing_name() {
        let labels = vec![Label {
            name: "method".to_string(),
            value: "GET".to_string(),
        }];

        let result = validate_labels(labels);
        assert!(result.is_err());
    }

    // Label names must start with [a-zA-Z_]; a leading digit is rejected.
    #[test]
    fn test_validate_labels_invalid_name() {
        let labels = vec![
            Label {
                name: "__name__".to_string(),
                value: "metric".to_string(),
            },
            Label {
                name: "123invalid".to_string(), // Cannot start with digit
                value: "value".to_string(),
            },
        ];

        let result = validate_labels(labels);
        assert!(result.is_err());
    }

    // Fingerprinting the same label slice twice yields the same value
    // (determinism within one process; see compute_series_fingerprint).
    #[test]
    fn test_compute_fingerprint_stable() {
        let labels = vec![
            nightlight_types::Label {
                name: "__name__".to_string(),
                value: "metric".to_string(),
            },
            nightlight_types::Label {
                name: "label1".to_string(),
                value: "value1".to_string(),
            },
        ];

        let fp1 = compute_series_fingerprint(&labels);
        let fp2 = compute_series_fingerprint(&labels);
        assert_eq!(fp1, fp2);
    }

    // A freshly created service reports zero samples and zero series.
    #[tokio::test]
    async fn test_ingestion_service_storage() {
        let dir = tempfile::tempdir().unwrap();
        let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
        let service = IngestionService::new(storage);
        let (samples, series) = service.storage_stats().await.unwrap();
        assert_eq!(samples, 0);
        assert_eq!(series, 0);
    }

    // End-to-end: a write request is validated, appended, flushed to disk,
    // and readable again after reopening the same storage directory.
    #[tokio::test]
    async fn test_process_write_request_persists_samples() {
        let dir = tempfile::tempdir().unwrap();
        let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
        let service = IngestionService::new(Arc::clone(&storage));

        let request = WriteRequest {
            timeseries: vec![nightlight_api::prometheus::TimeSeries {
                labels: vec![
                    Label {
                        name: "__name__".to_string(),
                        value: "ingest_metric".to_string(),
                    },
                    Label {
                        name: "job".to_string(),
                        value: "test".to_string(),
                    },
                ],
                samples: vec![nightlight_api::prometheus::Sample {
                    value: 42.0,
                    timestamp: 1_000,
                }],
            }],
        };

        let processed = service.process_write_request(request).await.unwrap();
        assert_eq!(processed, 1);

        // Persist, then reopen the directory with a fresh Storage instance.
        storage.flush().await.unwrap();
        let reloaded = Storage::new(dir.path().to_str().unwrap()).unwrap();
        let ids = reloaded
            .find_series(vec![
                "__name__=ingest_metric".to_string(),
                "job=test".to_string(),
            ])
            .await
            .unwrap();
        assert_eq!(ids.len(), 1);
        let series = reloaded
            .query_series(ids[0], 0, 10_000)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(series.samples.len(), 1);
        assert_eq!(series.samples[0].value, 42.0);
    }
}