nightlight: add durable grpc runtime
This commit is contained in:
parent
9dfe86f92a
commit
bd09761def
8 changed files with 1379 additions and 287 deletions
|
|
@ -129,17 +129,6 @@ impl Config {
|
||||||
Ok(config)
|
Ok(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Load configuration from file, or use defaults if file doesn't exist
|
|
||||||
pub fn load_or_default() -> Result<Self> {
|
|
||||||
match Self::from_file("config.yaml") {
|
|
||||||
Ok(config) => Ok(config),
|
|
||||||
Err(_) => {
|
|
||||||
tracing::warn!("No config file found, using defaults");
|
|
||||||
Ok(Self::default())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Save configuration to a YAML file
|
/// Save configuration to a YAML file
|
||||||
pub fn save(&self, path: &str) -> Result<()> {
|
pub fn save(&self, path: &str) -> Result<()> {
|
||||||
let content = serde_yaml::to_string(self)?;
|
let content = serde_yaml::to_string(self)?;
|
||||||
|
|
|
||||||
502
nightlight/crates/nightlight-server/src/grpc.rs
Normal file
502
nightlight/crates/nightlight-server/src/grpc.rs
Normal file
|
|
@ -0,0 +1,502 @@
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use chrono::Utc;
|
||||||
|
use tonic::{Request, Response, Status};
|
||||||
|
|
||||||
|
use crate::ingestion::IngestionMetrics;
|
||||||
|
use crate::query::{QueryMetrics, QueryResult as InstantQueryData, QueryService, RangeQueryResult};
|
||||||
|
use crate::storage::{Storage, StorageStats};
|
||||||
|
use nightlight_api::nightlight::admin_server::Admin;
|
||||||
|
use nightlight_api::nightlight::metric_query_server::MetricQuery;
|
||||||
|
use nightlight_api::nightlight::{
|
||||||
|
BuildInfoRequest, BuildInfoResponse, ComponentHealth, HealthRequest, HealthResponse,
|
||||||
|
LabelValuesRequest, LabelValuesResponse, QueryData, QueryResponse, QueryResult, SamplePair,
|
||||||
|
SeriesLabels, SeriesQueryRequest, SeriesQueryResponse, StatsRequest, StatsResponse,
|
||||||
|
};
|
||||||
|
use nightlight_types::Error;
|
||||||
|
|
||||||
|
/// Handler type for the `MetricQuery` gRPC service; every RPC delegates to the wrapped `QueryService`.
#[derive(Clone)]
|
||||||
|
pub struct MetricQueryServiceImpl {
|
||||||
|
// Query engine used by the instant, range, series and label-values handlers.
query: QueryService,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handler type for the `Admin` gRPC service (health, stats and build info).
#[derive(Clone)]
|
||||||
|
pub struct AdminServiceImpl {
|
||||||
|
// Storage handle whose `stats()` backs both the health check and the stats RPC.
storage: Arc<Storage>,
|
||||||
|
// Shared ingestion counters; snapshotted when stats are requested.
ingestion_metrics: Arc<IngestionMetrics>,
|
||||||
|
// Shared query counters; snapshotted when stats are requested.
query_metrics: Arc<QueryMetrics>,
|
||||||
|
// Instant captured in `new`; used to compute `uptime_seconds`.
started_at: Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MetricQueryServiceImpl {
|
||||||
|
pub fn new(query: QueryService) -> Self {
|
||||||
|
Self { query }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AdminServiceImpl {
|
||||||
|
pub fn new(
|
||||||
|
storage: Arc<Storage>,
|
||||||
|
ingestion_metrics: Arc<IngestionMetrics>,
|
||||||
|
query_metrics: Arc<QueryMetrics>,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
storage,
|
||||||
|
ingestion_metrics,
|
||||||
|
query_metrics,
|
||||||
|
started_at: Instant::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tonic::async_trait]
|
||||||
|
impl MetricQuery for MetricQueryServiceImpl {
|
||||||
|
async fn instant_query(
|
||||||
|
&self,
|
||||||
|
request: Request<nightlight_api::nightlight::InstantQueryRequest>,
|
||||||
|
) -> Result<Response<QueryResponse>, Status> {
|
||||||
|
let request = request.into_inner();
|
||||||
|
let time = if request.time == 0 {
|
||||||
|
Utc::now().timestamp_millis()
|
||||||
|
} else {
|
||||||
|
request.time
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = match self.query.execute_instant_query(&request.query, time).await {
|
||||||
|
Ok(result) => QueryResponse {
|
||||||
|
status: "success".to_string(),
|
||||||
|
data: Some(instant_query_data_to_proto(result)),
|
||||||
|
error: String::new(),
|
||||||
|
error_type: String::new(),
|
||||||
|
warnings: Vec::new(),
|
||||||
|
},
|
||||||
|
Err(error) => QueryResponse {
|
||||||
|
status: "error".to_string(),
|
||||||
|
data: None,
|
||||||
|
error: error.to_string(),
|
||||||
|
error_type: query_error_type(&error).to_string(),
|
||||||
|
warnings: Vec::new(),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Response::new(response))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn range_query(
|
||||||
|
&self,
|
||||||
|
request: Request<nightlight_api::nightlight::RangeQueryRequest>,
|
||||||
|
) -> Result<Response<QueryResponse>, Status> {
|
||||||
|
let request = request.into_inner();
|
||||||
|
|
||||||
|
let response = match self
|
||||||
|
.query
|
||||||
|
.execute_range_query(&request.query, request.start, request.end, request.step)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(result) => QueryResponse {
|
||||||
|
status: "success".to_string(),
|
||||||
|
data: Some(range_query_data_to_proto(result)),
|
||||||
|
error: String::new(),
|
||||||
|
error_type: String::new(),
|
||||||
|
warnings: Vec::new(),
|
||||||
|
},
|
||||||
|
Err(error) => QueryResponse {
|
||||||
|
status: "error".to_string(),
|
||||||
|
data: None,
|
||||||
|
error: error.to_string(),
|
||||||
|
error_type: query_error_type(&error).to_string(),
|
||||||
|
warnings: Vec::new(),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Response::new(response))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn series_query(
|
||||||
|
&self,
|
||||||
|
request: Request<SeriesQueryRequest>,
|
||||||
|
) -> Result<Response<SeriesQueryResponse>, Status> {
|
||||||
|
let request = request.into_inner();
|
||||||
|
let response = match self
|
||||||
|
.query
|
||||||
|
.series_metadata(
|
||||||
|
&request.r#match,
|
||||||
|
optional_millis(request.start),
|
||||||
|
optional_millis(request.end),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(series) => SeriesQueryResponse {
|
||||||
|
status: "success".to_string(),
|
||||||
|
data: series
|
||||||
|
.into_iter()
|
||||||
|
.map(|labels| SeriesLabels { labels })
|
||||||
|
.collect(),
|
||||||
|
error: String::new(),
|
||||||
|
},
|
||||||
|
Err(error) => SeriesQueryResponse {
|
||||||
|
status: "error".to_string(),
|
||||||
|
data: Vec::new(),
|
||||||
|
error: error.to_string(),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Response::new(response))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn label_values_query(
|
||||||
|
&self,
|
||||||
|
request: Request<LabelValuesRequest>,
|
||||||
|
) -> Result<Response<LabelValuesResponse>, Status> {
|
||||||
|
let request = request.into_inner();
|
||||||
|
let response = match self
|
||||||
|
.query
|
||||||
|
.label_values_for_matchers(
|
||||||
|
&request.label_name,
|
||||||
|
&request.r#match,
|
||||||
|
optional_millis(request.start),
|
||||||
|
optional_millis(request.end),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(values) => LabelValuesResponse {
|
||||||
|
status: "success".to_string(),
|
||||||
|
data: values,
|
||||||
|
error: String::new(),
|
||||||
|
},
|
||||||
|
Err(error) => LabelValuesResponse {
|
||||||
|
status: "error".to_string(),
|
||||||
|
data: Vec::new(),
|
||||||
|
error: error.to_string(),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Response::new(response))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tonic::async_trait]
|
||||||
|
impl Admin for AdminServiceImpl {
|
||||||
|
async fn health(
|
||||||
|
&self,
|
||||||
|
_request: Request<HealthRequest>,
|
||||||
|
) -> Result<Response<HealthResponse>, Status> {
|
||||||
|
let storage_result = self.storage.stats().await;
|
||||||
|
let status = if storage_result.is_ok() { "ok" } else { "degraded" };
|
||||||
|
let storage_message = match &storage_result {
|
||||||
|
Ok(_) => "storage ready".to_string(),
|
||||||
|
Err(error) => error.to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Response::new(HealthResponse {
|
||||||
|
status: status.to_string(),
|
||||||
|
message: "nightlight ready".to_string(),
|
||||||
|
components: vec![
|
||||||
|
ComponentHealth {
|
||||||
|
name: "storage".to_string(),
|
||||||
|
status: status.to_string(),
|
||||||
|
message: storage_message,
|
||||||
|
},
|
||||||
|
ComponentHealth {
|
||||||
|
name: "ingestion".to_string(),
|
||||||
|
status: "ok".to_string(),
|
||||||
|
message: "remote_write endpoint ready".to_string(),
|
||||||
|
},
|
||||||
|
ComponentHealth {
|
||||||
|
name: "query_engine".to_string(),
|
||||||
|
status: "ok".to_string(),
|
||||||
|
message: "http and grpc query paths ready".to_string(),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn stats(
|
||||||
|
&self,
|
||||||
|
_request: Request<StatsRequest>,
|
||||||
|
) -> Result<Response<StatsResponse>, Status> {
|
||||||
|
let storage = self
|
||||||
|
.storage
|
||||||
|
.stats()
|
||||||
|
.await
|
||||||
|
.map_err(|error| Status::internal(error.to_string()))?;
|
||||||
|
let ingestion = self.ingestion_metrics.snapshot();
|
||||||
|
let query = self.query_metrics.snapshot();
|
||||||
|
|
||||||
|
Ok(Response::new(StatsResponse {
|
||||||
|
storage: Some(storage_stats_to_proto(storage)),
|
||||||
|
ingestion: Some(nightlight_api::nightlight::IngestionStats {
|
||||||
|
samples_ingested_total: ingestion.samples_ingested_total,
|
||||||
|
write_requests_total: ingestion.write_requests_total,
|
||||||
|
write_requests_failed: ingestion.write_requests_failed,
|
||||||
|
samples_per_second: ingestion.samples_per_second,
|
||||||
|
buffer_samples: ingestion.buffer_samples,
|
||||||
|
}),
|
||||||
|
query: Some(nightlight_api::nightlight::QueryStats {
|
||||||
|
queries_total: query.queries_total,
|
||||||
|
queries_failed: query.queries_failed,
|
||||||
|
queries_active: query.queries_active,
|
||||||
|
query_duration_p50: query.query_duration_p50,
|
||||||
|
query_duration_p95: query.query_duration_p95,
|
||||||
|
query_duration_p99: query.query_duration_p99,
|
||||||
|
}),
|
||||||
|
uptime_seconds: self.started_at.elapsed().as_secs(),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn build_info(
|
||||||
|
&self,
|
||||||
|
_request: Request<BuildInfoRequest>,
|
||||||
|
) -> Result<Response<BuildInfoResponse>, Status> {
|
||||||
|
Ok(Response::new(BuildInfoResponse {
|
||||||
|
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||||
|
commit: option_env!("GIT_COMMIT").unwrap_or("unknown").to_string(),
|
||||||
|
build_time: option_env!("BUILD_TIME").unwrap_or("unknown").to_string(),
|
||||||
|
rust_version: option_env!("RUSTC_VERSION").unwrap_or("unknown").to_string(),
|
||||||
|
target: format!("{}-{}", std::env::consts::ARCH, std::env::consts::OS),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Maps the wire default `0` to `None`; any other value is passed through as `Some`.
fn optional_millis(value: i64) -> Option<i64> {
    (value != 0).then_some(value)
}
|
||||||
|
|
||||||
|
fn query_error_type(error: &Error) -> &'static str {
|
||||||
|
match error {
|
||||||
|
Error::InvalidMetric(_) | Error::InvalidLabel(_) | Error::InvalidTimeRange(_) => "bad_data",
|
||||||
|
Error::Timeout(_) => "timeout",
|
||||||
|
_ => "execution",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn instant_query_data_to_proto(result: InstantQueryData) -> QueryData {
|
||||||
|
QueryData {
|
||||||
|
result_type: result.result_type,
|
||||||
|
result: result
|
||||||
|
.result
|
||||||
|
.into_iter()
|
||||||
|
.map(|series| QueryResult {
|
||||||
|
metric: series.metric,
|
||||||
|
values: Vec::new(),
|
||||||
|
value: series.value.map(sample_pair_from_tuple),
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn range_query_data_to_proto(result: RangeQueryResult) -> QueryData {
|
||||||
|
QueryData {
|
||||||
|
result_type: result.result_type,
|
||||||
|
result: result
|
||||||
|
.result
|
||||||
|
.into_iter()
|
||||||
|
.map(|series| QueryResult {
|
||||||
|
metric: series.metric,
|
||||||
|
values: series
|
||||||
|
.values
|
||||||
|
.into_iter()
|
||||||
|
.map(sample_pair_from_tuple)
|
||||||
|
.collect(),
|
||||||
|
value: None,
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sample_pair_from_tuple((timestamp, value): (i64, f64)) -> SamplePair {
|
||||||
|
SamplePair { timestamp, value }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn storage_stats_to_proto(stats: StorageStats) -> nightlight_api::nightlight::StorageStats {
|
||||||
|
nightlight_api::nightlight::StorageStats {
|
||||||
|
active_series: stats.active_series,
|
||||||
|
total_samples: stats.total_samples,
|
||||||
|
blocks_count: stats.blocks_count,
|
||||||
|
head_samples: stats.head_samples,
|
||||||
|
disk_bytes_used: stats.disk_bytes_used,
|
||||||
|
oldest_sample_time: stats.oldest_sample_time,
|
||||||
|
newest_sample_time: stats.newest_sample_time,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ingestion::IngestionService;
    use crate::storage::Storage;
    use nightlight_api::nightlight::{
        InstantQueryRequest, LabelValuesRequest, SeriesQueryRequest,
    };
    use nightlight_api::prometheus::{Label, Sample, TimeSeries, WriteRequest};

    /// Builds a remote-write series with the given label pairs and exactly one sample.
    fn single_sample_series(labels: &[(&str, &str)], value: f64, timestamp: i64) -> TimeSeries {
        TimeSeries {
            labels: labels
                .iter()
                .map(|&(name, value)| Label {
                    name: name.to_string(),
                    value: value.to_string(),
                })
                .collect(),
            samples: vec![Sample { value, timestamp }],
        }
    }

    /// Creates storage in a fresh temp dir and ingests `timeseries` through an `IngestionService`.
    ///
    /// The returned `TempDir` must stay alive for as long as the storage is used.
    async fn ingest(
        timeseries: Vec<TimeSeries>,
    ) -> (tempfile::TempDir, Arc<Storage>, IngestionService) {
        let dir = tempfile::tempdir().unwrap();
        let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
        let ingestion = IngestionService::new(Arc::clone(&storage));
        ingestion
            .process_write_request(WriteRequest { timeseries })
            .await
            .unwrap();
        (dir, storage, ingestion)
    }

    #[tokio::test]
    async fn instant_query_grpc_returns_metric_data() {
        let (_dir, storage, _ingestion) = ingest(vec![single_sample_series(
            &[("__name__", "grpc_metric"), ("job", "nightlight")],
            12.5,
            1_000,
        )])
        .await;

        let service = MetricQueryServiceImpl::new(QueryService::from_storage(storage.queryable()));
        let request = InstantQueryRequest {
            query: "grpc_metric{job=\"nightlight\"}".to_string(),
            time: 2_000,
            timeout: 0,
        };
        let response = service
            .instant_query(Request::new(request))
            .await
            .unwrap()
            .into_inner();

        assert_eq!(response.status, "success");
        let data = response.data.unwrap();
        assert_eq!(data.result.len(), 1);
        assert_eq!(
            data.result[0].metric.get("__name__").map(String::as_str),
            Some("grpc_metric")
        );
        assert_eq!(data.result[0].value.as_ref().map(|value| value.value), Some(12.5));
    }

    #[tokio::test]
    async fn metadata_queries_grpc_filter_series() {
        let (_dir, storage, _ingestion) = ingest(vec![
            single_sample_series(&[("__name__", "grpc_metric"), ("job", "api")], 1.0, 1_000),
            single_sample_series(&[("__name__", "grpc_metric"), ("job", "worker")], 2.0, 2_000),
        ])
        .await;

        let service = MetricQueryServiceImpl::new(QueryService::from_storage(storage.queryable()));

        let series_request = SeriesQueryRequest {
            r#match: vec!["job=api".to_string()],
            start: 0,
            end: 0,
        };
        let series = service
            .series_query(Request::new(series_request))
            .await
            .unwrap()
            .into_inner();
        assert_eq!(series.status, "success");
        assert_eq!(series.data.len(), 1);

        let label_request = LabelValuesRequest {
            label_name: "job".to_string(),
            r#match: vec!["__name__=grpc_metric".to_string()],
            start: 0,
            end: 0,
        };
        let label_values = service
            .label_values_query(Request::new(label_request))
            .await
            .unwrap()
            .into_inner();
        assert_eq!(label_values.status, "success");
        assert_eq!(label_values.data, vec!["api".to_string(), "worker".to_string()]);
    }

    #[tokio::test]
    async fn admin_stats_report_ingestion_and_query_counters() {
        let (_dir, storage, ingestion) = ingest(vec![single_sample_series(
            &[("__name__", "admin_metric")],
            3.0,
            1_000,
        )])
        .await;

        let query = QueryService::from_storage(storage.queryable());
        query.execute_instant_query("admin_metric", 2_000).await.unwrap();

        let admin = AdminServiceImpl::new(
            Arc::clone(&storage),
            ingestion.metrics(),
            query.metrics(),
        );
        let stats = admin
            .stats(Request::new(StatsRequest {}))
            .await
            .unwrap()
            .into_inner();

        assert_eq!(stats.storage.as_ref().map(|value| value.total_samples), Some(1));
        assert_eq!(
            stats.ingestion
                .as_ref()
                .map(|value| value.samples_ingested_total),
            Some(1)
        );
        assert_eq!(stats.query.as_ref().map(|value| value.queries_total), Some(1));
    }
}
|
||||||
|
|
@ -16,10 +16,11 @@ use nightlight_types::Error;
|
||||||
use prost::Message;
|
use prost::Message;
|
||||||
use snap::raw::Decoder as SnappyDecoder;
|
use snap::raw::Decoder as SnappyDecoder;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::sync::RwLock;
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
use std::time::Instant;
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info, warn};
|
||||||
|
|
||||||
use crate::query::QueryableStorage;
|
use crate::storage::Storage;
|
||||||
|
|
||||||
/// Maximum write request size (10 MB uncompressed)
|
/// Maximum write request size (10 MB uncompressed)
|
||||||
const MAX_REQUEST_SIZE: usize = 10 * 1024 * 1024;
|
const MAX_REQUEST_SIZE: usize = 10 * 1024 * 1024;
|
||||||
|
|
@ -27,28 +28,33 @@ const MAX_REQUEST_SIZE: usize = 10 * 1024 * 1024;
|
||||||
/// Ingestion service state
|
/// Ingestion service state
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct IngestionService {
|
pub struct IngestionService {
|
||||||
storage: Arc<RwLock<QueryableStorage>>,
|
storage: Arc<Storage>,
|
||||||
metrics: Arc<IngestionMetrics>,
|
metrics: Arc<IngestionMetrics>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Ingestion metrics for monitoring
|
#[derive(Debug)]
|
||||||
struct IngestionMetrics {
|
pub struct IngestionMetrics {
|
||||||
samples_received: Arc<std::sync::atomic::AtomicU64>,
|
samples_received: AtomicU64,
|
||||||
samples_invalid: Arc<std::sync::atomic::AtomicU64>,
|
samples_invalid: AtomicU64,
|
||||||
requests_total: Arc<std::sync::atomic::AtomicU64>,
|
requests_total: AtomicU64,
|
||||||
requests_failed: Arc<std::sync::atomic::AtomicU64>,
|
requests_failed: AtomicU64,
|
||||||
|
started_at: Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, Default)]
|
||||||
|
pub struct IngestionMetricsSnapshot {
|
||||||
|
pub samples_ingested_total: u64,
|
||||||
|
pub write_requests_total: u64,
|
||||||
|
pub write_requests_failed: u64,
|
||||||
|
pub samples_per_second: f64,
|
||||||
|
pub buffer_samples: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IngestionService {
|
impl IngestionService {
|
||||||
pub fn new(storage: Arc<RwLock<QueryableStorage>>) -> Self {
|
pub fn new(storage: Arc<Storage>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
storage,
|
storage,
|
||||||
metrics: Arc::new(IngestionMetrics {
|
metrics: Arc::new(IngestionMetrics::new()),
|
||||||
samples_received: Arc::new(std::sync::atomic::AtomicU64::new(0)),
|
|
||||||
samples_invalid: Arc::new(std::sync::atomic::AtomicU64::new(0)),
|
|
||||||
requests_total: Arc::new(std::sync::atomic::AtomicU64::new(0)),
|
|
||||||
requests_failed: Arc::new(std::sync::atomic::AtomicU64::new(0)),
|
|
||||||
}),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -59,10 +65,14 @@ impl IngestionService {
|
||||||
.with_state(self)
|
.with_state(self)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn metrics(&self) -> Arc<IngestionMetrics> {
|
||||||
|
Arc::clone(&self.metrics)
|
||||||
|
}
|
||||||
|
|
||||||
/// Process a WriteRequest and write to shared storage
|
/// Process a WriteRequest and write to shared storage
|
||||||
async fn process_write_request(&self, request: WriteRequest) -> Result<u64, Error> {
|
pub(crate) async fn process_write_request(&self, request: WriteRequest) -> Result<u64, Error> {
|
||||||
let mut storage = self.storage.write().await;
|
|
||||||
let mut samples_processed = 0;
|
let mut samples_processed = 0;
|
||||||
|
let mut series_to_append = Vec::new();
|
||||||
|
|
||||||
for ts in request.timeseries {
|
for ts in request.timeseries {
|
||||||
// Validate and normalize labels
|
// Validate and normalize labels
|
||||||
|
|
@ -83,7 +93,7 @@ impl IngestionService {
|
||||||
// Validate sample
|
// Validate sample
|
||||||
if !sample.value.is_finite() {
|
if !sample.value.is_finite() {
|
||||||
warn!("Invalid sample value: {}", sample.value);
|
warn!("Invalid sample value: {}", sample.value);
|
||||||
self.metrics.samples_invalid.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
self.metrics.samples_invalid.fetch_add(1, Ordering::Relaxed);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -113,20 +123,56 @@ impl IngestionService {
|
||||||
samples: internal_samples,
|
samples: internal_samples,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Write to shared storage (upsert merges samples)
|
series_to_append.push(time_series);
|
||||||
storage.upsert_series(time_series);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.metrics.samples_received.fetch_add(samples_processed, std::sync::atomic::Ordering::Relaxed);
|
self.storage
|
||||||
|
.append(series_to_append)
|
||||||
|
.await
|
||||||
|
.map_err(|error| Error::Storage(error.to_string()))?;
|
||||||
|
self.metrics
|
||||||
|
.samples_received
|
||||||
|
.fetch_add(samples_processed, Ordering::Relaxed);
|
||||||
|
|
||||||
Ok(samples_processed)
|
Ok(samples_processed)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get current storage statistics
|
/// Get current storage statistics
|
||||||
pub async fn storage_stats(&self) -> (usize, usize) {
|
pub async fn storage_stats(&self) -> Result<(usize, usize), Error> {
|
||||||
let storage = self.storage.read().await;
|
let stats = self
|
||||||
let total_samples: usize = storage.series.values().map(|s| s.samples.len()).sum();
|
.storage
|
||||||
(total_samples, storage.series.len())
|
.stats()
|
||||||
|
.await
|
||||||
|
.map_err(|error| Error::Storage(error.to_string()))?;
|
||||||
|
Ok((stats.total_samples as usize, stats.active_series as usize))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IngestionMetrics {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
samples_received: AtomicU64::new(0),
|
||||||
|
samples_invalid: AtomicU64::new(0),
|
||||||
|
requests_total: AtomicU64::new(0),
|
||||||
|
requests_failed: AtomicU64::new(0),
|
||||||
|
started_at: Instant::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn snapshot(&self) -> IngestionMetricsSnapshot {
|
||||||
|
let uptime = self.started_at.elapsed().as_secs_f64();
|
||||||
|
let samples_ingested_total = self.samples_received.load(Ordering::Relaxed);
|
||||||
|
IngestionMetricsSnapshot {
|
||||||
|
samples_ingested_total,
|
||||||
|
write_requests_total: self.requests_total.load(Ordering::Relaxed),
|
||||||
|
write_requests_failed: self.requests_failed.load(Ordering::Relaxed),
|
||||||
|
samples_per_second: if uptime > 0.0 {
|
||||||
|
samples_ingested_total as f64 / uptime
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
},
|
||||||
|
buffer_samples: 0,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -135,7 +181,7 @@ async fn handle_remote_write(
|
||||||
State(service): State<IngestionService>,
|
State(service): State<IngestionService>,
|
||||||
body: Bytes,
|
body: Bytes,
|
||||||
) -> Response {
|
) -> Response {
|
||||||
service.metrics.requests_total.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
service.metrics.requests_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
|
||||||
debug!("Received remote_write request, size: {} bytes", body.len());
|
debug!("Received remote_write request, size: {} bytes", body.len());
|
||||||
|
|
||||||
|
|
@ -150,7 +196,7 @@ async fn handle_remote_write(
|
||||||
Ok(data) => data,
|
Ok(data) => data,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Snappy decompression failed: {}", e);
|
error!("Snappy decompression failed: {}", e);
|
||||||
return IngestionError::DecompressionFailed(e.to_string()).into_response();
|
return IngestionError::DecompressionFailed.into_response();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -161,7 +207,7 @@ async fn handle_remote_write(
|
||||||
Ok(req) => req,
|
Ok(req) => req,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Protobuf decode failed: {}", e);
|
error!("Protobuf decode failed: {}", e);
|
||||||
return IngestionError::InvalidProtobuf(e.to_string()).into_response();
|
return IngestionError::InvalidProtobuf.into_response();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -178,18 +224,18 @@ async fn handle_remote_write(
|
||||||
}
|
}
|
||||||
Err(Error::Storage(msg)) if msg.contains("buffer full") => {
|
Err(Error::Storage(msg)) if msg.contains("buffer full") => {
|
||||||
warn!("Write buffer full, returning 429");
|
warn!("Write buffer full, returning 429");
|
||||||
service.metrics.requests_failed.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
|
||||||
IngestionError::Backpressure.into_response()
|
IngestionError::Backpressure.into_response()
|
||||||
}
|
}
|
||||||
Err(Error::InvalidLabel(msg)) => {
|
Err(Error::InvalidLabel(msg)) => {
|
||||||
warn!("Invalid labels: {}", msg);
|
warn!("Invalid labels: {}", msg);
|
||||||
service.metrics.requests_failed.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
|
||||||
IngestionError::InvalidLabels(msg).into_response()
|
IngestionError::InvalidLabels.into_response()
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Failed to process write request: {}", e);
|
error!("Failed to process write request: {}", e);
|
||||||
service.metrics.requests_failed.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
service.metrics.requests_failed.fetch_add(1, Ordering::Relaxed);
|
||||||
IngestionError::StorageError(e.to_string()).into_response()
|
IngestionError::StorageError.into_response()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -271,10 +317,10 @@ fn compute_series_fingerprint(labels: &[nightlight_types::Label]) -> u64 {
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
enum IngestionError {
|
enum IngestionError {
|
||||||
PayloadTooLarge,
|
PayloadTooLarge,
|
||||||
DecompressionFailed(String),
|
DecompressionFailed,
|
||||||
InvalidProtobuf(String),
|
InvalidProtobuf,
|
||||||
InvalidLabels(String),
|
InvalidLabels,
|
||||||
StorageError(String),
|
StorageError,
|
||||||
Backpressure,
|
Backpressure,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -284,16 +330,16 @@ impl IntoResponse for IngestionError {
|
||||||
IngestionError::PayloadTooLarge => {
|
IngestionError::PayloadTooLarge => {
|
||||||
(StatusCode::PAYLOAD_TOO_LARGE, "Request payload too large")
|
(StatusCode::PAYLOAD_TOO_LARGE, "Request payload too large")
|
||||||
}
|
}
|
||||||
IngestionError::DecompressionFailed(_) => {
|
IngestionError::DecompressionFailed => {
|
||||||
(StatusCode::BAD_REQUEST, "Snappy decompression failed")
|
(StatusCode::BAD_REQUEST, "Snappy decompression failed")
|
||||||
}
|
}
|
||||||
IngestionError::InvalidProtobuf(_) => {
|
IngestionError::InvalidProtobuf => {
|
||||||
(StatusCode::BAD_REQUEST, "Invalid protobuf encoding")
|
(StatusCode::BAD_REQUEST, "Invalid protobuf encoding")
|
||||||
}
|
}
|
||||||
IngestionError::InvalidLabels(_) => {
|
IngestionError::InvalidLabels => {
|
||||||
(StatusCode::BAD_REQUEST, "Invalid metric labels")
|
(StatusCode::BAD_REQUEST, "Invalid metric labels")
|
||||||
}
|
}
|
||||||
IngestionError::StorageError(_) => {
|
IngestionError::StorageError => {
|
||||||
(StatusCode::INTERNAL_SERVER_ERROR, "Storage error")
|
(StatusCode::INTERNAL_SERVER_ERROR, "Storage error")
|
||||||
}
|
}
|
||||||
IngestionError::Backpressure => {
|
IngestionError::Backpressure => {
|
||||||
|
|
@ -308,6 +354,7 @@ impl IntoResponse for IngestionError {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use crate::storage::Storage;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_validate_labels_success() {
|
fn test_validate_labels_success() {
|
||||||
|
|
@ -378,16 +425,58 @@ mod tests {
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_ingestion_service_storage() {
|
async fn test_ingestion_service_storage() {
|
||||||
use crate::query::QueryableStorage;
|
let dir = tempfile::tempdir().unwrap();
|
||||||
use std::collections::HashMap;
|
let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
|
||||||
|
|
||||||
let storage = Arc::new(RwLock::new(QueryableStorage {
|
|
||||||
series: HashMap::new(),
|
|
||||||
label_index: HashMap::new(),
|
|
||||||
}));
|
|
||||||
let service = IngestionService::new(storage);
|
let service = IngestionService::new(storage);
|
||||||
let (samples, series) = service.storage_stats().await;
|
let (samples, series) = service.storage_stats().await.unwrap();
|
||||||
assert_eq!(samples, 0);
|
assert_eq!(samples, 0);
|
||||||
assert_eq!(series, 0);
|
assert_eq!(series, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_process_write_request_persists_samples() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let storage = Arc::new(Storage::new(dir.path().to_str().unwrap()).unwrap());
|
||||||
|
let service = IngestionService::new(Arc::clone(&storage));
|
||||||
|
|
||||||
|
let request = WriteRequest {
|
||||||
|
timeseries: vec![nightlight_api::prometheus::TimeSeries {
|
||||||
|
labels: vec![
|
||||||
|
Label {
|
||||||
|
name: "__name__".to_string(),
|
||||||
|
value: "ingest_metric".to_string(),
|
||||||
|
},
|
||||||
|
Label {
|
||||||
|
name: "job".to_string(),
|
||||||
|
value: "test".to_string(),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
samples: vec![nightlight_api::prometheus::Sample {
|
||||||
|
value: 42.0,
|
||||||
|
timestamp: 1_000,
|
||||||
|
}],
|
||||||
|
}],
|
||||||
|
};
|
||||||
|
|
||||||
|
let processed = service.process_write_request(request).await.unwrap();
|
||||||
|
assert_eq!(processed, 1);
|
||||||
|
|
||||||
|
storage.flush().await.unwrap();
|
||||||
|
let reloaded = Storage::new(dir.path().to_str().unwrap()).unwrap();
|
||||||
|
let ids = reloaded
|
||||||
|
.find_series(vec![
|
||||||
|
"__name__=ingest_metric".to_string(),
|
||||||
|
"job=test".to_string(),
|
||||||
|
])
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(ids.len(), 1);
|
||||||
|
let series = reloaded
|
||||||
|
.query_series(ids[0], 0, 10_000)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(series.samples.len(), 1);
|
||||||
|
assert_eq!(series.samples[0].value, 42.0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
//! This library exposes the internal modules for integration testing.
|
//! This library exposes the internal modules for integration testing.
|
||||||
|
|
||||||
pub mod config;
|
pub mod config;
|
||||||
|
pub mod grpc;
|
||||||
pub mod ingestion;
|
pub mod ingestion;
|
||||||
pub mod query;
|
pub mod query;
|
||||||
pub mod storage;
|
pub mod storage;
|
||||||
|
|
|
||||||
|
|
@ -1,127 +1,202 @@
|
||||||
//! Nightlight Server
|
//! Nightlight server binary.
|
||||||
//!
|
//!
|
||||||
//! A Prometheus-compatible metrics storage system with mTLS support.
|
//! Nightlight exposes:
|
||||||
//!
|
//! - Prometheus remote_write ingestion over HTTP
|
||||||
//! # Architecture
|
//! - PromQL-compatible query endpoints over HTTP and gRPC
|
||||||
//!
|
//! - gRPC admin endpoints for health and stats
|
||||||
//! - **Ingestion**: Prometheus remote_write API (HTTP POST with snappy compression)
|
//! - durable local storage backed by a WAL and snapshots
|
||||||
//! - **Query**: PromQL query engine (gRPC and HTTP APIs)
|
|
||||||
//! - **Storage**: Time-series database with retention and compaction
|
use std::net::SocketAddr;
|
||||||
//! - **Security**: mTLS for all connections (following T027 patterns)
|
use std::sync::Arc;
|
||||||
//!
|
use std::time::Duration;
|
||||||
//! # Configuration
|
|
||||||
//!
|
|
||||||
//! Configuration is loaded from a YAML file (default: config.yaml).
|
|
||||||
//! See config.rs for the full configuration schema.
|
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use tracing::{info, Level};
|
use axum::{routing::get, Router};
|
||||||
|
use nightlight_api::nightlight::admin_server::AdminServer;
|
||||||
|
use nightlight_api::nightlight::metric_query_server::MetricQueryServer;
|
||||||
|
use tokio::time::MissedTickBehavior;
|
||||||
|
use tonic::transport::Server as TonicServer;
|
||||||
|
use tonic_health::server::health_reporter;
|
||||||
|
use tracing::{error, info, warn, Level};
|
||||||
|
|
||||||
mod config;
|
mod config;
|
||||||
|
mod grpc;
|
||||||
mod ingestion;
|
mod ingestion;
|
||||||
mod query;
|
mod query;
|
||||||
mod storage;
|
mod storage;
|
||||||
|
|
||||||
use config::Config;
|
use config::{Config, StorageConfig};
|
||||||
|
use grpc::{AdminServiceImpl, MetricQueryServiceImpl};
|
||||||
|
use ingestion::IngestionService;
|
||||||
|
use query::QueryService;
|
||||||
|
use storage::Storage;
|
||||||
|
|
||||||
|
const DEFAULT_SNAPSHOT_INTERVAL_SECS: u64 = 30;
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
// Initialize tracing subscriber for structured logging
|
|
||||||
tracing_subscriber::fmt()
|
tracing_subscriber::fmt()
|
||||||
.with_max_level(Level::INFO)
|
.with_max_level(Level::INFO)
|
||||||
.with_target(false)
|
.with_target(false)
|
||||||
.with_thread_ids(true)
|
.with_thread_ids(true)
|
||||||
.init();
|
.init();
|
||||||
|
|
||||||
info!("Nightlight server starting...");
|
info!("Nightlight server starting");
|
||||||
info!("Version: {}", env!("CARGO_PKG_VERSION"));
|
info!("Version: {}", env!("CARGO_PKG_VERSION"));
|
||||||
|
|
||||||
// Load configuration from file or use defaults
|
|
||||||
let mut config = match Config::from_file("config.yaml") {
|
let mut config = match Config::from_file("config.yaml") {
|
||||||
Ok(cfg) => {
|
Ok(config) => {
|
||||||
info!("Configuration loaded from config.yaml");
|
info!("Configuration loaded from config.yaml");
|
||||||
cfg
|
config
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(error) => {
|
||||||
info!("Failed to load config.yaml: {}, using defaults", e);
|
info!("Failed to load config.yaml: {}, using defaults", error);
|
||||||
Config::default()
|
Config::default()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Apply environment variable overrides (for NixOS module integration)
|
|
||||||
config.apply_env_overrides();
|
config.apply_env_overrides();
|
||||||
|
|
||||||
|
if config.tls.is_some() {
|
||||||
|
warn!("Nightlight TLS configuration is currently ignored; starting plaintext listeners");
|
||||||
|
}
|
||||||
|
|
||||||
|
let http_addr: SocketAddr = config.server.http_addr.parse()?;
|
||||||
|
let grpc_addr: SocketAddr = config.server.grpc_addr.parse()?;
|
||||||
|
let storage = Arc::new(Storage::new(&config.storage.data_dir)?);
|
||||||
|
let query_service = QueryService::from_storage(storage.queryable());
|
||||||
|
let ingestion_service = IngestionService::new(Arc::clone(&storage));
|
||||||
|
let admin_service = AdminServiceImpl::new(
|
||||||
|
Arc::clone(&storage),
|
||||||
|
ingestion_service.metrics(),
|
||||||
|
query_service.metrics(),
|
||||||
|
);
|
||||||
|
let metric_query_service = MetricQueryServiceImpl::new(query_service.clone());
|
||||||
|
|
||||||
info!("Server configuration:");
|
info!("Server configuration:");
|
||||||
info!(" gRPC address: {}", config.server.grpc_addr);
|
info!(" HTTP address: {}", http_addr);
|
||||||
info!(" HTTP address: {}", config.server.http_addr);
|
info!(" gRPC address: {}", grpc_addr);
|
||||||
info!(" Data directory: {}", config.storage.data_dir);
|
info!(" Data directory: {}", config.storage.data_dir);
|
||||||
info!(" Retention: {} days", config.storage.retention_days);
|
info!(" Retention: {} days", config.storage.retention_days);
|
||||||
info!(
|
info!(
|
||||||
" TLS enabled: {}",
|
" Compaction interval: {} seconds",
|
||||||
config.tls.as_ref().map_or("no", |_| "yes")
|
config.storage.compaction_interval_seconds
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO (S5): Initialize storage layer
|
let http_listener = tokio::net::TcpListener::bind(http_addr).await?;
|
||||||
// let storage = storage::Storage::new(&config.storage)?;
|
let http_app = Router::new()
|
||||||
// info!("Storage initialized");
|
.route("/healthz", get(healthz))
|
||||||
|
.merge(ingestion_service.clone().router())
|
||||||
|
.merge(query_service.clone().router());
|
||||||
|
|
||||||
// S5: Load persistent state from disk
|
let (mut health_reporter, health_service) = health_reporter();
|
||||||
let data_path = std::path::PathBuf::from(&config.storage.data_dir)
|
health_reporter
|
||||||
.join("nightlight.db");
|
.set_serving::<MetricQueryServer<MetricQueryServiceImpl>>()
|
||||||
let query_service = query::QueryService::new_with_persistence(&data_path)?;
|
.await;
|
||||||
info!("Query service initialized");
|
health_reporter
|
||||||
|
.set_serving::<AdminServer<AdminServiceImpl>>()
|
||||||
|
.await;
|
||||||
|
|
||||||
// Initialize ingestion service with shared storage
|
let (shutdown_tx, _) = tokio::sync::broadcast::channel::<()>(1);
|
||||||
let shared_storage = query_service.storage();
|
let mut http_shutdown = shutdown_tx.subscribe();
|
||||||
let ingestion_service = ingestion::IngestionService::new(shared_storage);
|
let mut grpc_shutdown = shutdown_tx.subscribe();
|
||||||
info!("Ingestion service initialized (sharing storage with query service)");
|
let maintenance_shutdown = shutdown_tx.subscribe();
|
||||||
|
|
||||||
// Clone for shutdown handler
|
let http_server = async move {
|
||||||
let query_service_for_shutdown = query_service.clone();
|
axum::serve(http_listener, http_app)
|
||||||
let data_path_for_shutdown = data_path.clone();
|
.with_graceful_shutdown(async move {
|
||||||
|
let _ = http_shutdown.recv().await;
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
};
|
||||||
|
|
||||||
// Create router with both ingestion and query endpoints
|
let grpc_server = TonicServer::builder()
|
||||||
let app = ingestion_service.router().merge(query_service.router());
|
.add_service(health_service)
|
||||||
|
.add_service(MetricQueryServer::new(metric_query_service))
|
||||||
|
.add_service(AdminServer::new(admin_service))
|
||||||
|
.serve_with_shutdown(grpc_addr, async move {
|
||||||
|
let _ = grpc_shutdown.recv().await;
|
||||||
|
});
|
||||||
|
|
||||||
// Start HTTP server for both ingestion and query endpoints
|
let maintenance_handle = tokio::spawn(maintenance_loop(
|
||||||
let listener = tokio::net::TcpListener::bind(&config.server.http_addr).await?;
|
Arc::clone(&storage),
|
||||||
info!("HTTP server listening on {}", config.server.http_addr);
|
config.storage.clone(),
|
||||||
|
maintenance_shutdown,
|
||||||
|
));
|
||||||
|
|
||||||
|
info!("HTTP server listening on {}", http_addr);
|
||||||
info!(" - Ingestion: POST /api/v1/write");
|
info!(" - Ingestion: POST /api/v1/write");
|
||||||
info!(" - Query: GET /api/v1/query, /api/v1/query_range");
|
info!(" - Query: GET /api/v1/query, /api/v1/query_range");
|
||||||
info!(" - Metadata: GET /api/v1/series, /api/v1/label/:name/values");
|
info!(" - Metadata: GET /api/v1/series, /api/v1/label/:name/values");
|
||||||
|
info!(" - Health: GET /healthz");
|
||||||
|
info!("gRPC server listening on {}", grpc_addr);
|
||||||
|
info!(" - MetricQuery.InstantQuery / RangeQuery / SeriesQuery / LabelValuesQuery");
|
||||||
|
info!(" - Admin.Health / Stats / BuildInfo");
|
||||||
|
|
||||||
// TODO (S5): Start background tasks
|
let shutdown = async {
|
||||||
// - Compaction
|
tokio::signal::ctrl_c().await.expect("failed to install Ctrl+C handler");
|
||||||
// - Retention enforcement
|
};
|
||||||
// - Metrics export
|
tokio::pin!(shutdown);
|
||||||
|
|
||||||
info!("Nightlight server ready");
|
tokio::select! {
|
||||||
info!("Press Ctrl+C to shutdown");
|
result = http_server => {
|
||||||
|
result?;
|
||||||
|
}
|
||||||
|
result = grpc_server => {
|
||||||
|
result?;
|
||||||
|
}
|
||||||
|
_ = &mut shutdown => {
|
||||||
|
info!("Shutdown signal received");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Serve with graceful shutdown
|
let _ = shutdown_tx.send(());
|
||||||
axum::serve(listener, app)
|
if let Err(error) = maintenance_handle.await {
|
||||||
.with_graceful_shutdown(shutdown_signal(query_service_for_shutdown, data_path_for_shutdown))
|
error!(error = %error, "Nightlight maintenance task failed to join");
|
||||||
.await?;
|
}
|
||||||
|
|
||||||
info!("Nightlight server stopped");
|
info!("Nightlight server stopped");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn shutdown_signal(
|
async fn maintenance_loop(
|
||||||
query_service: query::QueryService,
|
storage: Arc<Storage>,
|
||||||
data_path: std::path::PathBuf,
|
config: StorageConfig,
|
||||||
|
mut shutdown: tokio::sync::broadcast::Receiver<()>,
|
||||||
) {
|
) {
|
||||||
tokio::signal::ctrl_c()
|
let snapshot_interval_secs =
|
||||||
.await
|
config.compaction_interval_seconds.clamp(5, DEFAULT_SNAPSHOT_INTERVAL_SECS);
|
||||||
.expect("Failed to install CTRL+C handler");
|
let mut snapshot_interval = tokio::time::interval(Duration::from_secs(snapshot_interval_secs));
|
||||||
info!("Shutdown signal received, saving data...");
|
snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||||
|
|
||||||
// S5: Save persistent state to disk before shutdown
|
let mut retention_interval = tokio::time::interval(Duration::from_secs(
|
||||||
if let Err(e) = query_service.save_to_disk(&data_path).await {
|
config.compaction_interval_seconds.max(1),
|
||||||
tracing::error!("Failed to save data on shutdown: {}", e);
|
));
|
||||||
} else {
|
retention_interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||||
info!("Data saved successfully");
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = shutdown.recv() => break,
|
||||||
|
_ = snapshot_interval.tick() => {
|
||||||
|
if let Err(error) = storage.flush().await {
|
||||||
|
error!(error = %error, "Nightlight snapshot flush failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = retention_interval.tick() => {
|
||||||
|
if let Err(error) = storage.enforce_retention(config.retention_days).await {
|
||||||
|
error!(error = %error, "Nightlight retention sweep failed");
|
||||||
|
}
|
||||||
|
if let Err(error) = storage.compact().await {
|
||||||
|
error!(error = %error, "Nightlight compaction checkpoint failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Stopping server...");
|
if let Err(error) = storage.flush().await {
|
||||||
|
error!(error = %error, "Nightlight final snapshot flush failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handler for the `/healthz` route; always responds with the body `ok`.
async fn healthz() -> &'static str {
    "ok"
}
|
||||||
|
|
|
||||||
|
|
@ -6,10 +6,11 @@
|
||||||
use axum::{
|
use axum::{
|
||||||
extract::{Path, Query, State},
|
extract::{Path, Query, State},
|
||||||
http::StatusCode,
|
http::StatusCode,
|
||||||
response::{IntoResponse, Json, Response},
|
response::{IntoResponse, Json},
|
||||||
routing::get,
|
routing::get,
|
||||||
Router,
|
Router,
|
||||||
};
|
};
|
||||||
|
use parking_lot::Mutex;
|
||||||
use nightlight_types::{Error, Label, Result, Sample, SeriesId, TimeSeries};
|
use nightlight_types::{Error, Label, Result, Sample, SeriesId, TimeSeries};
|
||||||
use promql_parser::{
|
use promql_parser::{
|
||||||
label::Matchers,
|
label::Matchers,
|
||||||
|
|
@ -18,16 +19,21 @@ use promql_parser::{
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, VecDeque};
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::time::Instant;
|
||||||
use tokio::sync::RwLock;
|
use tokio::sync::RwLock;
|
||||||
use tracing::{debug, error, info};
|
use tracing::{debug, error, info};
|
||||||
|
|
||||||
|
const QUERY_DURATION_HISTORY_LIMIT: usize = 512;
|
||||||
|
|
||||||
/// Query service state
///
/// Cloning is cheap: both fields are `Arc` handles, so clones share the same
/// storage and the same metrics.
#[derive(Clone)]
pub struct QueryService {
    // Reference to queryable storage (shared with ingestion)
    storage: Arc<RwLock<QueryableStorage>>,
    // Query counters and recent durations, exposed through `metrics()`
    metrics: Arc<QueryMetrics>,
}
|
||||||
|
|
||||||
/// In-memory queryable storage (reads from ingestion buffer)
|
/// In-memory queryable storage (reads from ingestion buffer)
|
||||||
|
|
@ -39,6 +45,24 @@ pub struct QueryableStorage {
|
||||||
pub label_index: HashMap<String, HashMap<String, Vec<SeriesId>>>,
|
pub label_index: HashMap<String, HashMap<String, Vec<SeriesId>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Counters plus a bounded window of recent durations describing query activity.
#[derive(Debug)]
pub struct QueryMetrics {
    /// Incremented once by every `begin_query` call.
    queries_total: AtomicU64,
    /// Incremented by `finish_query` when the query is reported as unsuccessful.
    queries_failed: AtomicU64,
    /// Raised by `begin_query` and lowered again by `finish_query`.
    queries_active: AtomicU64,
    /// Elapsed milliseconds of finished queries, oldest first; holds at most
    /// `QUERY_DURATION_HISTORY_LIMIT` entries.
    durations_ms: Mutex<VecDeque<u64>>,
}
|
||||||
|
|
||||||
|
/// Point-in-time copy of the values tracked by `QueryMetrics`.
#[derive(Debug, Clone, Copy, Default)]
pub struct QueryMetricsSnapshot {
    /// Number of queries started so far.
    pub queries_total: u64,
    /// Number of queries that finished unsuccessfully.
    pub queries_failed: u64,
    /// Number of queries started but not yet finished.
    pub queries_active: u64,
    /// 50th percentile of the recorded durations, in milliseconds (0.0 when none are recorded).
    pub query_duration_p50: f64,
    /// 95th percentile of the recorded durations, in milliseconds (0.0 when none are recorded).
    pub query_duration_p95: f64,
    /// 99th percentile of the recorded durations, in milliseconds (0.0 when none are recorded).
    pub query_duration_p99: f64,
}
|
||||||
|
|
||||||
impl QueryService {
|
impl QueryService {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
|
|
@ -46,12 +70,16 @@ impl QueryService {
|
||||||
series: HashMap::new(),
|
series: HashMap::new(),
|
||||||
label_index: HashMap::new(),
|
label_index: HashMap::new(),
|
||||||
})),
|
})),
|
||||||
|
metrics: Arc::new(QueryMetrics::new()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create QueryService from existing shared storage
|
/// Create QueryService from existing shared storage
|
||||||
pub fn from_storage(storage: Arc<RwLock<QueryableStorage>>) -> Self {
|
pub fn from_storage(storage: Arc<RwLock<QueryableStorage>>) -> Self {
|
||||||
Self { storage }
|
Self {
|
||||||
|
storage,
|
||||||
|
metrics: Arc::new(QueryMetrics::new()),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create QueryService and load persistent state from disk if it exists
|
/// Create QueryService and load persistent state from disk if it exists
|
||||||
|
|
@ -61,6 +89,7 @@ impl QueryService {
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
storage: Arc::new(RwLock::new(storage)),
|
storage: Arc::new(RwLock::new(storage)),
|
||||||
|
metrics: Arc::new(QueryMetrics::new()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -82,17 +111,32 @@ impl QueryService {
|
||||||
.with_state(self)
|
.with_state(self)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Shared handle to this service's query metrics.
pub fn metrics(&self) -> Arc<QueryMetrics> {
    self.metrics.clone()
}
|
||||||
|
|
||||||
/// Execute an instant query at a specific timestamp
|
/// Execute an instant query at a specific timestamp
|
||||||
pub async fn execute_instant_query(&self, query: &str, time: i64) -> Result<QueryResult> {
|
pub async fn execute_instant_query(&self, query: &str, time: i64) -> Result<QueryResult> {
|
||||||
debug!("Executing instant query: {} at time {}", query, time);
|
debug!("Executing instant query: {} at time {}", query, time);
|
||||||
|
let started = self.metrics.begin_query();
|
||||||
|
|
||||||
// Parse PromQL expression
|
// Parse PromQL expression
|
||||||
let expr = promql_parser::parser::parse(query)
|
let expr = promql_parser::parser::parse(query)
|
||||||
.map_err(|e| Error::Query(format!("Parse error: {:?}", e)))?;
|
.map_err(|e| Error::Query(format!("Parse error: {:?}", e)));
|
||||||
|
let expr = match expr {
|
||||||
|
Ok(expr) => expr,
|
||||||
|
Err(error) => {
|
||||||
|
self.metrics.finish_query(started, false);
|
||||||
|
return Err(error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// Execute the expression
|
// Execute the expression
|
||||||
let storage = self.storage.read().await;
|
let storage = self.storage.read().await;
|
||||||
let result = self.evaluate_expr(&expr, time, time, 0, &storage).await?;
|
let result = self.evaluate_expr(&expr, time, time, 0, &storage).await;
|
||||||
|
let success = result.is_ok();
|
||||||
|
self.metrics.finish_query(started, success);
|
||||||
|
let result = result?;
|
||||||
|
|
||||||
Ok(QueryResult {
|
Ok(QueryResult {
|
||||||
result_type: "vector".to_string(),
|
result_type: "vector".to_string(),
|
||||||
|
|
@ -123,10 +167,31 @@ impl QueryService {
|
||||||
"Executing range query: {} from {} to {} step {}",
|
"Executing range query: {} from {} to {} step {}",
|
||||||
query, start, end, step
|
query, start, end, step
|
||||||
);
|
);
|
||||||
|
let started = self.metrics.begin_query();
|
||||||
|
|
||||||
|
if step <= 0 {
|
||||||
|
self.metrics.finish_query(started, false);
|
||||||
|
return Err(Error::InvalidTimeRange(
|
||||||
|
"range query step must be greater than zero".to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
if end < start {
|
||||||
|
self.metrics.finish_query(started, false);
|
||||||
|
return Err(Error::InvalidTimeRange(
|
||||||
|
"range query end must be greater than or equal to start".to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
// Parse PromQL expression
|
// Parse PromQL expression
|
||||||
let expr = promql_parser::parser::parse(query)
|
let expr = promql_parser::parser::parse(query)
|
||||||
.map_err(|e| Error::Query(format!("Parse error: {:?}", e)))?;
|
.map_err(|e| Error::Query(format!("Parse error: {:?}", e)));
|
||||||
|
let expr = match expr {
|
||||||
|
Ok(expr) => expr,
|
||||||
|
Err(error) => {
|
||||||
|
self.metrics.finish_query(started, false);
|
||||||
|
return Err(error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let storage = self.storage.read().await;
|
let storage = self.storage.read().await;
|
||||||
let mut results: HashMap<String, RangeResult> = HashMap::new();
|
let mut results: HashMap<String, RangeResult> = HashMap::new();
|
||||||
|
|
@ -136,7 +201,14 @@ impl QueryService {
|
||||||
while current_time <= end {
|
while current_time <= end {
|
||||||
let step_result = self
|
let step_result = self
|
||||||
.evaluate_expr(&expr, current_time, end, step, &storage)
|
.evaluate_expr(&expr, current_time, end, step, &storage)
|
||||||
.await?;
|
.await;
|
||||||
|
let step_result = match step_result {
|
||||||
|
Ok(step_result) => step_result,
|
||||||
|
Err(error) => {
|
||||||
|
self.metrics.finish_query(started, false);
|
||||||
|
return Err(error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
for ts in step_result {
|
for ts in step_result {
|
||||||
// Create a unique key for this series based on labels
|
// Create a unique key for this series based on labels
|
||||||
|
|
@ -169,10 +241,12 @@ impl QueryService {
|
||||||
current_time += step;
|
current_time += step;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(RangeQueryResult {
|
let result = RangeQueryResult {
|
||||||
result_type: "matrix".to_string(),
|
result_type: "matrix".to_string(),
|
||||||
result: results.into_values().collect(),
|
result: results.into_values().collect(),
|
||||||
})
|
};
|
||||||
|
self.metrics.finish_query(started, true);
|
||||||
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Evaluate a PromQL expression (recursive with boxing for async)
|
/// Evaluate a PromQL expression (recursive with boxing for async)
|
||||||
|
|
@ -589,9 +663,108 @@ impl QueryService {
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get storage handle (for ingestion integration)
|
pub async fn series_metadata(
|
||||||
pub fn storage(&self) -> Arc<RwLock<QueryableStorage>> {
|
&self,
|
||||||
self.storage.clone()
|
matchers: &[String],
|
||||||
|
start: Option<i64>,
|
||||||
|
end: Option<i64>,
|
||||||
|
) -> Result<Vec<HashMap<String, String>>> {
|
||||||
|
let started = self.metrics.begin_query();
|
||||||
|
let storage = self.storage.read().await;
|
||||||
|
let series = self.matching_series(&storage, matchers, start, end);
|
||||||
|
let result = Ok(series
|
||||||
|
.into_iter()
|
||||||
|
.map(|ts| {
|
||||||
|
ts.labels
|
||||||
|
.iter()
|
||||||
|
.map(|label| (label.name.clone(), label.value.clone()))
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.collect());
|
||||||
|
self.metrics.finish_query(started, true);
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn label_values_for_matchers(
|
||||||
|
&self,
|
||||||
|
label_name: &str,
|
||||||
|
matchers: &[String],
|
||||||
|
start: Option<i64>,
|
||||||
|
end: Option<i64>,
|
||||||
|
) -> Result<Vec<String>> {
|
||||||
|
let started = self.metrics.begin_query();
|
||||||
|
let storage = self.storage.read().await;
|
||||||
|
let mut values: Vec<String> = self
|
||||||
|
.matching_series(&storage, matchers, start, end)
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|series| series.get_label(label_name).map(str::to_string))
|
||||||
|
.collect();
|
||||||
|
values.sort();
|
||||||
|
values.dedup();
|
||||||
|
self.metrics.finish_query(started, true);
|
||||||
|
Ok(values)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn matching_series(
|
||||||
|
&self,
|
||||||
|
storage: &QueryableStorage,
|
||||||
|
matchers: &[String],
|
||||||
|
start: Option<i64>,
|
||||||
|
end: Option<i64>,
|
||||||
|
) -> Vec<TimeSeries> {
|
||||||
|
let parsed_matchers = parse_label_matchers(matchers);
|
||||||
|
storage
|
||||||
|
.series
|
||||||
|
.values()
|
||||||
|
.filter(|series| series_matches(series, &parsed_matchers))
|
||||||
|
.filter(|series| series_in_time_range(series, start, end))
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl QueryMetrics {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
queries_total: AtomicU64::new(0),
|
||||||
|
queries_failed: AtomicU64::new(0),
|
||||||
|
queries_active: AtomicU64::new(0),
|
||||||
|
durations_ms: Mutex::new(VecDeque::with_capacity(QUERY_DURATION_HISTORY_LIMIT)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn begin_query(&self) -> Instant {
|
||||||
|
self.queries_total.fetch_add(1, Ordering::Relaxed);
|
||||||
|
self.queries_active.fetch_add(1, Ordering::Relaxed);
|
||||||
|
Instant::now()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn finish_query(&self, started: Instant, success: bool) {
|
||||||
|
if !success {
|
||||||
|
self.queries_failed.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
self.queries_active.fetch_sub(1, Ordering::Relaxed);
|
||||||
|
|
||||||
|
let elapsed_ms = started.elapsed().as_millis() as u64;
|
||||||
|
let mut durations = self.durations_ms.lock();
|
||||||
|
if durations.len() >= QUERY_DURATION_HISTORY_LIMIT {
|
||||||
|
durations.pop_front();
|
||||||
|
}
|
||||||
|
durations.push_back(elapsed_ms);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn snapshot(&self) -> QueryMetricsSnapshot {
|
||||||
|
let mut sorted_durations: Vec<u64> = self.durations_ms.lock().iter().copied().collect();
|
||||||
|
sorted_durations.sort_unstable();
|
||||||
|
|
||||||
|
QueryMetricsSnapshot {
|
||||||
|
queries_total: self.queries_total.load(Ordering::Relaxed),
|
||||||
|
queries_failed: self.queries_failed.load(Ordering::Relaxed),
|
||||||
|
queries_active: self.queries_active.load(Ordering::Relaxed),
|
||||||
|
query_duration_p50: percentile(&sorted_durations, 0.50),
|
||||||
|
query_duration_p95: percentile(&sorted_durations, 0.95),
|
||||||
|
query_duration_p99: percentile(&sorted_durations, 0.99),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -600,12 +773,15 @@ impl QueryableStorage {
|
||||||
pub fn upsert_series(&mut self, series: TimeSeries) {
|
pub fn upsert_series(&mut self, series: TimeSeries) {
|
||||||
// Update label index
|
// Update label index
|
||||||
for label in &series.labels {
|
for label in &series.labels {
|
||||||
self.label_index
|
let series_ids = self
|
||||||
|
.label_index
|
||||||
.entry(label.name.clone())
|
.entry(label.name.clone())
|
||||||
.or_default()
|
.or_default()
|
||||||
.entry(label.value.clone())
|
.entry(label.value.clone())
|
||||||
.or_default()
|
.or_default();
|
||||||
.push(series.id);
|
if !series_ids.contains(&series.id) {
|
||||||
|
series_ids.push(series.id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Upsert series
|
// Upsert series
|
||||||
|
|
@ -624,11 +800,91 @@ impl QueryableStorage {
|
||||||
|
|
||||||
/// Get label values for a specific label name
|
/// Get label values for a specific label name
|
||||||
pub fn label_values(&self, label_name: &str) -> Vec<String> {
|
pub fn label_values(&self, label_name: &str) -> Vec<String> {
|
||||||
self.label_index
|
let mut values: Vec<String> = self
|
||||||
|
.label_index
|
||||||
.get(label_name)
|
.get(label_name)
|
||||||
.map(|values| values.keys().cloned().collect())
|
.map(|values| values.keys().cloned().collect())
|
||||||
.unwrap_or_default()
|
.unwrap_or_default();
|
||||||
|
values.sort();
|
||||||
|
values
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn rebuild_index(&mut self) {
|
||||||
|
self.label_index.clear();
|
||||||
|
let series: Vec<TimeSeries> = self.series.values().cloned().collect();
|
||||||
|
for series in series {
|
||||||
|
for label in &series.labels {
|
||||||
|
self.label_index
|
||||||
|
.entry(label.name.clone())
|
||||||
|
.or_default()
|
||||||
|
.entry(label.value.clone())
|
||||||
|
.or_default()
|
||||||
|
.push(series.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn prune_before(&mut self, cutoff: i64) -> usize {
|
||||||
|
let mut removed_samples = 0usize;
|
||||||
|
self.series.retain(|_, series| {
|
||||||
|
let before = series.samples.len();
|
||||||
|
series.samples.retain(|sample| sample.timestamp >= cutoff);
|
||||||
|
removed_samples += before.saturating_sub(series.samples.len());
|
||||||
|
!series.samples.is_empty()
|
||||||
|
});
|
||||||
|
self.rebuild_index();
|
||||||
|
removed_samples
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Nearest-rank percentile of an ascending-sorted slice.
///
/// `quantile` is a fraction (0.50 for the median). The rank is
/// `round((len - 1) * quantile)`, capped at the last index. An empty slice
/// yields `0.0`.
fn percentile(values: &[u64], quantile: f64) -> f64 {
    match values.len() {
        0 => 0.0,
        len => {
            let last = len - 1;
            let rank = (last as f64 * quantile).round() as usize;
            values[rank.min(last)] as f64
        }
    }
}
|
||||||
|
|
||||||
|
/// Split `name=value` matcher strings into `(name, value)` pairs.
///
/// Each string is split at its first `=`. Both sides are trimmed of
/// whitespace, and double quotes surrounding the value are removed. Strings
/// without an `=` are ignored.
fn parse_label_matchers(matchers: &[String]) -> Vec<(String, String)> {
    let mut parsed = Vec::new();
    for matcher in matchers {
        if let Some((raw_name, raw_value)) = matcher.split_once('=') {
            let name = raw_name.trim().to_string();
            let value = raw_value.trim().trim_matches('"').to_string();
            parsed.push((name, value));
        }
    }
    parsed
}
|
||||||
|
|
||||||
|
fn series_matches(series: &TimeSeries, matchers: &[(String, String)]) -> bool {
|
||||||
|
matchers.iter().all(|(key, value)| {
|
||||||
|
series
|
||||||
|
.labels
|
||||||
|
.iter()
|
||||||
|
.any(|label| &label.name == key && &label.value == value)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn series_in_time_range(series: &TimeSeries, start: Option<i64>, end: Option<i64>) -> bool {
|
||||||
|
let Some((series_start, series_end)) = series.time_range() else {
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(start) = start {
|
||||||
|
if series_end < start {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(end) = end {
|
||||||
|
if series_start > end {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
/// HTTP handler for instant queries
|
/// HTTP handler for instant queries
|
||||||
|
|
@ -696,46 +952,57 @@ async fn handle_range_query(
|
||||||
async fn handle_label_values(
|
async fn handle_label_values(
|
||||||
State(service): State<QueryService>,
|
State(service): State<QueryService>,
|
||||||
Path(label_name): Path<String>,
|
Path(label_name): Path<String>,
|
||||||
|
Query(params): Query<SeriesQueryParams>,
|
||||||
) -> impl IntoResponse {
|
) -> impl IntoResponse {
|
||||||
let storage = service.storage.read().await;
|
match service
|
||||||
let values = storage.label_values(&label_name);
|
.label_values_for_matchers(&label_name, ¶ms.matchers, params.start, params.end)
|
||||||
|
.await
|
||||||
(
|
{
|
||||||
StatusCode::OK,
|
Ok(values) => (
|
||||||
Json(LabelValuesResponse {
|
StatusCode::OK,
|
||||||
status: "success".to_string(),
|
Json(LabelValuesResponse {
|
||||||
data: values,
|
status: "success".to_string(),
|
||||||
}),
|
data: values,
|
||||||
)
|
}),
|
||||||
|
)
|
||||||
|
.into_response(),
|
||||||
|
Err(error) => (
|
||||||
|
StatusCode::BAD_REQUEST,
|
||||||
|
Json(serde_json::json!({
|
||||||
|
"status": "error",
|
||||||
|
"error": error.to_string(),
|
||||||
|
})),
|
||||||
|
)
|
||||||
|
.into_response(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// HTTP handler for series metadata
|
/// HTTP handler for series metadata
|
||||||
async fn handle_series(
|
async fn handle_series(
|
||||||
State(service): State<QueryService>,
|
State(service): State<QueryService>,
|
||||||
Query(_params): Query<SeriesQueryParams>,
|
Query(params): Query<SeriesQueryParams>,
|
||||||
) -> impl IntoResponse {
|
) -> impl IntoResponse {
|
||||||
let storage = service.storage.read().await;
|
match service
|
||||||
|
.series_metadata(¶ms.matchers, params.start, params.end)
|
||||||
// Return all series metadata (limited implementation)
|
.await
|
||||||
let series: Vec<HashMap<String, String>> = storage
|
{
|
||||||
.series
|
Ok(series) => (
|
||||||
.values()
|
StatusCode::OK,
|
||||||
.take(1000) // Limit to prevent OOM
|
Json(SeriesResponse {
|
||||||
.map(|ts| {
|
status: "success".to_string(),
|
||||||
ts.labels
|
data: series,
|
||||||
.iter()
|
}),
|
||||||
.map(|l| (l.name.clone(), l.value.clone()))
|
)
|
||||||
.collect()
|
.into_response(),
|
||||||
})
|
Err(error) => (
|
||||||
.collect();
|
StatusCode::BAD_REQUEST,
|
||||||
|
Json(serde_json::json!({
|
||||||
(
|
"status": "error",
|
||||||
StatusCode::OK,
|
"error": error.to_string(),
|
||||||
Json(SeriesResponse {
|
})),
|
||||||
status: "success".to_string(),
|
)
|
||||||
data: series,
|
.into_response(),
|
||||||
}),
|
}
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Request/Response Types
|
// Request/Response Types
|
||||||
|
|
@ -760,6 +1027,10 @@ struct SeriesQueryParams {
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
#[serde(rename = "match[]")]
|
#[serde(rename = "match[]")]
|
||||||
matchers: Vec<String>,
|
matchers: Vec<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
start: Option<i64>,
|
||||||
|
#[serde(default)]
|
||||||
|
end: Option<i64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
|
|
@ -770,30 +1041,30 @@ struct QueryResponse {
|
||||||
error_type: Option<String>,
|
error_type: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
pub(crate) struct QueryResult {
|
pub struct QueryResult {
|
||||||
#[serde(rename = "resultType")]
|
#[serde(rename = "resultType")]
|
||||||
result_type: String,
|
pub result_type: String,
|
||||||
result: Vec<InstantQueryResult>,
|
pub result: Vec<InstantQueryResult>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
struct InstantQueryResult {
|
pub struct InstantQueryResult {
|
||||||
metric: HashMap<String, String>,
|
pub metric: HashMap<String, String>,
|
||||||
value: Option<(i64, f64)>,
|
pub value: Option<(i64, f64)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
pub(crate) struct RangeQueryResult {
|
pub struct RangeQueryResult {
|
||||||
#[serde(rename = "resultType")]
|
#[serde(rename = "resultType")]
|
||||||
result_type: String,
|
pub result_type: String,
|
||||||
result: Vec<RangeResult>,
|
pub result: Vec<RangeResult>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
struct RangeResult {
|
pub struct RangeResult {
|
||||||
metric: HashMap<String, String>,
|
pub metric: HashMap<String, String>,
|
||||||
values: Vec<(i64, f64)>,
|
pub values: Vec<(i64, f64)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
|
|
@ -808,29 +1079,6 @@ struct SeriesResponse {
|
||||||
data: Vec<HashMap<String, String>>,
|
data: Vec<HashMap<String, String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
enum QueryError {
|
|
||||||
ParseFailed(String),
|
|
||||||
ExecutionFailed(String),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IntoResponse for QueryError {
|
|
||||||
fn into_response(self) -> Response {
|
|
||||||
let (status, message) = match self {
|
|
||||||
QueryError::ParseFailed(msg) => (StatusCode::BAD_REQUEST, msg),
|
|
||||||
QueryError::ExecutionFailed(msg) => (StatusCode::INTERNAL_SERVER_ERROR, msg),
|
|
||||||
};
|
|
||||||
|
|
||||||
let body = serde_json::json!({
|
|
||||||
"status": "error",
|
|
||||||
"errorType": "execution",
|
|
||||||
"error": message
|
|
||||||
});
|
|
||||||
|
|
||||||
(status, Json(body)).into_response()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for QueryService {
|
impl Default for QueryService {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self::new()
|
Self::new()
|
||||||
|
|
@ -892,8 +1140,9 @@ impl QueryableStorage {
|
||||||
.map_err(|e| Error::Storage(format!("Failed to read file: {}", e)))?;
|
.map_err(|e| Error::Storage(format!("Failed to read file: {}", e)))?;
|
||||||
|
|
||||||
// Deserialize from bincode
|
// Deserialize from bincode
|
||||||
let storage = bincode::deserialize(&buffer)
|
let mut storage: Self = bincode::deserialize(&buffer)
|
||||||
.map_err(|e| Error::Storage(format!("Deserialization failed: {}", e)))?;
|
.map_err(|e| Error::Storage(format!("Deserialization failed: {}", e)))?;
|
||||||
|
storage.rebuild_index();
|
||||||
|
|
||||||
Ok(storage)
|
Ok(storage)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,8 @@
|
||||||
//! Time-series storage layer
|
//! Time-series storage layer backed by an in-memory head, a write-ahead log,
|
||||||
//!
|
//! and periodic snapshots.
|
||||||
//! シンプルなWAL付きストレージ実装(S5足場)。
|
|
||||||
//! - in-memory head: `QueryableStorage` を共有
|
|
||||||
//! - WAL: bincode長さプレフィックスでappend / replay
|
|
||||||
//! - スナップショット: `nightlight.db` にbincode保存
|
|
||||||
//! - Retention/compactionは将来タスク(現状no-op)
|
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use chrono::Utc;
|
||||||
use nightlight_types::{SeriesId, TimeSeries};
|
use nightlight_types::{SeriesId, TimeSeries};
|
||||||
use std::{
|
use std::{
|
||||||
fs::{File, OpenOptions},
|
fs::{File, OpenOptions},
|
||||||
|
|
@ -15,16 +11,15 @@ use std::{
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
};
|
};
|
||||||
use tokio::sync::{Mutex, RwLock};
|
use tokio::sync::{Mutex, RwLock};
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::query::QueryableStorage;
|
use crate::query::QueryableStorage;
|
||||||
|
|
||||||
/// WALレコード
|
|
||||||
#[derive(serde::Serialize, serde::Deserialize)]
|
#[derive(serde::Serialize, serde::Deserialize)]
|
||||||
struct WalRecord {
|
struct WalRecord {
|
||||||
series: TimeSeries,
|
series: TimeSeries,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// ストレージ本体
|
|
||||||
pub struct Storage {
|
pub struct Storage {
|
||||||
head: Arc<RwLock<QueryableStorage>>,
|
head: Arc<RwLock<QueryableStorage>>,
|
||||||
wal_path: PathBuf,
|
wal_path: PathBuf,
|
||||||
|
|
@ -33,7 +28,6 @@ pub struct Storage {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Storage {
|
impl Storage {
|
||||||
/// data_dirを初期化し、snapshot + WALをリプレイする
|
|
||||||
pub fn new(data_dir: &str) -> Result<Self> {
|
pub fn new(data_dir: &str) -> Result<Self> {
|
||||||
let data_dir = PathBuf::from(data_dir);
|
let data_dir = PathBuf::from(data_dir);
|
||||||
std::fs::create_dir_all(&data_dir)?;
|
std::fs::create_dir_all(&data_dir)?;
|
||||||
|
|
@ -41,12 +35,11 @@ impl Storage {
|
||||||
let snapshot_path = data_dir.join("nightlight.db");
|
let snapshot_path = data_dir.join("nightlight.db");
|
||||||
let wal_path = data_dir.join("wal.log");
|
let wal_path = data_dir.join("wal.log");
|
||||||
|
|
||||||
// snapshotロード
|
|
||||||
let mut head = QueryableStorage::load_from_file(&snapshot_path)?;
|
let mut head = QueryableStorage::load_from_file(&snapshot_path)?;
|
||||||
// WALリプレイ
|
|
||||||
if wal_path.exists() {
|
if wal_path.exists() {
|
||||||
replay_wal(&wal_path, &mut head)?;
|
replay_wal(&wal_path, &mut head)?;
|
||||||
}
|
}
|
||||||
|
head.rebuild_index();
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
head: Arc::new(RwLock::new(head)),
|
head: Arc::new(RwLock::new(head)),
|
||||||
|
|
@ -56,13 +49,15 @@ impl Storage {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 共有QueryableStorageを取得
|
|
||||||
pub fn queryable(&self) -> Arc<RwLock<QueryableStorage>> {
|
pub fn queryable(&self) -> Arc<RwLock<QueryableStorage>> {
|
||||||
self.head.clone()
|
Arc::clone(&self.head)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// WALへappendし、headへ反映
|
|
||||||
pub async fn append(&self, series_list: Vec<TimeSeries>) -> Result<()> {
|
pub async fn append(&self, series_list: Vec<TimeSeries>) -> Result<()> {
|
||||||
|
if series_list.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
let _guard = self.wal_lock.lock().await;
|
let _guard = self.wal_lock.lock().await;
|
||||||
let mut wal_file = OpenOptions::new()
|
let mut wal_file = OpenOptions::new()
|
||||||
.create(true)
|
.create(true)
|
||||||
|
|
@ -78,14 +73,12 @@ impl Storage {
|
||||||
let len = encoded.len() as u32;
|
let len = encoded.len() as u32;
|
||||||
wal_file.write_all(&len.to_le_bytes())?;
|
wal_file.write_all(&len.to_le_bytes())?;
|
||||||
wal_file.write_all(&encoded)?;
|
wal_file.write_all(&encoded)?;
|
||||||
|
|
||||||
head.upsert_series(series);
|
head.upsert_series(series);
|
||||||
}
|
}
|
||||||
wal_file.flush()?;
|
wal_file.flush()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 指定IDのシリーズを時間範囲で返す
|
|
||||||
pub async fn query_series(
|
pub async fn query_series(
|
||||||
&self,
|
&self,
|
||||||
series_id: SeriesId,
|
series_id: SeriesId,
|
||||||
|
|
@ -93,110 +86,142 @@ impl Storage {
|
||||||
end: i64,
|
end: i64,
|
||||||
) -> Result<Option<TimeSeries>> {
|
) -> Result<Option<TimeSeries>> {
|
||||||
let head = self.head.read().await;
|
let head = self.head.read().await;
|
||||||
if let Some(series) = head.series.get(&series_id) {
|
Ok(head
|
||||||
let mut filtered = series.clone();
|
.series
|
||||||
filtered
|
.get(&series_id)
|
||||||
.samples
|
.map(|series| series.filter_by_time(start, end)))
|
||||||
.retain(|s| s.timestamp >= start && s.timestamp <= end);
|
|
||||||
return Ok(Some(filtered));
|
|
||||||
}
|
|
||||||
Ok(None)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 簡易ラベル一致検索
|
|
||||||
pub async fn find_series(&self, matchers: Vec<String>) -> Result<Vec<SeriesId>> {
|
pub async fn find_series(&self, matchers: Vec<String>) -> Result<Vec<SeriesId>> {
|
||||||
let parsed: Vec<(String, String)> = matchers
|
let parsed: Vec<(String, String)> = matchers
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|m| m.split_once('='))
|
.filter_map(|matcher| matcher.split_once('='))
|
||||||
.map(|(k, v)| (k.to_string(), v.to_string()))
|
.map(|(key, value)| {
|
||||||
|
(
|
||||||
|
key.trim().to_string(),
|
||||||
|
value.trim().trim_matches('"').to_string(),
|
||||||
|
)
|
||||||
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let head = self.head.read().await;
|
let head = self.head.read().await;
|
||||||
let mut result = Vec::new();
|
let mut result = Vec::new();
|
||||||
'outer: for (series_id, ts) in &head.series {
|
'outer: for (series_id, series) in &head.series {
|
||||||
for (k, v) in &parsed {
|
for (key, value) in &parsed {
|
||||||
if !ts.labels.iter().any(|l| &l.name == k && &l.value == v) {
|
if !series
|
||||||
|
.labels
|
||||||
|
.iter()
|
||||||
|
.any(|label| &label.name == key && &label.value == value)
|
||||||
|
{
|
||||||
continue 'outer;
|
continue 'outer;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result.push(*series_id);
|
result.push(*series_id);
|
||||||
}
|
}
|
||||||
|
result.sort_unstable();
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// スナップショット保存 + WAL truncate
|
|
||||||
pub async fn flush(&self) -> Result<()> {
|
pub async fn flush(&self) -> Result<()> {
|
||||||
let head = self.head.read().await;
|
|
||||||
head.save_to_file(&self.snapshot_path)?;
|
|
||||||
drop(head);
|
|
||||||
|
|
||||||
let _guard = self.wal_lock.lock().await;
|
let _guard = self.wal_lock.lock().await;
|
||||||
File::create(&self.wal_path)?; // truncate
|
let snapshot = {
|
||||||
|
let head = self.head.read().await;
|
||||||
|
head.clone()
|
||||||
|
};
|
||||||
|
snapshot.save_to_file(&self.snapshot_path)?;
|
||||||
|
File::create(&self.wal_path)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Retentionは将来実装(no-op)
|
pub async fn enforce_retention(&self, retention_days: u32) -> Result<()> {
|
||||||
pub async fn enforce_retention(&self, _retention_days: u32) -> Result<()> {
|
if retention_days == 0 {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let retention_ms = i64::from(retention_days) * 24 * 60 * 60 * 1000;
|
||||||
|
let cutoff = Utc::now().timestamp_millis() - retention_ms;
|
||||||
|
let removed_samples = {
|
||||||
|
let mut head = self.head.write().await;
|
||||||
|
head.prune_before(cutoff)
|
||||||
|
};
|
||||||
|
|
||||||
|
if removed_samples > 0 {
|
||||||
|
debug!(removed_samples, cutoff, "pruned expired Nightlight samples");
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compactionは将来実装(no-op)
|
|
||||||
pub async fn compact(&self) -> Result<()> {
|
pub async fn compact(&self) -> Result<()> {
|
||||||
Ok(())
|
self.flush().await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 現在の統計
|
|
||||||
pub async fn stats(&self) -> Result<StorageStats> {
|
pub async fn stats(&self) -> Result<StorageStats> {
|
||||||
let head = self.head.read().await;
|
let head = self.head.read().await;
|
||||||
let active_series = head.series.len() as u64;
|
let total_samples: u64 = head
|
||||||
let total_samples = head
|
|
||||||
.series
|
.series
|
||||||
.values()
|
.values()
|
||||||
.map(|s| s.samples.len() as u64)
|
.map(|series| series.samples.len() as u64)
|
||||||
.sum();
|
.sum();
|
||||||
|
let oldest_sample_time = head
|
||||||
|
.series
|
||||||
|
.values()
|
||||||
|
.filter_map(|series| series.oldest_sample().map(|sample| sample.timestamp))
|
||||||
|
.min()
|
||||||
|
.unwrap_or(0);
|
||||||
|
let newest_sample_time = head
|
||||||
|
.series
|
||||||
|
.values()
|
||||||
|
.filter_map(|series| series.latest_sample().map(|sample| sample.timestamp))
|
||||||
|
.max()
|
||||||
|
.unwrap_or(0);
|
||||||
let wal_size = std::fs::metadata(&self.wal_path)
|
let wal_size = std::fs::metadata(&self.wal_path)
|
||||||
.map(|m| m.len())
|
.map(|metadata| metadata.len())
|
||||||
.unwrap_or(0);
|
.unwrap_or(0);
|
||||||
let snapshot_size = std::fs::metadata(&self.snapshot_path)
|
let snapshot_size = std::fs::metadata(&self.snapshot_path)
|
||||||
.map(|m| m.len())
|
.map(|metadata| metadata.len())
|
||||||
.unwrap_or(0);
|
.unwrap_or(0);
|
||||||
|
|
||||||
Ok(StorageStats {
|
Ok(StorageStats {
|
||||||
active_series,
|
active_series: head.series.len() as u64,
|
||||||
total_samples,
|
total_samples,
|
||||||
blocks_count: 1,
|
blocks_count: u64::from(snapshot_size > 0),
|
||||||
|
head_samples: total_samples,
|
||||||
disk_bytes_used: wal_size + snapshot_size,
|
disk_bytes_used: wal_size + snapshot_size,
|
||||||
|
oldest_sample_time,
|
||||||
|
newest_sample_time,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// WALリプレイ
|
|
||||||
fn replay_wal(path: &Path, storage: &mut QueryableStorage) -> Result<()> {
|
fn replay_wal(path: &Path, storage: &mut QueryableStorage) -> Result<()> {
|
||||||
let mut file = File::open(path)?;
|
let mut file = File::open(path)?;
|
||||||
let mut len_buf = [0u8; 4];
|
let mut len_buf = [0u8; 4];
|
||||||
loop {
|
loop {
|
||||||
if let Err(e) = file.read_exact(&mut len_buf) {
|
if let Err(error) = file.read_exact(&mut len_buf) {
|
||||||
if e.kind() == std::io::ErrorKind::UnexpectedEof {
|
if error.kind() == std::io::ErrorKind::UnexpectedEof {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return Err(e.into());
|
return Err(error.into());
|
||||||
}
|
}
|
||||||
let len = u32::from_le_bytes(len_buf) as usize;
|
let len = u32::from_le_bytes(len_buf) as usize;
|
||||||
let mut buf = vec![0u8; len];
|
let mut buffer = vec![0u8; len];
|
||||||
file.read_exact(&mut buf)?;
|
file.read_exact(&mut buffer)?;
|
||||||
let record: WalRecord = bincode::deserialize(&buf)?;
|
let record: WalRecord = bincode::deserialize(&buffer)?;
|
||||||
storage.upsert_series(record.series);
|
storage.upsert_series(record.series);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Storage statistics
|
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
|
||||||
pub struct StorageStats {
|
pub struct StorageStats {
|
||||||
pub active_series: u64,
|
pub active_series: u64,
|
||||||
pub total_samples: u64,
|
pub total_samples: u64,
|
||||||
pub blocks_count: u64,
|
pub blocks_count: u64,
|
||||||
|
pub head_samples: u64,
|
||||||
pub disk_bytes_used: u64,
|
pub disk_bytes_used: u64,
|
||||||
|
pub oldest_sample_time: i64,
|
||||||
|
pub newest_sample_time: i64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
@ -247,6 +272,76 @@ mod tests {
|
||||||
assert_eq!(res.samples[1].value, 2.0);
|
assert_eq!(res.samples[1].value, 2.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_retention_prunes_old_samples_and_series() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let storage = Storage::new(dir.path().to_str().unwrap()).unwrap();
|
||||||
|
let now = Utc::now().timestamp_millis();
|
||||||
|
|
||||||
|
storage
|
||||||
|
.append(vec![
|
||||||
|
TimeSeries {
|
||||||
|
id: SeriesId::new(1),
|
||||||
|
labels: vec![Label::new("__name__", "retained_metric")],
|
||||||
|
samples: vec![
|
||||||
|
nightlight_types::Sample::new(now - (2 * 24 * 60 * 60 * 1000), 1.0),
|
||||||
|
nightlight_types::Sample::new(now, 2.0),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
TimeSeries {
|
||||||
|
id: SeriesId::new(2),
|
||||||
|
labels: vec![Label::new("__name__", "expired_metric")],
|
||||||
|
samples: vec![nightlight_types::Sample::new(
|
||||||
|
now - (3 * 24 * 60 * 60 * 1000),
|
||||||
|
3.0,
|
||||||
|
)],
|
||||||
|
},
|
||||||
|
])
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
storage.enforce_retention(1).await.unwrap();
|
||||||
|
|
||||||
|
let retained = storage
|
||||||
|
.query_series(SeriesId::new(1), 0, now + 1)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(retained.samples.len(), 1);
|
||||||
|
assert_eq!(retained.samples[0].value, 2.0);
|
||||||
|
|
||||||
|
let expired = storage
|
||||||
|
.query_series(SeriesId::new(2), 0, now + 1)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert!(expired.is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_stats_report_sample_bounds() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let storage = Storage::new(dir.path().to_str().unwrap()).unwrap();
|
||||||
|
|
||||||
|
storage
|
||||||
|
.append(vec![TimeSeries {
|
||||||
|
id: SeriesId::new(99),
|
||||||
|
labels: vec![Label::new("__name__", "stats_metric")],
|
||||||
|
samples: vec![
|
||||||
|
nightlight_types::Sample::new(1000, 1.0),
|
||||||
|
nightlight_types::Sample::new(2000, 2.0),
|
||||||
|
],
|
||||||
|
}])
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let stats = storage.stats().await.unwrap();
|
||||||
|
assert_eq!(stats.active_series, 1);
|
||||||
|
assert_eq!(stats.total_samples, 2);
|
||||||
|
assert_eq!(stats.head_samples, 2);
|
||||||
|
assert_eq!(stats.oldest_sample_time, 1000);
|
||||||
|
assert_eq!(stats.newest_sample_time, 2000);
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_find_series() {
|
async fn test_find_series() {
|
||||||
let dir = tempfile::tempdir().unwrap();
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
|
|
||||||
|
|
@ -75,6 +75,9 @@ CREDITSERVICE_PROTO_DIR="${REPO_ROOT}/creditservice/proto"
|
||||||
CREDITSERVICE_PROTO="${CREDITSERVICE_PROTO_DIR}/creditservice.proto"
|
CREDITSERVICE_PROTO="${CREDITSERVICE_PROTO_DIR}/creditservice.proto"
|
||||||
LIGHTNINGSTOR_PROTO_DIR="${REPO_ROOT}/lightningstor/crates/lightningstor-api/proto"
|
LIGHTNINGSTOR_PROTO_DIR="${REPO_ROOT}/lightningstor/crates/lightningstor-api/proto"
|
||||||
LIGHTNINGSTOR_PROTO="${LIGHTNINGSTOR_PROTO_DIR}/lightningstor.proto"
|
LIGHTNINGSTOR_PROTO="${LIGHTNINGSTOR_PROTO_DIR}/lightningstor.proto"
|
||||||
|
NIGHTLIGHT_PROTO_DIR="${REPO_ROOT}/nightlight/crates/nightlight-api/proto"
|
||||||
|
NIGHTLIGHT_QUERY_PROTO="${NIGHTLIGHT_PROTO_DIR}/query.proto"
|
||||||
|
NIGHTLIGHT_ADMIN_PROTO="${NIGHTLIGHT_PROTO_DIR}/admin.proto"
|
||||||
PLASMAVMC_PROTO_DIR="${REPO_ROOT}/plasmavmc/proto"
|
PLASMAVMC_PROTO_DIR="${REPO_ROOT}/plasmavmc/proto"
|
||||||
PLASMAVMC_PROTO="${PLASMAVMC_PROTO_DIR}/plasmavmc.proto"
|
PLASMAVMC_PROTO="${PLASMAVMC_PROTO_DIR}/plasmavmc.proto"
|
||||||
FLAREDB_PROTO_DIR="${REPO_ROOT}/flaredb/crates/flaredb-proto/src"
|
FLAREDB_PROTO_DIR="${REPO_ROOT}/flaredb/crates/flaredb-proto/src"
|
||||||
|
|
@ -4676,10 +4679,24 @@ validate_nightlight_flow_with_base() {
|
||||||
--label source=smoke \
|
--label source=smoke \
|
||||||
--label cluster=photoncloud
|
--label cluster=photoncloud
|
||||||
|
|
||||||
|
wait_for_nightlight_query_result "${base_url}" "${flow_name}" "${metric_name}" "${metric_value}" "source=\"smoke\""
|
||||||
|
|
||||||
|
curl -fsS "${base_url}/label/__name__/values" \
|
||||||
|
| jq -e --arg name "${metric_name}" '.status == "success" and (.data | index($name)) != null' >/dev/null
|
||||||
|
curl -fsS "${base_url}/series" \
|
||||||
|
| jq -e --arg name "${metric_name}" '.status == "success" and (.data | any(.__name__ == $name))' >/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
wait_for_nightlight_query_result() {
|
||||||
|
local base_url="$1"
|
||||||
|
local flow_name="$2"
|
||||||
|
local metric_name="$3"
|
||||||
|
local metric_value="$4"
|
||||||
|
local selector_suffix="${5:-}"
|
||||||
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
||||||
while true; do
|
while true; do
|
||||||
if curl -fsS --get "${base_url}/query" \
|
if curl -fsS --get "${base_url}/query" \
|
||||||
--data-urlencode "query=${metric_name}{source=\"smoke\"}" \
|
--data-urlencode "query=${metric_name}{${selector_suffix}}" \
|
||||||
| jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
|
| jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
|
||||||
.status == "success"
|
.status == "success"
|
||||||
and (.data.result | length) >= 1
|
and (.data.result | length) >= 1
|
||||||
|
|
@ -4692,15 +4709,11 @@ validate_nightlight_flow_with_base() {
|
||||||
fi
|
fi
|
||||||
sleep 2
|
sleep 2
|
||||||
done
|
done
|
||||||
|
|
||||||
curl -fsS "${base_url}/label/__name__/values" \
|
|
||||||
| jq -e --arg name "${metric_name}" '.status == "success" and (.data | index($name)) != null' >/dev/null
|
|
||||||
curl -fsS "${base_url}/series" \
|
|
||||||
| jq -e --arg name "${metric_name}" '.status == "success" and (.data | any(.__name__ == $name))' >/dev/null
|
|
||||||
}
|
}
|
||||||
|
|
||||||
validate_nightlight_flow() {
|
validate_nightlight_flow() {
|
||||||
validate_nightlight_flow_with_base "http://127.0.0.1:9090/api/v1" "NightLight"
|
validate_nightlight_flow_with_base "http://127.0.0.1:9090/api/v1" "NightLight"
|
||||||
|
validate_nightlight_grpc_and_persistence
|
||||||
}
|
}
|
||||||
|
|
||||||
validate_apigateway_nightlight_flow() {
|
validate_apigateway_nightlight_flow() {
|
||||||
|
|
@ -4709,6 +4722,85 @@ validate_apigateway_nightlight_flow() {
|
||||||
validate_nightlight_flow_with_base "http://127.0.0.1:8080/api/v1/metrics" "API Gateway -> NightLight"
|
validate_nightlight_flow_with_base "http://127.0.0.1:8080/api/v1/metrics" "API Gateway -> NightLight"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate_nightlight_grpc_and_persistence() {
|
||||||
|
log "Validating NightLight gRPC query/admin APIs and restart persistence"
|
||||||
|
|
||||||
|
local base_url="http://127.0.0.1:9090/api/v1"
|
||||||
|
local grpc_tunnel=""
|
||||||
|
local metric_name="nightlight_persist_metric_$(date +%s)"
|
||||||
|
local metric_value
|
||||||
|
metric_value="$(awk 'BEGIN{srand(); printf "%.3f\n", (rand()*100)+1}')"
|
||||||
|
|
||||||
|
grpc_tunnel="$(start_ssh_tunnel node06 15090 50088)"
|
||||||
|
trap 'stop_ssh_tunnel node06 "${grpc_tunnel}"' RETURN
|
||||||
|
|
||||||
|
python3 "${REPO_ROOT}/nix/test-cluster/nightlight_remote_write.py" \
|
||||||
|
--url "${base_url}/write" \
|
||||||
|
--metric "${metric_name}" \
|
||||||
|
--value "${metric_value}" \
|
||||||
|
--label source=grpc \
|
||||||
|
--label cluster=photoncloud
|
||||||
|
|
||||||
|
wait_for_nightlight_query_result "${base_url}" "NightLight persistence pre-restart" "${metric_name}" "${metric_value}" "source=\"grpc\""
|
||||||
|
|
||||||
|
grpcurl -plaintext \
|
||||||
|
-import-path "${NIGHTLIGHT_PROTO_DIR}" \
|
||||||
|
-proto "${NIGHTLIGHT_QUERY_PROTO}" \
|
||||||
|
-d "$(jq -cn --arg query "${metric_name}{source=\"grpc\"}" '{query:$query, time:0, timeout:5000}')" \
|
||||||
|
127.0.0.1:15090 nightlight.MetricQuery/InstantQuery \
|
||||||
|
| jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
|
||||||
|
.status == "success"
|
||||||
|
and (.data.result | any(.metric.__name__ == $name and (.value.value >= ($expected - 0.001)) and (.value.value <= ($expected + 0.001))))
|
||||||
|
' >/dev/null
|
||||||
|
|
||||||
|
grpcurl -plaintext \
|
||||||
|
-import-path "${NIGHTLIGHT_PROTO_DIR}" \
|
||||||
|
-proto "${NIGHTLIGHT_QUERY_PROTO}" \
|
||||||
|
-d "$(jq -cn --arg match "__name__=${metric_name}" '{match:[$match]}')" \
|
||||||
|
127.0.0.1:15090 nightlight.MetricQuery/SeriesQuery \
|
||||||
|
| jq -e --arg name "${metric_name}" '.status == "success" and (.data | any(.labels.__name__ == $name))' >/dev/null
|
||||||
|
|
||||||
|
grpcurl -plaintext \
|
||||||
|
-import-path "${NIGHTLIGHT_PROTO_DIR}" \
|
||||||
|
-proto "${NIGHTLIGHT_QUERY_PROTO}" \
|
||||||
|
-d "$(jq -cn --arg label "source" --arg match "__name__=${metric_name}" '{labelName:$label, match:[$match]}')" \
|
||||||
|
127.0.0.1:15090 nightlight.MetricQuery/LabelValuesQuery \
|
||||||
|
| jq -e '.status == "success" and (.data | index("grpc")) != null' >/dev/null
|
||||||
|
|
||||||
|
grpcurl -plaintext \
|
||||||
|
-import-path "${NIGHTLIGHT_PROTO_DIR}" \
|
||||||
|
-proto "${NIGHTLIGHT_ADMIN_PROTO}" \
|
||||||
|
-d '{}' \
|
||||||
|
127.0.0.1:15090 nightlight.Admin/Health \
|
||||||
|
| jq -e '.status == "ok" and (.components | any(.name == "storage" and .status == "ok"))' >/dev/null
|
||||||
|
|
||||||
|
grpcurl -plaintext \
|
||||||
|
-import-path "${NIGHTLIGHT_PROTO_DIR}" \
|
||||||
|
-proto "${NIGHTLIGHT_ADMIN_PROTO}" \
|
||||||
|
-d '{}' \
|
||||||
|
127.0.0.1:15090 nightlight.Admin/Stats \
|
||||||
|
| jq -e '.storage.totalSamples >= 1 and .ingestion.samplesIngestedTotal >= 1 and .query.queriesTotal >= 1' >/dev/null
|
||||||
|
|
||||||
|
ssh_node node06 "systemctl restart nightlight.service"
|
||||||
|
wait_for_host_http http://127.0.0.1:9090/healthz
|
||||||
|
wait_for_tcp_port node06 50088
|
||||||
|
|
||||||
|
wait_for_nightlight_query_result "${base_url}" "NightLight persistence post-restart" "${metric_name}" "${metric_value}" "source=\"grpc\""
|
||||||
|
|
||||||
|
grpcurl -plaintext \
|
||||||
|
-import-path "${NIGHTLIGHT_PROTO_DIR}" \
|
||||||
|
-proto "${NIGHTLIGHT_QUERY_PROTO}" \
|
||||||
|
-d "$(jq -cn --arg query "${metric_name}{source=\"grpc\"}" '{query:$query, time:0, timeout:5000}')" \
|
||||||
|
127.0.0.1:15090 nightlight.MetricQuery/InstantQuery \
|
||||||
|
| jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
|
||||||
|
.status == "success"
|
||||||
|
and (.data.result | any(.metric.__name__ == $name and (.value.value >= ($expected - 0.001)) and (.value.value <= ($expected + 0.001))))
|
||||||
|
' >/dev/null
|
||||||
|
|
||||||
|
trap - RETURN
|
||||||
|
stop_ssh_tunnel node06 "${grpc_tunnel}"
|
||||||
|
}
|
||||||
|
|
||||||
validate_creditservice_rest_flow() {
|
validate_creditservice_rest_flow() {
|
||||||
local base_url="$1"
|
local base_url="$1"
|
||||||
local token="$2"
|
local token="$2"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue