photoncloud-monorepo/lightningstor/crates/lightningstor-server/src/metadata.rs

//! Metadata storage using FlareDB, PostgreSQL, or SQLite.
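//!
//! Usage sketch (illustrative; the in-memory backend is intended for tests,
//! while production deployments construct the store with `new_flaredb` or
//! `new_sql`):
//!
//! ```ignore
//! let store = MetadataStore::new_in_memory();
//! // `bucket` stands for a previously constructed `lightningstor_types::Bucket`.
//! store.save_bucket(&bucket).await?;
//! let loaded = store.load_bucket(&bucket.org_id, &bucket.project_id, bucket.name.as_str()).await?;
//! ```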
use dashmap::DashMap;
use flaredb_client::RdbClient;
use lightningstor_distributed::ReplicatedRepairTask;
use lightningstor_types::{Bucket, BucketId, MultipartUpload, Object, ObjectId, Result};
use serde_json;
use sqlx::pool::PoolOptions;
use sqlx::{Pool, Postgres, Sqlite};
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::Mutex;
use tonic::Code;
/// Storage backend enum
enum StorageBackend {
FlareDB(Vec<Arc<Mutex<RdbClient>>>),
Sql(SqlStorageBackend),
InMemory(Arc<DashMap<String, String>>),
}
enum SqlStorageBackend {
Postgres(Arc<Pool<Postgres>>),
Sqlite(Arc<Pool<Sqlite>>),
}
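/// Number of pooled FlareDB client connections. Point operations are hashed
/// across the pool by key (`flaredb_client_for_key`); range scans always go
/// through the first client (`flaredb_scan_client`).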
const FLAREDB_CLIENT_POOL_SIZE: usize = 8;
/// Metadata store for buckets and objects
pub struct MetadataStore {
backend: StorageBackend,
bucket_cache: Arc<DashMap<String, Bucket>>,
object_cache: Arc<DashMap<String, Object>>,
}
impl MetadataStore {
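/// Heuristic for detecting that FlareDB rejected a raw (eventually consistent)
/// operation: a `FailedPrecondition` status whose message mentions "not eventual"
/// means the namespace requires the strongly consistent CAS code path instead.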
fn flaredb_requires_strong(status: &tonic::Status) -> bool {
status.code() == Code::FailedPrecondition && status.message().contains("not eventual")
}
/// Create a new metadata store using the default backend (FlareDB).
pub async fn new(endpoint: Option<String>) -> Result<Self> {
Self::new_flaredb(endpoint).await
}
/// Create a new metadata store with FlareDB backend
pub async fn new_flaredb(endpoint: Option<String>) -> Result<Self> {
Self::new_flaredb_with_pd(endpoint, None).await
}
/// Create a new metadata store with FlareDB backend and explicit PD address.
pub async fn new_flaredb_with_pd(
endpoint: Option<String>,
pd_endpoint: Option<String>,
) -> Result<Self> {
let endpoint = endpoint.unwrap_or_else(|| "127.0.0.1:2479".to_string());
let pd_endpoint = pd_endpoint
.map(|value| normalize_transport_addr(&value))
.unwrap_or_else(|| endpoint.clone());
let mut clients = Vec::with_capacity(FLAREDB_CLIENT_POOL_SIZE);
for _ in 0..FLAREDB_CLIENT_POOL_SIZE {
let client = RdbClient::connect_with_pd_namespace(
endpoint.clone(),
pd_endpoint.clone(),
"lightningstor",
)
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to connect to FlareDB: {}",
e
))
})?;
clients.push(Arc::new(Mutex::new(client)));
}
Ok(Self {
backend: StorageBackend::FlareDB(clients),
bucket_cache: Arc::new(DashMap::new()),
object_cache: Arc::new(DashMap::new()),
})
}
/// Create a metadata store backed by PostgreSQL or SQLite.
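///
/// Example (URLs are illustrative):
///
/// ```ignore
/// // SQLite is accepted only in single-node mode with a file-backed URL.
/// let store = MetadataStore::new_sql("sqlite:/var/lib/lightningstor/meta.db", true).await?;
/// // Postgres is accepted in either mode.
/// let store = MetadataStore::new_sql("postgres://user:pass@db:5432/lightningstor", false).await?;
/// ```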
pub async fn new_sql(database_url: &str, single_node: bool) -> Result<Self> {
let url = database_url.trim();
if url.is_empty() {
return Err(lightningstor_types::Error::StorageError(
"metadata database URL is empty".to_string(),
));
}
if Self::is_postgres_url(url) {
let pool = PoolOptions::<Postgres>::new()
.max_connections(10)
.connect(url)
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to connect to Postgres: {}",
e
))
})?;
Self::ensure_sql_schema_postgres(&pool).await?;
return Ok(Self {
backend: StorageBackend::Sql(SqlStorageBackend::Postgres(Arc::new(pool))),
bucket_cache: Arc::new(DashMap::new()),
object_cache: Arc::new(DashMap::new()),
});
}
if Self::is_sqlite_url(url) {
if !single_node {
return Err(lightningstor_types::Error::StorageError(
"SQLite is allowed only in single-node mode".to_string(),
));
}
if url.contains(":memory:") {
return Err(lightningstor_types::Error::StorageError(
"In-memory SQLite is not allowed".to_string(),
));
}
let pool = PoolOptions::<Sqlite>::new()
.max_connections(1)
.connect(url)
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to connect to SQLite: {}",
e
))
})?;
Self::ensure_sql_schema_sqlite(&pool).await?;
return Ok(Self {
backend: StorageBackend::Sql(SqlStorageBackend::Sqlite(Arc::new(pool))),
bucket_cache: Arc::new(DashMap::new()),
object_cache: Arc::new(DashMap::new()),
});
}
Err(lightningstor_types::Error::StorageError(
"Unsupported metadata database URL (use postgres://, postgresql://, or sqlite:)"
.to_string(),
))
}
/// Create a new in-memory metadata store (for testing)
pub fn new_in_memory() -> Self {
Self {
backend: StorageBackend::InMemory(Arc::new(DashMap::new())),
bucket_cache: Arc::new(DashMap::new()),
object_cache: Arc::new(DashMap::new()),
}
}
fn is_postgres_url(url: &str) -> bool {
url.starts_with("postgres://") || url.starts_with("postgresql://")
}
fn is_sqlite_url(url: &str) -> bool {
url.starts_with("sqlite:")
}
async fn ensure_sql_schema_postgres(pool: &Pool<Postgres>) -> Result<()> {
sqlx::query(
"CREATE TABLE IF NOT EXISTS metadata_kv (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)",
)
.execute(pool)
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to initialize Postgres schema: {}",
e
))
})?;
Ok(())
}
async fn ensure_sql_schema_sqlite(pool: &Pool<Sqlite>) -> Result<()> {
sqlx::query(
"CREATE TABLE IF NOT EXISTS metadata_kv (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)",
)
.execute(pool)
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to initialize SQLite schema: {}",
e
))
})?;
Ok(())
}
fn prefix_end(prefix: &[u8]) -> Vec<u8> {
// Exclusive upper bound for a prefix scan: the shortest key that sorts after
// every key starting with `prefix`. Trailing 0xff bytes cannot be incremented,
// so they are stripped before bumping the preceding byte.
let mut end_key = prefix.to_vec();
while let Some(&last) = end_key.last() {
if last == 0xff {
end_key.pop();
} else {
*end_key.last_mut().expect("end_key is non-empty here") += 1;
return end_key;
}
}
// Empty (or all-0xff) prefix: fall back to a single high byte; every key
// written by this module starts with '/', so this bound still covers them.
vec![0xff]
}
fn exclusive_scan_start(key: &[u8]) -> Vec<u8> {
let mut next = key.to_vec();
next.push(0);
next
}
fn flaredb_client_for_key<'a>(
clients: &'a [Arc<Mutex<RdbClient>>],
key: &[u8],
) -> &'a Arc<Mutex<RdbClient>> {
let mut hasher = DefaultHasher::new();
key.hash(&mut hasher);
let index = (hasher.finish() as usize) % clients.len().max(1);
&clients[index]
}
fn flaredb_scan_client(clients: &[Arc<Mutex<RdbClient>>]) -> &Arc<Mutex<RdbClient>> {
&clients[0]
}
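/// Strongly consistent write via compare-and-swap: read the current version
/// with `cas_get` (defaulting to 0 when the key is absent), then attempt `cas`
/// with that expected version, retrying on contention up to `MAX_RETRIES` times.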
async fn flaredb_put_strong(
client: &Arc<Mutex<RdbClient>>,
key: &[u8],
value: &[u8],
) -> Result<()> {
const MAX_RETRIES: usize = 8;
for _ in 0..MAX_RETRIES {
let mut c = client.lock().await;
let expected_version = c
.cas_get(key.to_vec())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"FlareDB CAS get failed: {}",
e
))
})?
.map(|(version, _)| version)
.unwrap_or(0);
let (success, _current_version, _new_version) = c
.cas(key.to_vec(), value.to_vec(), expected_version)
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"FlareDB CAS put failed: {}",
e
))
})?;
if success {
return Ok(());
}
}
Err(lightningstor_types::Error::StorageError(
"FlareDB CAS put exhausted retries".to_string(),
))
}
async fn flaredb_get_strong(
client: &Arc<Mutex<RdbClient>>,
key: &[u8],
) -> Result<Option<String>> {
let mut c = client.lock().await;
let result = c.cas_get(key.to_vec()).await.map_err(|e| {
lightningstor_types::Error::StorageError(format!("FlareDB CAS get failed: {}", e))
})?;
Ok(result.map(|(_version, bytes)| String::from_utf8_lossy(&bytes).to_string()))
}
async fn flaredb_delete_strong(client: &Arc<Mutex<RdbClient>>, key: &[u8]) -> Result<()> {
let mut c = client.lock().await;
c.cas_delete(key.to_vec(), 0).await.map_err(|e| {
lightningstor_types::Error::StorageError(format!("FlareDB CAS delete failed: {}", e))
})?;
Ok(())
}
async fn flaredb_scan_strong(
client: &Arc<Mutex<RdbClient>>,
start_key: &[u8],
end_key: &[u8],
limit: u32,
) -> Result<(Vec<(String, String)>, Option<Vec<u8>>)> {
let mut c = client.lock().await;
let (entries, next) = c
.cas_scan(start_key.to_vec(), end_key.to_vec(), limit)
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!("FlareDB CAS scan failed: {}", e))
})?;
let results = entries
.into_iter()
.map(|(key, value, _version)| {
(
String::from_utf8_lossy(&key).to_string(),
String::from_utf8_lossy(&value).to_string(),
)
})
.collect();
Ok((results, next))
}
async fn flaredb_put(
clients: &[Arc<Mutex<RdbClient>>],
key: &[u8],
value: &[u8],
) -> Result<()> {
let client = Self::flaredb_client_for_key(clients, key);
let raw_result = {
let mut c = client.lock().await;
c.raw_put(key.to_vec(), value.to_vec()).await
};
match raw_result {
Ok(()) => Ok(()),
Err(status) if Self::flaredb_requires_strong(&status) => {
Self::flaredb_put_strong(client, key, value).await
}
Err(error) => Err(lightningstor_types::Error::StorageError(format!(
"FlareDB put failed: {}",
error
))),
}
}
async fn flaredb_get(clients: &[Arc<Mutex<RdbClient>>], key: &[u8]) -> Result<Option<String>> {
let client = Self::flaredb_client_for_key(clients, key);
let raw_result = {
let mut c = client.lock().await;
c.raw_get(key.to_vec()).await
};
match raw_result {
Ok(result) => Ok(result.map(|bytes| String::from_utf8_lossy(&bytes).to_string())),
Err(status) if Self::flaredb_requires_strong(&status) => {
Self::flaredb_get_strong(client, key).await
}
Err(error) => Err(lightningstor_types::Error::StorageError(format!(
"FlareDB get failed: {}",
error
))),
}
}
async fn flaredb_delete(clients: &[Arc<Mutex<RdbClient>>], key: &[u8]) -> Result<()> {
let client = Self::flaredb_client_for_key(clients, key);
let raw_result = {
let mut c = client.lock().await;
c.raw_delete(key.to_vec()).await
};
match raw_result {
Ok(_) => Ok(()),
Err(status) if Self::flaredb_requires_strong(&status) => {
Self::flaredb_delete_strong(client, key).await
}
Err(error) => Err(lightningstor_types::Error::StorageError(format!(
"FlareDB delete failed: {}",
error
))),
}
}
async fn flaredb_scan(
clients: &[Arc<Mutex<RdbClient>>],
prefix: &[u8],
limit: u32,
) -> Result<Vec<(String, String)>> {
let end_key = Self::prefix_end(prefix);
let mut results = Vec::new();
let mut start_key = prefix.to_vec();
loop {
let client = Self::flaredb_scan_client(clients);
let (items, next) = match {
let mut c = client.lock().await;
c.raw_scan(start_key.clone(), end_key.clone(), limit).await
} {
Ok((keys, values, next)) => {
let items = keys
.into_iter()
.zip(values.into_iter())
.map(|(key, value)| {
(
String::from_utf8_lossy(&key).to_string(),
String::from_utf8_lossy(&value).to_string(),
)
})
.collect();
(items, next)
}
Err(status) if Self::flaredb_requires_strong(&status) => {
Self::flaredb_scan_strong(client, &start_key, &end_key, limit).await?
}
Err(error) => {
return Err(lightningstor_types::Error::StorageError(format!(
"FlareDB scan failed: {}",
error
)));
}
};
results.extend(items);
if let Some(next_key) = next {
start_key = next_key;
} else {
break;
}
}
Ok(results)
}
async fn flaredb_scan_page(
clients: &[Arc<Mutex<RdbClient>>],
prefix: &[u8],
start_after: Option<&[u8]>,
limit: u32,
) -> Result<(Vec<(String, String)>, bool)> {
let end_key = Self::prefix_end(prefix);
let start_key = start_after
.map(Self::exclusive_scan_start)
.unwrap_or_else(|| prefix.to_vec());
let fetch_limit = limit.saturating_add(1).max(1);
let client = Self::flaredb_scan_client(clients);
let (mut items, next) = match {
let mut c = client.lock().await;
c.raw_scan(start_key.clone(), end_key.clone(), fetch_limit)
.await
} {
Ok((keys, values, next)) => {
let items = keys
.into_iter()
.zip(values.into_iter())
.map(|(key, value)| {
(
String::from_utf8_lossy(&key).to_string(),
String::from_utf8_lossy(&value).to_string(),
)
})
.collect::<Vec<_>>();
(items, next)
}
Err(status) if Self::flaredb_requires_strong(&status) => {
Self::flaredb_scan_strong(client, &start_key, &end_key, fetch_limit).await?
}
Err(error) => {
return Err(lightningstor_types::Error::StorageError(format!(
"FlareDB scan failed: {}",
error
)));
}
};
let has_more = if items.len() > limit as usize {
items.truncate(limit as usize);
true
} else {
next.is_some()
};
Ok((items, has_more))
}
async fn flaredb_has_prefix(clients: &[Arc<Mutex<RdbClient>>], prefix: &[u8]) -> Result<bool> {
let end_key = Self::prefix_end(prefix);
let client = Self::flaredb_scan_client(clients);
match {
let mut c = client.lock().await;
c.raw_scan(prefix.to_vec(), end_key.clone(), 1).await
} {
Ok((keys, _, _)) => Ok(!keys.is_empty()),
Err(status) if Self::flaredb_requires_strong(&status) => {
let (entries, _) = Self::flaredb_scan_strong(client, prefix, &end_key, 1).await?;
Ok(!entries.is_empty())
}
Err(error) => Err(lightningstor_types::Error::StorageError(format!(
"FlareDB scan failed: {}",
error
))),
}
}
/// Internal: put a key-value pair
async fn put(&self, key: &str, value: &str) -> Result<()> {
match &self.backend {
StorageBackend::FlareDB(client) => {
Self::flaredb_put(client, key.as_bytes(), value.as_bytes()).await?;
}
StorageBackend::Sql(sql) => match sql {
SqlStorageBackend::Postgres(pool) => {
sqlx::query(
"INSERT INTO metadata_kv (key, value)
VALUES ($1, $2)
ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value",
)
.bind(key)
.bind(value)
.execute(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Postgres put failed: {}",
e
))
})?;
}
SqlStorageBackend::Sqlite(pool) => {
sqlx::query(
"INSERT INTO metadata_kv (key, value)
VALUES (?1, ?2)
ON CONFLICT(key) DO UPDATE SET value = excluded.value",
)
.bind(key)
.bind(value)
.execute(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"SQLite put failed: {}",
e
))
})?;
}
},
StorageBackend::InMemory(map) => {
map.insert(key.to_string(), value.to_string());
}
}
Ok(())
}
/// Internal: get a value by key
async fn get(&self, key: &str) -> Result<Option<String>> {
match &self.backend {
StorageBackend::FlareDB(client) => Self::flaredb_get(client, key.as_bytes()).await,
StorageBackend::Sql(sql) => match sql {
SqlStorageBackend::Postgres(pool) => {
let value: Option<String> =
sqlx::query_scalar("SELECT value FROM metadata_kv WHERE key = $1")
.bind(key)
.fetch_optional(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Postgres get failed: {}",
e
))
})?;
Ok(value)
}
SqlStorageBackend::Sqlite(pool) => {
let value: Option<String> =
sqlx::query_scalar("SELECT value FROM metadata_kv WHERE key = ?1")
.bind(key)
.fetch_optional(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"SQLite get failed: {}",
e
))
})?;
Ok(value)
}
},
StorageBackend::InMemory(map) => Ok(map.get(key).map(|v| v.value().clone())),
}
}
/// Internal: delete a key
async fn delete_key(&self, key: &str) -> Result<()> {
match &self.backend {
StorageBackend::FlareDB(client) => Self::flaredb_delete(client, key.as_bytes()).await?,
StorageBackend::Sql(sql) => match sql {
SqlStorageBackend::Postgres(pool) => {
sqlx::query("DELETE FROM metadata_kv WHERE key = $1")
.bind(key)
.execute(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Postgres delete failed: {}",
e
))
})?;
}
SqlStorageBackend::Sqlite(pool) => {
sqlx::query("DELETE FROM metadata_kv WHERE key = ?1")
.bind(key)
.execute(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"SQLite delete failed: {}",
e
))
})?;
}
},
StorageBackend::InMemory(map) => {
map.remove(key);
}
}
Ok(())
}
/// Internal: get all keys with a prefix
async fn get_prefix(&self, prefix: &str) -> Result<Vec<(String, String)>> {
match &self.backend {
StorageBackend::FlareDB(client) => {
Self::flaredb_scan(client, prefix.as_bytes(), 1000).await
}
StorageBackend::Sql(sql) => {
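// NOTE: this assumes the prefix contains no SQL LIKE wildcards (`%`, `_`).
// The keys built by this module are plain `/lightningstor/...` paths, but
// user-supplied object keys embedded in the prefix are not escaped here.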
let like_pattern = format!("{}%", prefix);
match sql {
SqlStorageBackend::Postgres(pool) => {
let rows: Vec<(String, String)> = sqlx::query_as(
"SELECT key, value FROM metadata_kv WHERE key LIKE $1 ORDER BY key",
)
.bind(like_pattern)
.fetch_all(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Postgres scan failed: {}",
e
))
})?;
Ok(rows)
}
SqlStorageBackend::Sqlite(pool) => {
let rows: Vec<(String, String)> = sqlx::query_as(
"SELECT key, value FROM metadata_kv WHERE key LIKE ?1 ORDER BY key",
)
.bind(like_pattern)
.fetch_all(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"SQLite scan failed: {}",
e
))
})?;
Ok(rows)
}
}
}
StorageBackend::InMemory(map) => {
let mut results = Vec::new();
for entry in map.iter() {
if entry.key().starts_with(prefix) {
results.push((entry.key().clone(), entry.value().clone()));
}
}
results.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
Ok(results)
}
}
}
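/// Internal: fetch one page of keys under `prefix`, strictly after `start_after`
/// (exclusive), returning at most `limit` entries plus a flag indicating whether
/// more entries remain.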
async fn get_prefix_page(
&self,
prefix: &str,
start_after: Option<&str>,
limit: u32,
) -> Result<(Vec<(String, String)>, bool)> {
if limit == 0 {
return Ok((Vec::new(), false));
}
match &self.backend {
StorageBackend::FlareDB(client) => {
Self::flaredb_scan_page(
client,
prefix.as_bytes(),
start_after.map(str::as_bytes),
limit,
)
.await
}
StorageBackend::Sql(sql) => {
let prefix_end =
String::from_utf8(Self::prefix_end(prefix.as_bytes())).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to encode prefix end: {}",
e
))
})?;
let fetch_limit = (limit.saturating_add(1)) as i64;
match sql {
SqlStorageBackend::Postgres(pool) => {
let rows: Vec<(String, String)> = if let Some(after) = start_after {
sqlx::query_as(
"SELECT key, value FROM metadata_kv
WHERE key >= $1 AND key < $2 AND key > $3
ORDER BY key
LIMIT $4",
)
.bind(prefix)
.bind(&prefix_end)
.bind(after)
.bind(fetch_limit)
.fetch_all(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Postgres paged scan failed: {}",
e
))
})?
} else {
sqlx::query_as(
"SELECT key, value FROM metadata_kv
WHERE key >= $1 AND key < $2
ORDER BY key
LIMIT $3",
)
.bind(prefix)
.bind(&prefix_end)
.bind(fetch_limit)
.fetch_all(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Postgres paged scan failed: {}",
e
))
})?
};
let has_more = rows.len() > limit as usize;
let items = rows.into_iter().take(limit as usize).collect();
Ok((items, has_more))
}
SqlStorageBackend::Sqlite(pool) => {
let rows: Vec<(String, String)> = if let Some(after) = start_after {
sqlx::query_as(
"SELECT key, value FROM metadata_kv
WHERE key >= ?1 AND key < ?2 AND key > ?3
ORDER BY key
LIMIT ?4",
)
.bind(prefix)
.bind(&prefix_end)
.bind(after)
.bind(fetch_limit)
.fetch_all(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"SQLite paged scan failed: {}",
e
))
})?
} else {
sqlx::query_as(
"SELECT key, value FROM metadata_kv
WHERE key >= ?1 AND key < ?2
ORDER BY key
LIMIT ?3",
)
.bind(prefix)
.bind(&prefix_end)
.bind(fetch_limit)
.fetch_all(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"SQLite paged scan failed: {}",
e
))
})?
};
let has_more = rows.len() > limit as usize;
let items = rows.into_iter().take(limit as usize).collect();
Ok((items, has_more))
}
}
}
StorageBackend::InMemory(map) => {
let mut rows: Vec<(String, String)> = map
.iter()
.filter(|entry| entry.key().starts_with(prefix))
.map(|entry| (entry.key().clone(), entry.value().clone()))
.collect();
rows.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
if let Some(after) = start_after {
rows.retain(|(key, _)| key.as_str() > after);
}
let has_more = rows.len() > limit as usize;
let items = rows.into_iter().take(limit as usize).collect();
Ok((items, has_more))
}
}
}
/// Internal: check if any key exists with a prefix
async fn has_prefix(&self, prefix: &str) -> Result<bool> {
match &self.backend {
StorageBackend::FlareDB(client) => {
Self::flaredb_has_prefix(client, prefix.as_bytes()).await
}
StorageBackend::Sql(sql) => {
let like_pattern = format!("{}%", prefix);
match sql {
SqlStorageBackend::Postgres(pool) => {
let found: Option<String> = sqlx::query_scalar(
"SELECT key FROM metadata_kv WHERE key LIKE $1 LIMIT 1",
)
.bind(like_pattern)
.fetch_optional(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Postgres scan failed: {}",
e
))
})?;
Ok(found.is_some())
}
SqlStorageBackend::Sqlite(pool) => {
let found: Option<String> = sqlx::query_scalar(
"SELECT key FROM metadata_kv WHERE key LIKE ?1 LIMIT 1",
)
.bind(like_pattern)
.fetch_optional(pool.as_ref())
.await
.map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"SQLite scan failed: {}",
e
))
})?;
Ok(found.is_some())
}
}
}
StorageBackend::InMemory(map) => {
for entry in map.iter() {
if entry.key().starts_with(prefix) {
return Ok(true);
}
}
Ok(false)
}
}
}
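// Key-space layout used by the helpers below (placeholders are illustrative):
//   /lightningstor/buckets/{org_id}/{project_id}/{bucket_name}        -> Bucket JSON
//   /lightningstor/bucket_ids/{bucket_id}                             -> bucket key (indirection)
//   /lightningstor/objects/{bucket_id}/{object_key}[/{version_id}]    -> Object JSON
//   /lightningstor/multipart/uploads/{upload_id}                      -> MultipartUpload JSON
//   /lightningstor/multipart/by-bucket/{bucket_id}/{key}/{upload_id}  -> MultipartUpload JSON (index)
//   /lightningstor/multipart/objects/{object_id}                      -> MultipartUpload JSON (manifest)
//   /lightningstor/repair/replicated/{task_id}                        -> ReplicatedRepairTask JSON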
/// Build bucket key
fn bucket_key(org_id: &str, project_id: &str, bucket_name: &str) -> String {
format!(
"/lightningstor/buckets/{}/{}/{}",
org_id, project_id, bucket_name
)
}
/// Build bucket ID key
fn bucket_id_key(bucket_id: &BucketId) -> String {
format!("/lightningstor/bucket_ids/{}", bucket_id)
}
/// Build object key
fn object_key(bucket_id: &BucketId, object_key: &str, version_id: Option<&str>) -> String {
if let Some(version_id) = version_id {
format!(
"/lightningstor/objects/{}/{}/{}",
bucket_id, object_key, version_id
)
} else {
format!("/lightningstor/objects/{}/{}", bucket_id, object_key)
}
}
/// Build object prefix for listing
fn object_prefix(bucket_id: &BucketId, prefix: &str) -> String {
format!("/lightningstor/objects/{}/{}", bucket_id, prefix)
}
fn multipart_upload_key(upload_id: &str) -> String {
format!("/lightningstor/multipart/uploads/{}", upload_id)
}
fn multipart_upload_prefix() -> &'static str {
"/lightningstor/multipart/uploads/"
}
fn multipart_bucket_key(bucket_id: &str, object_key: &str, upload_id: &str) -> String {
format!(
"/lightningstor/multipart/by-bucket/{}/{}/{}",
bucket_id, object_key, upload_id
)
}
fn multipart_bucket_prefix(bucket_id: &BucketId, prefix: &str) -> String {
format!(
"/lightningstor/multipart/by-bucket/{}/{}",
bucket_id, prefix
)
}
fn multipart_object_key(object_id: &ObjectId) -> String {
format!("/lightningstor/multipart/objects/{}", object_id)
}
fn replicated_repair_task_key(task_id: &str) -> String {
format!("/lightningstor/repair/replicated/{}", task_id)
}
fn replicated_repair_task_prefix() -> &'static str {
"/lightningstor/repair/replicated/"
}
pub async fn save_replicated_repair_task(&self, task: &ReplicatedRepairTask) -> Result<()> {
let key = Self::replicated_repair_task_key(&task.id);
let value = serde_json::to_string(task).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to serialize replicated repair task: {}",
e
))
})?;
self.put(&key, &value).await
}
pub async fn list_replicated_repair_tasks(
&self,
limit: u32,
) -> Result<Vec<ReplicatedRepairTask>> {
let (items, _) = self
.get_prefix_page(Self::replicated_repair_task_prefix(), None, limit)
.await?;
let mut tasks = Vec::new();
for (_, value) in items {
let task: ReplicatedRepairTask = serde_json::from_str(&value).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to deserialize replicated repair task: {}",
e
))
})?;
tasks.push(task);
}
Ok(tasks)
}
pub async fn delete_replicated_repair_task(&self, task_id: &str) -> Result<()> {
self.delete_key(&Self::replicated_repair_task_key(task_id))
.await
}
/// Save bucket metadata
pub async fn save_bucket(&self, bucket: &Bucket) -> Result<()> {
let key = Self::bucket_key(&bucket.org_id, &bucket.project_id, bucket.name.as_str());
let value = serde_json::to_string(bucket).map_err(|e| {
lightningstor_types::Error::StorageError(format!("Failed to serialize bucket: {}", e))
})?;
self.put(&key, &value).await?;
// Also save bucket ID mapping
let id_key = Self::bucket_id_key(&bucket.id);
self.put(&id_key, &key).await?;
self.bucket_cache.insert(key, bucket.clone());
self.bucket_cache.insert(id_key, bucket.clone());
Ok(())
}
/// Load bucket metadata
pub async fn load_bucket(
&self,
org_id: &str,
project_id: &str,
bucket_name: &str,
) -> Result<Option<Bucket>> {
let key = Self::bucket_key(org_id, project_id, bucket_name);
if let Some(bucket) = self.bucket_cache.get(&key) {
return Ok(Some(bucket.clone()));
}
if let Some(value) = self.get(&key).await? {
let bucket: Bucket = serde_json::from_str(&value).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to deserialize bucket: {}",
e
))
})?;
self.bucket_cache.insert(key, bucket.clone());
Ok(Some(bucket))
} else {
Ok(None)
}
}
/// Load bucket by ID
pub async fn load_bucket_by_id(&self, bucket_id: &BucketId) -> Result<Option<Bucket>> {
let id_key = Self::bucket_id_key(bucket_id);
if let Some(bucket) = self.bucket_cache.get(&id_key) {
return Ok(Some(bucket.clone()));
}
if let Some(bucket_key) = self.get(&id_key).await? {
if let Some(value) = self.get(&bucket_key).await? {
let bucket: Bucket = serde_json::from_str(&value).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to deserialize bucket: {}",
e
))
})?;
self.bucket_cache.insert(bucket_key.clone(), bucket.clone());
self.bucket_cache.insert(id_key, bucket.clone());
Ok(Some(bucket))
} else {
Ok(None)
}
} else {
Ok(None)
}
}
/// Delete bucket metadata
pub async fn delete_bucket(&self, bucket: &Bucket) -> Result<()> {
// Only delete bucket metadata; object deletion should be explicit.
let key = Self::bucket_key(&bucket.org_id, &bucket.project_id, bucket.name.as_str());
let id_key = Self::bucket_id_key(&bucket.id);
self.delete_key(&key).await?;
self.delete_key(&id_key).await?;
self.bucket_cache.remove(&key);
self.bucket_cache.remove(&id_key);
Ok(())
}
/// Check whether a bucket has any objects
pub async fn has_objects(&self, bucket_id: &BucketId) -> Result<bool> {
let prefix = format!("/lightningstor/objects/{}/", bucket_id);
self.has_prefix(&prefix).await
}
/// List buckets for a tenant
pub async fn list_buckets(
&self,
org_id: &str,
project_id: Option<&str>,
) -> Result<Vec<Bucket>> {
let prefix = if let Some(project_id) = project_id {
format!("/lightningstor/buckets/{}/{}/", org_id, project_id)
} else {
format!("/lightningstor/buckets/{}/", org_id)
};
let items = self.get_prefix(&prefix).await?;
let mut buckets = Vec::new();
for (_, value) in items {
if let Ok(bucket) = serde_json::from_str::<Bucket>(&value) {
let key =
Self::bucket_key(&bucket.org_id, &bucket.project_id, bucket.name.as_str());
let id_key = Self::bucket_id_key(&bucket.id);
self.bucket_cache.insert(key, bucket.clone());
self.bucket_cache.insert(id_key, bucket.clone());
buckets.push(bucket);
}
}
Ok(buckets)
}
/// Save object metadata
pub async fn save_object(&self, object: &Object) -> Result<()> {
let version_id = if object.version.is_null() {
None
} else {
Some(object.version.as_str())
};
// Object stores bucket_id as a String, so parse it back into a BucketId.
let bucket_id = BucketId::from_str(&object.bucket_id).map_err(|_| {
lightningstor_types::Error::InvalidArgument("Invalid bucket ID".to_string())
})?;
let key = Self::object_key(&bucket_id, object.key.as_str(), version_id);
let value = serde_json::to_string(object).map_err(|e| {
lightningstor_types::Error::StorageError(format!("Failed to serialize object: {}", e))
})?;
self.put(&key, &value).await?;
self.object_cache.insert(key, object.clone());
Ok(())
}
/// Load object metadata
pub async fn load_object(
&self,
bucket_id: &BucketId,
object_key: &str,
version_id: Option<&str>,
) -> Result<Option<Object>> {
let key = Self::object_key(bucket_id, object_key, version_id);
if let Some(object) = self.object_cache.get(&key) {
return Ok(Some(object.clone()));
}
if let Some(value) = self.get(&key).await? {
let object: Object = serde_json::from_str(&value).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to deserialize object: {}",
e
))
})?;
self.object_cache.insert(key, object.clone());
Ok(Some(object))
} else {
Ok(None)
}
}
/// Delete object metadata
pub async fn delete_object(
&self,
bucket_id: &BucketId,
object_key: &str,
version_id: Option<&str>,
) -> Result<()> {
let key = Self::object_key(bucket_id, object_key, version_id);
self.delete_key(&key).await?;
self.object_cache.remove(&key);
Ok(())
}
/// List objects in a bucket
pub async fn list_objects(
&self,
bucket_id: &BucketId,
prefix: &str,
max_keys: u32,
) -> Result<Vec<Object>> {
if max_keys > 0 {
return self
.list_objects_page(bucket_id, prefix, None, max_keys)
.await
.map(|(objects, _)| objects);
}
let prefix_key = Self::object_prefix(bucket_id, prefix);
let items = self.get_prefix(&prefix_key).await?;
let mut objects = Vec::new();
for (_, value) in items.into_iter() {
if let Ok(object) = serde_json::from_str::<Object>(&value) {
objects.push(object);
}
}
// Sort by key for consistent ordering
objects.sort_by(|a, b| a.key.as_str().cmp(b.key.as_str()));
Ok(objects)
}
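/// List one page of objects under `prefix`, starting strictly after
/// `start_after_key` (an object key, not a full storage key), together with a
/// flag indicating whether more objects remain.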
pub async fn list_objects_page(
&self,
bucket_id: &BucketId,
prefix: &str,
start_after_key: Option<&str>,
max_keys: u32,
) -> Result<(Vec<Object>, bool)> {
if max_keys == 0 {
return Ok((Vec::new(), false));
}
let prefix_key = Self::object_prefix(bucket_id, prefix);
let start_after_storage_key =
start_after_key.map(|key| Self::object_key(bucket_id, key, None));
let (items, has_more) = self
.get_prefix_page(&prefix_key, start_after_storage_key.as_deref(), max_keys)
.await?;
let mut objects = Vec::new();
for (_, value) in items {
if let Ok(object) = serde_json::from_str::<Object>(&value) {
objects.push(object);
}
}
Ok((objects, has_more))
}
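/// Persist a multipart upload under two keys: the primary record keyed by
/// upload ID and a by-bucket index entry used for prefix listings.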
pub async fn save_multipart_upload(&self, upload: &MultipartUpload) -> Result<()> {
let key = Self::multipart_upload_key(upload.upload_id.as_str());
let value = serde_json::to_string(upload).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to serialize multipart upload: {}",
e
))
})?;
self.put(&key, &value).await?;
self.put(
&Self::multipart_bucket_key(
&upload.bucket_id,
upload.key.as_str(),
upload.upload_id.as_str(),
),
&value,
)
.await
}
pub async fn load_multipart_upload(&self, upload_id: &str) -> Result<Option<MultipartUpload>> {
let key = Self::multipart_upload_key(upload_id);
if let Some(value) = self.get(&key).await? {
let upload: MultipartUpload = serde_json::from_str(&value).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to deserialize multipart upload: {}",
e
))
})?;
Ok(Some(upload))
} else {
Ok(None)
}
}
pub async fn delete_multipart_upload(&self, upload_id: &str) -> Result<()> {
if let Some(upload) = self.load_multipart_upload(upload_id).await? {
self.delete_key(&Self::multipart_bucket_key(
&upload.bucket_id,
upload.key.as_str(),
upload.upload_id.as_str(),
))
.await?;
}
self.delete_key(&Self::multipart_upload_key(upload_id))
.await
}
pub async fn list_multipart_uploads(
&self,
bucket_id: &BucketId,
prefix: &str,
max_uploads: u32,
) -> Result<Vec<MultipartUpload>> {
let index_prefix = Self::multipart_bucket_prefix(bucket_id, prefix);
let items = if max_uploads > 0 {
self.get_prefix_page(&index_prefix, None, max_uploads)
.await?
.0
} else {
self.get_prefix(&index_prefix).await?
};
let mut uploads = Vec::new();
for (_, value) in items {
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
uploads.push(upload);
}
}
if uploads.is_empty() {
let fallback_items = self.get_prefix(Self::multipart_upload_prefix()).await?;
for (_, value) in fallback_items {
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
if upload.bucket_id == bucket_id.to_string()
&& upload.key.as_str().starts_with(prefix)
{
uploads.push(upload);
}
}
}
}
uploads.sort_by(|a, b| {
a.key
.as_str()
.cmp(b.key.as_str())
.then_with(|| a.initiated.cmp(&b.initiated))
});
if max_uploads > 0 && uploads.len() > max_uploads as usize {
uploads.truncate(max_uploads as usize);
}
Ok(uploads)
}
pub async fn save_object_multipart_upload(
&self,
object_id: &ObjectId,
upload: &MultipartUpload,
) -> Result<()> {
let key = Self::multipart_object_key(object_id);
let value = serde_json::to_string(upload).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to serialize multipart manifest: {}",
e
))
})?;
self.put(&key, &value).await
}
pub async fn load_object_multipart_upload(
&self,
object_id: &ObjectId,
) -> Result<Option<MultipartUpload>> {
let key = Self::multipart_object_key(object_id);
if let Some(value) = self.get(&key).await? {
let upload: MultipartUpload = serde_json::from_str(&value).map_err(|e| {
lightningstor_types::Error::StorageError(format!(
"Failed to deserialize multipart manifest: {}",
e
))
})?;
Ok(Some(upload))
} else {
Ok(None)
}
}
pub async fn delete_object_multipart_upload(&self, object_id: &ObjectId) -> Result<()> {
self.delete_key(&Self::multipart_object_key(object_id))
.await
}
}
fn normalize_transport_addr(endpoint: &str) -> String {
endpoint
.trim()
.trim_start_matches("http://")
.trim_start_matches("https://")
.trim_end_matches('/')
.to_string()
}
#[cfg(test)]
mod tests {
use super::*;
use lightningstor_distributed::ReplicatedRepairTask;
use lightningstor_types::{BucketName, ETag, ObjectKey};
#[tokio::test]
async fn bucket_cache_hits_and_invalidates_on_delete() {
let store = MetadataStore::new_in_memory();
let bucket = Bucket::new(
BucketName::new("bench-bucket").unwrap(),
"org-a",
"project-a",
"default",
);
store.save_bucket(&bucket).await.unwrap();
let cache_key = MetadataStore::bucket_key("org-a", "project-a", "bench-bucket");
let cache_id_key = MetadataStore::bucket_id_key(&bucket.id);
assert!(store.bucket_cache.contains_key(&cache_key));
assert!(store.bucket_cache.contains_key(&cache_id_key));
let loaded = store
.load_bucket("org-a", "project-a", "bench-bucket")
.await
.unwrap()
.unwrap();
assert_eq!(loaded.id, bucket.id);
let by_id = store.load_bucket_by_id(&bucket.id).await.unwrap().unwrap();
assert_eq!(by_id.name, bucket.name);
store.delete_bucket(&bucket).await.unwrap();
assert!(!store.bucket_cache.contains_key(&cache_key));
assert!(!store.bucket_cache.contains_key(&cache_id_key));
assert!(store
.load_bucket("org-a", "project-a", "bench-bucket")
.await
.unwrap()
.is_none());
}
#[tokio::test]
async fn object_cache_hits_and_invalidates_on_delete() {
let store = MetadataStore::new_in_memory();
let bucket = Bucket::new(
BucketName::new("objects-bucket").unwrap(),
"org-a",
"project-a",
"default",
);
store.save_bucket(&bucket).await.unwrap();
let mut object = Object::new(
bucket.id.to_string(),
ObjectKey::new("bench/object.bin").unwrap(),
ETag::from_md5(&[1u8; 16]),
4096,
Some("application/octet-stream".to_string()),
);
object.version = lightningstor_types::ObjectVersion::null();
store.save_object(&object).await.unwrap();
let cache_key = MetadataStore::object_key(&bucket.id, object.key.as_str(), None);
assert!(store.object_cache.contains_key(&cache_key));
let loaded = store
.load_object(&bucket.id, object.key.as_str(), None)
.await
.unwrap()
.unwrap();
assert_eq!(loaded.id, object.id);
store
.delete_object(&bucket.id, object.key.as_str(), None)
.await
.unwrap();
assert!(!store.object_cache.contains_key(&cache_key));
assert!(store
.load_object(&bucket.id, object.key.as_str(), None)
.await
.unwrap()
.is_none());
}
#[tokio::test]
async fn list_objects_page_honors_start_after_and_has_more() {
let store = MetadataStore::new_in_memory();
let bucket = Bucket::new(
BucketName::new("paged-bucket").unwrap(),
"org-a",
"project-a",
"default",
);
store.save_bucket(&bucket).await.unwrap();
for key in ["a.txt", "b.txt", "c.txt"] {
let mut object = Object::new(
bucket.id.to_string(),
ObjectKey::new(key).unwrap(),
ETag::from_md5(&[7u8; 16]),
128,
Some("text/plain".to_string()),
);
object.version = lightningstor_types::ObjectVersion::null();
store.save_object(&object).await.unwrap();
}
let (first_page, first_has_more) = store
.list_objects_page(&bucket.id, "", None, 2)
.await
.unwrap();
assert_eq!(
first_page
.iter()
.map(|object| object.key.as_str().to_string())
.collect::<Vec<_>>(),
vec!["a.txt".to_string(), "b.txt".to_string()]
);
assert!(first_has_more);
let (second_page, second_has_more) = store
.list_objects_page(&bucket.id, "", Some("b.txt"), 2)
.await
.unwrap();
assert_eq!(
second_page
.iter()
.map(|object| object.key.as_str().to_string())
.collect::<Vec<_>>(),
vec!["c.txt".to_string()]
);
assert!(!second_has_more);
}
#[tokio::test]
async fn list_multipart_uploads_uses_bucket_prefix_index() {
let store = MetadataStore::new_in_memory();
let bucket = Bucket::new(
BucketName::new("multipart-bucket").unwrap(),
"org-a",
"project-a",
"default",
);
store.save_bucket(&bucket).await.unwrap();
let upload_a =
MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/one.bin").unwrap());
let upload_b =
MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/two.bin").unwrap());
let other_bucket = Bucket::new(
BucketName::new("other-bucket").unwrap(),
"org-a",
"project-a",
"default",
);
store.save_bucket(&other_bucket).await.unwrap();
let upload_other = MultipartUpload::new(
other_bucket.id.to_string(),
ObjectKey::new("a/three.bin").unwrap(),
);
store.save_multipart_upload(&upload_a).await.unwrap();
store.save_multipart_upload(&upload_b).await.unwrap();
store.save_multipart_upload(&upload_other).await.unwrap();
let uploads = store
.list_multipart_uploads(&bucket.id, "a/", 10)
.await
.unwrap();
assert_eq!(uploads.len(), 2);
assert_eq!(
uploads
.iter()
.map(|upload| upload.key.as_str().to_string())
.collect::<Vec<_>>(),
vec!["a/one.bin".to_string(), "a/two.bin".to_string()]
);
}
#[tokio::test]
async fn replicated_repair_tasks_round_trip() {
let store = MetadataStore::new_in_memory();
let mut task = ReplicatedRepairTask::new("obj_abc", 0, "quorum write");
store.save_replicated_repair_task(&task).await.unwrap();
let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
assert_eq!(tasks.len(), 1);
assert_eq!(tasks[0].key, "obj_abc");
task.schedule_retry("transient failure", 5_000);
store.save_replicated_repair_task(&task).await.unwrap();
let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
assert_eq!(tasks[0].attempt_count, 1);
assert_eq!(tasks[0].last_error.as_deref(), Some("transient failure"));
store.delete_replicated_repair_task(&task.id).await.unwrap();
assert!(store
.list_replicated_repair_tasks(10)
.await
.unwrap()
.is_empty());
}
}