From 8f94aee1fa0639b69c8b1dbcd5840d4b93153ab6 Mon Sep 17 00:00:00 2001 From: centra Date: Tue, 9 Dec 2025 16:51:20 +0900 Subject: [PATCH] Fix R8: Convert submodule gitlinks to regular directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove gitlinks (160000 mode) for chainfire, flaredb, iam - Add workspace contents as regular tracked files - Update flake.nix to use simple paths instead of builtins.fetchGit This resolves the nix build failure where submodule directories appeared empty in the nix store. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- chainfire | 1 - chainfire/.gitignore | 22 + chainfire/Cargo.toml | 89 + chainfire/advice.md | 87 + chainfire/chainfire-client/Cargo.toml | 31 + chainfire/chainfire-client/src/client.rs | 389 ++++ chainfire/chainfire-client/src/error.rs | 34 + chainfire/chainfire-client/src/lib.rs | 34 + chainfire/chainfire-client/src/node.rs | 333 +++ chainfire/chainfire-client/src/watch.rs | 143 ++ chainfire/crates/chainfire-api/Cargo.toml | 42 + chainfire/crates/chainfire-api/build.rs | 19 + .../chainfire-api/src/cluster_service.rs | 216 ++ .../crates/chainfire-api/src/conversions.rs | 113 + .../src/generated/chainfire.internal.rs | 566 +++++ .../src/generated/chainfire.v1.rs | 1817 ++++++++++++++++ .../crates/chainfire-api/src/generated/mod.rs | 13 + .../chainfire-api/src/internal_service.rs | 242 +++ .../crates/chainfire-api/src/kv_service.rs | 285 +++ .../crates/chainfire-api/src/lease_service.rs | 194 ++ chainfire/crates/chainfire-api/src/lib.rs | 29 + .../crates/chainfire-api/src/raft_client.rs | 428 ++++ .../crates/chainfire-api/src/watch_service.rs | 157 ++ chainfire/crates/chainfire-core/Cargo.toml | 37 + .../crates/chainfire-core/src/builder.rs | 221 ++ .../crates/chainfire-core/src/callbacks.rs | 103 + .../crates/chainfire-core/src/cluster.rs | 282 +++ chainfire/crates/chainfire-core/src/config.rs | 162 ++ 
chainfire/crates/chainfire-core/src/error.rs | 78 + chainfire/crates/chainfire-core/src/events.rs | 198 ++ chainfire/crates/chainfire-core/src/kvs.rs | 290 +++ chainfire/crates/chainfire-core/src/lib.rs | 58 + chainfire/crates/chainfire-gossip/Cargo.toml | 35 + .../crates/chainfire-gossip/src/agent.rs | 214 ++ .../crates/chainfire-gossip/src/broadcast.rs | 210 ++ .../crates/chainfire-gossip/src/identity.rs | 147 ++ chainfire/crates/chainfire-gossip/src/lib.rs | 40 + .../crates/chainfire-gossip/src/membership.rs | 141 ++ .../crates/chainfire-gossip/src/runtime.rs | 131 ++ chainfire/crates/chainfire-proto/Cargo.toml | 21 + chainfire/crates/chainfire-proto/build.rs | 12 + chainfire/crates/chainfire-proto/src/lib.rs | 7 + chainfire/crates/chainfire-raft/Cargo.toml | 38 + chainfire/crates/chainfire-raft/src/config.rs | 79 + chainfire/crates/chainfire-raft/src/lib.rs | 20 + .../crates/chainfire-raft/src/network.rs | 316 +++ chainfire/crates/chainfire-raft/src/node.rs | 326 +++ .../crates/chainfire-raft/src/storage.rs | 475 ++++ chainfire/crates/chainfire-server/Cargo.toml | 59 + .../crates/chainfire-server/src/config.rs | 160 ++ chainfire/crates/chainfire-server/src/lib.rs | 10 + chainfire/crates/chainfire-server/src/main.rs | 148 ++ chainfire/crates/chainfire-server/src/node.rs | 201 ++ .../crates/chainfire-server/src/server.rs | 207 ++ .../tests/integration_test.rs | 159 ++ chainfire/crates/chainfire-storage/Cargo.toml | 34 + .../crates/chainfire-storage/src/kv_store.rs | 435 ++++ .../chainfire-storage/src/lease_store.rs | 280 +++ chainfire/crates/chainfire-storage/src/lib.rs | 51 + .../chainfire-storage/src/log_storage.rs | 478 ++++ .../crates/chainfire-storage/src/snapshot.rs | 316 +++ .../chainfire-storage/src/state_machine.rs | 587 +++++ .../crates/chainfire-storage/src/store.rs | 132 ++ chainfire/crates/chainfire-types/Cargo.toml | 18 + .../crates/chainfire-types/src/command.rs | 270 +++ chainfire/crates/chainfire-types/src/error.rs | 164 ++ 
chainfire/crates/chainfire-types/src/kv.rs | 201 ++ chainfire/crates/chainfire-types/src/lease.rs | 187 ++ chainfire/crates/chainfire-types/src/lib.rs | 23 + chainfire/crates/chainfire-types/src/node.rs | 255 +++ chainfire/crates/chainfire-types/src/watch.rs | 266 +++ chainfire/crates/chainfire-watch/Cargo.toml | 26 + chainfire/crates/chainfire-watch/src/lib.rs | 25 + .../crates/chainfire-watch/src/matcher.rs | 150 ++ .../crates/chainfire-watch/src/registry.rs | 353 +++ .../crates/chainfire-watch/src/stream.rs | 190 ++ chainfire/flake.lock | 96 + chainfire/flake.nix | 79 + chainfire/proto/chainfire.proto | 414 ++++ chainfire/proto/internal.proto | 93 + flake.nix | 20 +- flaredb | 1 - flaredb/.gitignore | 18 + flaredb/.specify/memory/constitution.md | 41 + .../scripts/bash/check-prerequisites.sh | 166 ++ flaredb/.specify/scripts/bash/common.sh | 156 ++ .../scripts/bash/create-new-feature.sh | 305 +++ flaredb/.specify/scripts/bash/setup-plan.sh | 61 + .../scripts/bash/update-agent-context.sh | 790 +++++++ .../.specify/templates/agent-file-template.md | 28 + .../.specify/templates/checklist-template.md | 40 + flaredb/.specify/templates/plan-template.md | 104 + flaredb/.specify/templates/spec-template.md | 115 + flaredb/.specify/templates/tasks-template.md | 251 +++ flaredb/Cargo.toml | 66 + flaredb/advice.md | 124 ++ flaredb/chat.md | 1935 +++++++++++++++++ flaredb/crates/flaredb-cli/Cargo.toml | 9 + flaredb/crates/flaredb-cli/src/main.rs | 3 + flaredb/crates/flaredb-client/Cargo.toml | 14 + flaredb/crates/flaredb-client/src/client.rs | 264 +++ flaredb/crates/flaredb-client/src/lib.rs | 4 + flaredb/crates/flaredb-client/src/main.rs | 91 + .../crates/flaredb-client/src/region_cache.rs | 48 + .../flaredb-client/tests/test_rpc_connect.rs | 297 +++ flaredb/crates/flaredb-pd/Cargo.toml | 11 + flaredb/crates/flaredb-pd/src/cluster.rs | 136 ++ flaredb/crates/flaredb-pd/src/main.rs | 40 + flaredb/crates/flaredb-pd/src/pd_service.rs | 78 + 
flaredb/crates/flaredb-pd/src/service.rs | 31 + flaredb/crates/flaredb-pd/src/tso.rs | 332 +++ flaredb/crates/flaredb-proto/Cargo.toml | 12 + flaredb/crates/flaredb-proto/build.rs | 18 + .../crates/flaredb-proto/src/chainfire.proto | 319 +++ flaredb/crates/flaredb-proto/src/kvrpc.proto | 123 ++ flaredb/crates/flaredb-proto/src/lib.rs | 16 + flaredb/crates/flaredb-proto/src/pdpb.proto | 78 + .../flaredb-proto/src/raft_server.proto | 127 ++ flaredb/crates/flaredb-raft/Cargo.toml | 19 + flaredb/crates/flaredb-raft/src/lib.rs | 15 + flaredb/crates/flaredb-raft/src/network.rs | 338 +++ .../flaredb-raft/src/persistent_storage.rs | 611 ++++++ flaredb/crates/flaredb-raft/src/raft_node.rs | 517 +++++ flaredb/crates/flaredb-raft/src/storage.rs | 809 +++++++ flaredb/crates/flaredb-raft/src/types.rs | 155 ++ flaredb/crates/flaredb-server/Cargo.toml | 32 + .../flaredb-server/examples/test_cluster.rs | 69 + .../crates/flaredb-server/src/config/mod.rs | 279 +++ .../crates/flaredb-server/src/heartbeat.rs | 47 + flaredb/crates/flaredb-server/src/lib.rs | 7 + flaredb/crates/flaredb-server/src/main.rs | 320 +++ flaredb/crates/flaredb-server/src/merkle.rs | 113 + .../crates/flaredb-server/src/pd_client.rs | 503 +++++ .../crates/flaredb-server/src/raft_service.rs | 257 +++ flaredb/crates/flaredb-server/src/service.rs | 520 +++++ flaredb/crates/flaredb-server/src/store.rs | 254 +++ .../flaredb-server/tests/test_consistency.rs | 198 ++ .../flaredb-server/tests/test_multi_region.rs | 122 ++ .../crates/flaredb-server/tests/test_split.rs | 71 + .../tests/test_split_trigger.rs | 46 + .../flaredb-server/tests/test_strong_mode.rs | 69 + flaredb/crates/flaredb-storage/Cargo.toml | 13 + flaredb/crates/flaredb-storage/src/engine.rs | 66 + flaredb/crates/flaredb-storage/src/lib.rs | 51 + .../flaredb-storage/src/rocks_engine.rs | 389 ++++ flaredb/crates/flaredb-types/Cargo.toml | 9 + flaredb/crates/flaredb-types/src/command.rs | 35 + flaredb/crates/flaredb-types/src/error.rs | 34 + 
flaredb/crates/flaredb-types/src/lib.rs | 11 + flaredb/crates/flaredb-types/src/region.rs | 32 + flaredb/data/CURRENT | 1 + flaredb/data/IDENTITY | 1 + flaredb/data/LOCK | 0 flaredb/data/LOG | 587 +++++ flaredb/data/MANIFEST-000005 | Bin 0 -> 170 bytes flaredb/data/OPTIONS-000007 | 333 +++ flaredb/flake.lock | 96 + flaredb/flake.nix | 48 + flaredb/rustfmt.toml | 1 + flaredb/scripts/verify-core.sh | 52 + flaredb/scripts/verify-multiraft.sh | 14 + flaredb/scripts/verify-raft.sh | 12 + flaredb/scripts/verify-sharding.sh | 40 + .../checklists/requirements.md | 34 + .../contracts/kvrpc.proto | 55 + .../001-distributed-core/contracts/pdpb.proto | 56 + .../specs/001-distributed-core/data-model.md | 52 + flaredb/specs/001-distributed-core/plan.md | 95 + .../specs/001-distributed-core/quickstart.md | 64 + .../specs/001-distributed-core/research.md | 19 + flaredb/specs/001-distributed-core/spec.md | 87 + flaredb/specs/001-distributed-core/tasks.md | 220 ++ flaredb/specs/001-multi-raft/spec.md | 115 + .../checklists/requirements.md | 34 + .../contracts/raft-service.md | 35 + flaredb/specs/002-raft-features/data-model.md | 34 + flaredb/specs/002-raft-features/plan.md | 69 + flaredb/specs/002-raft-features/quickstart.md | 39 + flaredb/specs/002-raft-features/research.md | 23 + flaredb/specs/002-raft-features/spec.md | 92 + flaredb/specs/002-raft-features/tasks.md | 128 ++ .../checklists/requirements.md | 34 + .../003-kvs-consistency/contracts/kv_cas.md | 29 + .../003-kvs-consistency/contracts/kv_raw.md | 25 + .../contracts/raft_service.md | 33 + .../specs/003-kvs-consistency/data-model.md | 26 + flaredb/specs/003-kvs-consistency/plan.md | 76 + .../specs/003-kvs-consistency/quickstart.md | 78 + flaredb/specs/003-kvs-consistency/research.md | 15 + flaredb/specs/003-kvs-consistency/spec.md | 88 + flaredb/specs/003-kvs-consistency/tasks.md | 119 + .../004-multi-raft/checklists/requirements.md | 34 + flaredb/specs/004-multi-raft/contracts/pd.md | 36 + 
flaredb/specs/004-multi-raft/data-model.md | 45 + flaredb/specs/004-multi-raft/plan.md | 62 + flaredb/specs/004-multi-raft/quickstart.md | 44 + flaredb/specs/004-multi-raft/spec.md | 208 ++ flaredb/specs/004-multi-raft/tasks.md | 125 ++ iam | 1 - iam/Cargo.toml | 97 + iam/config/iam.example.toml | 48 + iam/crates/iam-api/Cargo.toml | 30 + iam/crates/iam-api/build.rs | 11 + iam/crates/iam-api/src/conversions.rs | 412 ++++ iam/crates/iam-api/src/generated/mod.rs | 10 + iam/crates/iam-api/src/iam_service.rs | 918 ++++++++ iam/crates/iam-api/src/lib.rs | 19 + iam/crates/iam-api/src/token_service.rs | 430 ++++ .../iam-api/tests/tenant_path_integration.rs | 778 +++++++ iam/crates/iam-audit/Cargo.toml | 33 + iam/crates/iam-audit/src/event.rs | 467 ++++ iam/crates/iam-audit/src/lib.rs | 32 + iam/crates/iam-audit/src/logger.rs | 281 +++ iam/crates/iam-audit/src/sink.rs | 305 +++ iam/crates/iam-authn/Cargo.toml | 24 + iam/crates/iam-authn/src/jwt.rs | 409 ++++ iam/crates/iam-authn/src/lib.rs | 22 + iam/crates/iam-authn/src/mtls.rs | 353 +++ iam/crates/iam-authn/src/provider.rs | 245 +++ iam/crates/iam-authn/src/token.rs | 856 ++++++++ iam/crates/iam-authz/Cargo.toml | 22 + iam/crates/iam-authz/src/cache.rs | 490 +++++ iam/crates/iam-authz/src/condition.rs | 349 +++ iam/crates/iam-authz/src/context.rs | 222 ++ iam/crates/iam-authz/src/evaluator.rs | 550 +++++ iam/crates/iam-authz/src/lib.rs | 18 + iam/crates/iam-client/Cargo.toml | 20 + iam/crates/iam-client/src/client.rs | 510 +++++ iam/crates/iam-client/src/lib.rs | 7 + iam/crates/iam-server/Cargo.toml | 32 + iam/crates/iam-server/src/config.rs | 347 +++ iam/crates/iam-server/src/main.rs | 257 +++ iam/crates/iam-store/Cargo.toml | 22 + iam/crates/iam-store/src/backend.rs | 1047 +++++++++ iam/crates/iam-store/src/binding_store.rs | 458 ++++ iam/crates/iam-store/src/group_store.rs | 221 ++ iam/crates/iam-store/src/lib.rs | 20 + iam/crates/iam-store/src/principal_store.rs | 403 ++++ iam/crates/iam-store/src/role_store.rs | 388 
++++ iam/crates/iam-store/src/token_store.rs | 77 + iam/crates/iam-types/Cargo.toml | 20 + iam/crates/iam-types/src/condition.rs | 582 +++++ iam/crates/iam-types/src/error.rs | 159 ++ iam/crates/iam-types/src/lib.rs | 30 + iam/crates/iam-types/src/policy.rs | 318 +++ iam/crates/iam-types/src/principal.rs | 235 ++ iam/crates/iam-types/src/resource.rs | 214 ++ iam/crates/iam-types/src/role.rs | 310 +++ iam/crates/iam-types/src/scope.rs | 494 +++++ iam/crates/iam-types/src/token.rs | 506 +++++ iam/flake.lock | 96 + iam/flake.nix | 43 + iam/proto/iam.proto | 622 ++++++ 253 files changed, 45639 insertions(+), 19 deletions(-) delete mode 160000 chainfire create mode 100644 chainfire/.gitignore create mode 100644 chainfire/Cargo.toml create mode 100644 chainfire/advice.md create mode 100644 chainfire/chainfire-client/Cargo.toml create mode 100644 chainfire/chainfire-client/src/client.rs create mode 100644 chainfire/chainfire-client/src/error.rs create mode 100644 chainfire/chainfire-client/src/lib.rs create mode 100644 chainfire/chainfire-client/src/node.rs create mode 100644 chainfire/chainfire-client/src/watch.rs create mode 100644 chainfire/crates/chainfire-api/Cargo.toml create mode 100644 chainfire/crates/chainfire-api/build.rs create mode 100644 chainfire/crates/chainfire-api/src/cluster_service.rs create mode 100644 chainfire/crates/chainfire-api/src/conversions.rs create mode 100644 chainfire/crates/chainfire-api/src/generated/chainfire.internal.rs create mode 100644 chainfire/crates/chainfire-api/src/generated/chainfire.v1.rs create mode 100644 chainfire/crates/chainfire-api/src/generated/mod.rs create mode 100644 chainfire/crates/chainfire-api/src/internal_service.rs create mode 100644 chainfire/crates/chainfire-api/src/kv_service.rs create mode 100644 chainfire/crates/chainfire-api/src/lease_service.rs create mode 100644 chainfire/crates/chainfire-api/src/lib.rs create mode 100644 chainfire/crates/chainfire-api/src/raft_client.rs create mode 100644 
chainfire/crates/chainfire-api/src/watch_service.rs create mode 100644 chainfire/crates/chainfire-core/Cargo.toml create mode 100644 chainfire/crates/chainfire-core/src/builder.rs create mode 100644 chainfire/crates/chainfire-core/src/callbacks.rs create mode 100644 chainfire/crates/chainfire-core/src/cluster.rs create mode 100644 chainfire/crates/chainfire-core/src/config.rs create mode 100644 chainfire/crates/chainfire-core/src/error.rs create mode 100644 chainfire/crates/chainfire-core/src/events.rs create mode 100644 chainfire/crates/chainfire-core/src/kvs.rs create mode 100644 chainfire/crates/chainfire-core/src/lib.rs create mode 100644 chainfire/crates/chainfire-gossip/Cargo.toml create mode 100644 chainfire/crates/chainfire-gossip/src/agent.rs create mode 100644 chainfire/crates/chainfire-gossip/src/broadcast.rs create mode 100644 chainfire/crates/chainfire-gossip/src/identity.rs create mode 100644 chainfire/crates/chainfire-gossip/src/lib.rs create mode 100644 chainfire/crates/chainfire-gossip/src/membership.rs create mode 100644 chainfire/crates/chainfire-gossip/src/runtime.rs create mode 100644 chainfire/crates/chainfire-proto/Cargo.toml create mode 100644 chainfire/crates/chainfire-proto/build.rs create mode 100644 chainfire/crates/chainfire-proto/src/lib.rs create mode 100644 chainfire/crates/chainfire-raft/Cargo.toml create mode 100644 chainfire/crates/chainfire-raft/src/config.rs create mode 100644 chainfire/crates/chainfire-raft/src/lib.rs create mode 100644 chainfire/crates/chainfire-raft/src/network.rs create mode 100644 chainfire/crates/chainfire-raft/src/node.rs create mode 100644 chainfire/crates/chainfire-raft/src/storage.rs create mode 100644 chainfire/crates/chainfire-server/Cargo.toml create mode 100644 chainfire/crates/chainfire-server/src/config.rs create mode 100644 chainfire/crates/chainfire-server/src/lib.rs create mode 100644 chainfire/crates/chainfire-server/src/main.rs create mode 100644 chainfire/crates/chainfire-server/src/node.rs 
create mode 100644 chainfire/crates/chainfire-server/src/server.rs create mode 100644 chainfire/crates/chainfire-server/tests/integration_test.rs create mode 100644 chainfire/crates/chainfire-storage/Cargo.toml create mode 100644 chainfire/crates/chainfire-storage/src/kv_store.rs create mode 100644 chainfire/crates/chainfire-storage/src/lease_store.rs create mode 100644 chainfire/crates/chainfire-storage/src/lib.rs create mode 100644 chainfire/crates/chainfire-storage/src/log_storage.rs create mode 100644 chainfire/crates/chainfire-storage/src/snapshot.rs create mode 100644 chainfire/crates/chainfire-storage/src/state_machine.rs create mode 100644 chainfire/crates/chainfire-storage/src/store.rs create mode 100644 chainfire/crates/chainfire-types/Cargo.toml create mode 100644 chainfire/crates/chainfire-types/src/command.rs create mode 100644 chainfire/crates/chainfire-types/src/error.rs create mode 100644 chainfire/crates/chainfire-types/src/kv.rs create mode 100644 chainfire/crates/chainfire-types/src/lease.rs create mode 100644 chainfire/crates/chainfire-types/src/lib.rs create mode 100644 chainfire/crates/chainfire-types/src/node.rs create mode 100644 chainfire/crates/chainfire-types/src/watch.rs create mode 100644 chainfire/crates/chainfire-watch/Cargo.toml create mode 100644 chainfire/crates/chainfire-watch/src/lib.rs create mode 100644 chainfire/crates/chainfire-watch/src/matcher.rs create mode 100644 chainfire/crates/chainfire-watch/src/registry.rs create mode 100644 chainfire/crates/chainfire-watch/src/stream.rs create mode 100644 chainfire/flake.lock create mode 100644 chainfire/flake.nix create mode 100644 chainfire/proto/chainfire.proto create mode 100644 chainfire/proto/internal.proto delete mode 160000 flaredb create mode 100644 flaredb/.gitignore create mode 100644 flaredb/.specify/memory/constitution.md create mode 100755 flaredb/.specify/scripts/bash/check-prerequisites.sh create mode 100755 flaredb/.specify/scripts/bash/common.sh create mode 100755 
flaredb/.specify/scripts/bash/create-new-feature.sh create mode 100755 flaredb/.specify/scripts/bash/setup-plan.sh create mode 100755 flaredb/.specify/scripts/bash/update-agent-context.sh create mode 100644 flaredb/.specify/templates/agent-file-template.md create mode 100644 flaredb/.specify/templates/checklist-template.md create mode 100644 flaredb/.specify/templates/plan-template.md create mode 100644 flaredb/.specify/templates/spec-template.md create mode 100644 flaredb/.specify/templates/tasks-template.md create mode 100644 flaredb/Cargo.toml create mode 100644 flaredb/advice.md create mode 100644 flaredb/chat.md create mode 100644 flaredb/crates/flaredb-cli/Cargo.toml create mode 100644 flaredb/crates/flaredb-cli/src/main.rs create mode 100644 flaredb/crates/flaredb-client/Cargo.toml create mode 100644 flaredb/crates/flaredb-client/src/client.rs create mode 100644 flaredb/crates/flaredb-client/src/lib.rs create mode 100644 flaredb/crates/flaredb-client/src/main.rs create mode 100644 flaredb/crates/flaredb-client/src/region_cache.rs create mode 100644 flaredb/crates/flaredb-client/tests/test_rpc_connect.rs create mode 100644 flaredb/crates/flaredb-pd/Cargo.toml create mode 100644 flaredb/crates/flaredb-pd/src/cluster.rs create mode 100644 flaredb/crates/flaredb-pd/src/main.rs create mode 100644 flaredb/crates/flaredb-pd/src/pd_service.rs create mode 100644 flaredb/crates/flaredb-pd/src/service.rs create mode 100644 flaredb/crates/flaredb-pd/src/tso.rs create mode 100644 flaredb/crates/flaredb-proto/Cargo.toml create mode 100644 flaredb/crates/flaredb-proto/build.rs create mode 100644 flaredb/crates/flaredb-proto/src/chainfire.proto create mode 100644 flaredb/crates/flaredb-proto/src/kvrpc.proto create mode 100644 flaredb/crates/flaredb-proto/src/lib.rs create mode 100644 flaredb/crates/flaredb-proto/src/pdpb.proto create mode 100644 flaredb/crates/flaredb-proto/src/raft_server.proto create mode 100644 flaredb/crates/flaredb-raft/Cargo.toml create mode 100644 
flaredb/crates/flaredb-raft/src/lib.rs create mode 100644 flaredb/crates/flaredb-raft/src/network.rs create mode 100644 flaredb/crates/flaredb-raft/src/persistent_storage.rs create mode 100644 flaredb/crates/flaredb-raft/src/raft_node.rs create mode 100644 flaredb/crates/flaredb-raft/src/storage.rs create mode 100644 flaredb/crates/flaredb-raft/src/types.rs create mode 100644 flaredb/crates/flaredb-server/Cargo.toml create mode 100644 flaredb/crates/flaredb-server/examples/test_cluster.rs create mode 100644 flaredb/crates/flaredb-server/src/config/mod.rs create mode 100644 flaredb/crates/flaredb-server/src/heartbeat.rs create mode 100644 flaredb/crates/flaredb-server/src/lib.rs create mode 100644 flaredb/crates/flaredb-server/src/main.rs create mode 100644 flaredb/crates/flaredb-server/src/merkle.rs create mode 100644 flaredb/crates/flaredb-server/src/pd_client.rs create mode 100644 flaredb/crates/flaredb-server/src/raft_service.rs create mode 100644 flaredb/crates/flaredb-server/src/service.rs create mode 100644 flaredb/crates/flaredb-server/src/store.rs create mode 100644 flaredb/crates/flaredb-server/tests/test_consistency.rs create mode 100644 flaredb/crates/flaredb-server/tests/test_multi_region.rs create mode 100644 flaredb/crates/flaredb-server/tests/test_split.rs create mode 100644 flaredb/crates/flaredb-server/tests/test_split_trigger.rs create mode 100644 flaredb/crates/flaredb-server/tests/test_strong_mode.rs create mode 100644 flaredb/crates/flaredb-storage/Cargo.toml create mode 100644 flaredb/crates/flaredb-storage/src/engine.rs create mode 100644 flaredb/crates/flaredb-storage/src/lib.rs create mode 100644 flaredb/crates/flaredb-storage/src/rocks_engine.rs create mode 100644 flaredb/crates/flaredb-types/Cargo.toml create mode 100644 flaredb/crates/flaredb-types/src/command.rs create mode 100644 flaredb/crates/flaredb-types/src/error.rs create mode 100644 flaredb/crates/flaredb-types/src/lib.rs create mode 100644 
flaredb/crates/flaredb-types/src/region.rs create mode 100644 flaredb/data/CURRENT create mode 100644 flaredb/data/IDENTITY create mode 100644 flaredb/data/LOCK create mode 100644 flaredb/data/LOG create mode 100644 flaredb/data/MANIFEST-000005 create mode 100644 flaredb/data/OPTIONS-000007 create mode 100644 flaredb/flake.lock create mode 100644 flaredb/flake.nix create mode 100644 flaredb/rustfmt.toml create mode 100755 flaredb/scripts/verify-core.sh create mode 100644 flaredb/scripts/verify-multiraft.sh create mode 100755 flaredb/scripts/verify-raft.sh create mode 100755 flaredb/scripts/verify-sharding.sh create mode 100644 flaredb/specs/001-distributed-core/checklists/requirements.md create mode 100644 flaredb/specs/001-distributed-core/contracts/kvrpc.proto create mode 100644 flaredb/specs/001-distributed-core/contracts/pdpb.proto create mode 100644 flaredb/specs/001-distributed-core/data-model.md create mode 100644 flaredb/specs/001-distributed-core/plan.md create mode 100644 flaredb/specs/001-distributed-core/quickstart.md create mode 100644 flaredb/specs/001-distributed-core/research.md create mode 100644 flaredb/specs/001-distributed-core/spec.md create mode 100644 flaredb/specs/001-distributed-core/tasks.md create mode 100644 flaredb/specs/001-multi-raft/spec.md create mode 100644 flaredb/specs/002-raft-features/checklists/requirements.md create mode 100644 flaredb/specs/002-raft-features/contracts/raft-service.md create mode 100644 flaredb/specs/002-raft-features/data-model.md create mode 100644 flaredb/specs/002-raft-features/plan.md create mode 100644 flaredb/specs/002-raft-features/quickstart.md create mode 100644 flaredb/specs/002-raft-features/research.md create mode 100644 flaredb/specs/002-raft-features/spec.md create mode 100644 flaredb/specs/002-raft-features/tasks.md create mode 100644 flaredb/specs/003-kvs-consistency/checklists/requirements.md create mode 100644 flaredb/specs/003-kvs-consistency/contracts/kv_cas.md create mode 100644 
flaredb/specs/003-kvs-consistency/contracts/kv_raw.md create mode 100644 flaredb/specs/003-kvs-consistency/contracts/raft_service.md create mode 100644 flaredb/specs/003-kvs-consistency/data-model.md create mode 100644 flaredb/specs/003-kvs-consistency/plan.md create mode 100644 flaredb/specs/003-kvs-consistency/quickstart.md create mode 100644 flaredb/specs/003-kvs-consistency/research.md create mode 100644 flaredb/specs/003-kvs-consistency/spec.md create mode 100644 flaredb/specs/003-kvs-consistency/tasks.md create mode 100644 flaredb/specs/004-multi-raft/checklists/requirements.md create mode 100644 flaredb/specs/004-multi-raft/contracts/pd.md create mode 100644 flaredb/specs/004-multi-raft/data-model.md create mode 100644 flaredb/specs/004-multi-raft/plan.md create mode 100644 flaredb/specs/004-multi-raft/quickstart.md create mode 100644 flaredb/specs/004-multi-raft/spec.md create mode 100644 flaredb/specs/004-multi-raft/tasks.md delete mode 160000 iam create mode 100644 iam/Cargo.toml create mode 100644 iam/config/iam.example.toml create mode 100644 iam/crates/iam-api/Cargo.toml create mode 100644 iam/crates/iam-api/build.rs create mode 100644 iam/crates/iam-api/src/conversions.rs create mode 100644 iam/crates/iam-api/src/generated/mod.rs create mode 100644 iam/crates/iam-api/src/iam_service.rs create mode 100644 iam/crates/iam-api/src/lib.rs create mode 100644 iam/crates/iam-api/src/token_service.rs create mode 100644 iam/crates/iam-api/tests/tenant_path_integration.rs create mode 100644 iam/crates/iam-audit/Cargo.toml create mode 100644 iam/crates/iam-audit/src/event.rs create mode 100644 iam/crates/iam-audit/src/lib.rs create mode 100644 iam/crates/iam-audit/src/logger.rs create mode 100644 iam/crates/iam-audit/src/sink.rs create mode 100644 iam/crates/iam-authn/Cargo.toml create mode 100644 iam/crates/iam-authn/src/jwt.rs create mode 100644 iam/crates/iam-authn/src/lib.rs create mode 100644 iam/crates/iam-authn/src/mtls.rs create mode 100644 
iam/crates/iam-authn/src/provider.rs create mode 100644 iam/crates/iam-authn/src/token.rs create mode 100644 iam/crates/iam-authz/Cargo.toml create mode 100644 iam/crates/iam-authz/src/cache.rs create mode 100644 iam/crates/iam-authz/src/condition.rs create mode 100644 iam/crates/iam-authz/src/context.rs create mode 100644 iam/crates/iam-authz/src/evaluator.rs create mode 100644 iam/crates/iam-authz/src/lib.rs create mode 100644 iam/crates/iam-client/Cargo.toml create mode 100644 iam/crates/iam-client/src/client.rs create mode 100644 iam/crates/iam-client/src/lib.rs create mode 100644 iam/crates/iam-server/Cargo.toml create mode 100644 iam/crates/iam-server/src/config.rs create mode 100644 iam/crates/iam-server/src/main.rs create mode 100644 iam/crates/iam-store/Cargo.toml create mode 100644 iam/crates/iam-store/src/backend.rs create mode 100644 iam/crates/iam-store/src/binding_store.rs create mode 100644 iam/crates/iam-store/src/group_store.rs create mode 100644 iam/crates/iam-store/src/lib.rs create mode 100644 iam/crates/iam-store/src/principal_store.rs create mode 100644 iam/crates/iam-store/src/role_store.rs create mode 100644 iam/crates/iam-store/src/token_store.rs create mode 100644 iam/crates/iam-types/Cargo.toml create mode 100644 iam/crates/iam-types/src/condition.rs create mode 100644 iam/crates/iam-types/src/error.rs create mode 100644 iam/crates/iam-types/src/lib.rs create mode 100644 iam/crates/iam-types/src/policy.rs create mode 100644 iam/crates/iam-types/src/principal.rs create mode 100644 iam/crates/iam-types/src/resource.rs create mode 100644 iam/crates/iam-types/src/role.rs create mode 100644 iam/crates/iam-types/src/scope.rs create mode 100644 iam/crates/iam-types/src/token.rs create mode 100644 iam/flake.lock create mode 100644 iam/flake.nix create mode 100644 iam/proto/iam.proto diff --git a/chainfire b/chainfire deleted file mode 160000 index 0d970d8..0000000 --- a/chainfire +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 
0d970d80331e2a0d74a6c806f3576095bd083923 diff --git a/chainfire/.gitignore b/chainfire/.gitignore new file mode 100644 index 0000000..9391cf5 --- /dev/null +++ b/chainfire/.gitignore @@ -0,0 +1,22 @@ +# Generated files +/target/ +**/*.rs.bk +Cargo.lock + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Test data +/tmp/ +*.db/ + +# Environment +.env +.env.local diff --git a/chainfire/Cargo.toml b/chainfire/Cargo.toml new file mode 100644 index 0000000..3647999 --- /dev/null +++ b/chainfire/Cargo.toml @@ -0,0 +1,89 @@ +[workspace] +resolver = "2" +members = [ + "crates/chainfire-proto", + "crates/chainfire-types", + "crates/chainfire-storage", + "crates/chainfire-raft", + "crates/chainfire-gossip", + "crates/chainfire-watch", + "crates/chainfire-api", + "crates/chainfire-core", + "crates/chainfire-server", + "chainfire-client", +] + +[workspace.package] +version = "0.1.0" +edition = "2021" +license = "MIT OR Apache-2.0" +rust-version = "1.75" +authors = ["Chainfire Contributors"] +repository = "https://github.com/chainfire/chainfire" + +[workspace.dependencies] +# Internal crates +chainfire-types = { path = "crates/chainfire-types" } +chainfire-storage = { path = "crates/chainfire-storage" } +chainfire-raft = { path = "crates/chainfire-raft" } +chainfire-gossip = { path = "crates/chainfire-gossip" } +chainfire-watch = { path = "crates/chainfire-watch" } +chainfire-api = { path = "crates/chainfire-api" } +chainfire-client = { path = "chainfire-client" } +chainfire-core = { path = "crates/chainfire-core" } +chainfire-server = { path = "crates/chainfire-server" } +chainfire-proto = { path = "crates/chainfire-proto" } + +# Async runtime +tokio = { version = "1.40", features = ["full"] } +tokio-stream = "0.1" +futures = "0.3" +async-trait = "0.1" + +# Raft +openraft = { version = "0.9", features = ["serde", "storage-v2"] } + +# Gossip (SWIM protocol) +foca = { version = "1.0", features = ["std", "tracing", "serde", "postcard-codec"] } + +# Storage 
+rocksdb = { version = "0.24", default-features = false, features = ["multi-threaded-cf", "zstd", "lz4", "snappy"] } + +# gRPC +tonic = "0.12" +tonic-build = "0.12" +tonic-health = "0.12" +prost = "0.13" +prost-types = "0.13" + +# Serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +bincode = "1.3" + +# Utilities +thiserror = "1.0" +anyhow = "1.0" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +bytes = "1.5" +parking_lot = "0.12" +dashmap = "6" + +# Metrics +metrics = "0.23" +metrics-exporter-prometheus = "0.15" + +# Configuration +toml = "0.8" +clap = { version = "4", features = ["derive"] } + +# Testing +tempfile = "3.10" +proptest = "1.4" + +[workspace.lints.rust] +unsafe_code = "deny" + +[workspace.lints.clippy] +all = "warn" diff --git a/chainfire/advice.md b/chainfire/advice.md new file mode 100644 index 0000000..7517167 --- /dev/null +++ b/chainfire/advice.md @@ -0,0 +1,87 @@ +RaftとGossipプロトコルを用いた、クラスター管理のための数万台までスケールするKey-Value Storeを書いてほしいです。 + +- プログラミング言語:rust +- テストをちゃんと書きながら書くことを推奨する。 +- クラスターへの参加/削除/障害検知を行う。 + +では、**「Raft(合意形成)」と「Gossip(情報の拡散)」を組み合わせた場合、具体的にどうデータが流れ、どうやってノードが動き出すのか**、その具体的なフローを解説します。 + +----- + +### 前提:このシステムの役割分担 + + * **Control Plane (CP):** Raftで構成された3〜7台(Raftアルゴリズムでうまく合意が取れる範囲)のサーバー。情報の「正規の持ち主」。いなくなったら自動でWorker Nodesから昇格する。 + * **Worker Nodes (VM/DB Hosts):** 数百〜数千台の実働部隊。CPのクライアント。 + +### 1\. データはどのように書き込まれるか? (Write) + +書き込みは **「必ず Control Plane の Raft Leader に対して」** 行います。Gossip経由での書き込みは(順序保証がないため)行いません。 + +例:「VM-A を Node-10 で起動したい」 + +1. **API Call:** 管理者(またはCLI)が、CPのAPIサーバーにリクエストを送ります。 +2. **Raft Log:** CPのリーダーは、この変更を `Put(Key="/nodes/node-10/tasks/vm-a", Value="START")` としてRaftログに追加します。 +3. **Commit:** 過半数のCPノードがログを保存したら「書き込み完了」と見なします。 + +ここまでは普通のDBと同じです。 + +### 2\. 各ノードはどのようにデータを取得し、通知を受けるか? (Read & Notify) + +ここが最大のポイントです。数千台のノードが「自分宛ての命令はないか?」と毎秒ポーリング(問い合わせ)すると、CPがDDoS攻撃を受けたようにパンクします。 + +ここで **「Watch(ロングポーリング)」** という仕組みを使います。 + +#### A. 
Watchによる通知と取得(これがメイン) + +Kubernetesやetcdが採用している方式です。 + +1. **接続維持:** Node-10 は起動時に CP に対して `Watch("/nodes/node-10/")` というリクエストを送ります。 +2. **待機:** CP は「Node-10 以下のキーに変更があるまで、レスポンスを返さずに接続を維持(ブロック)」します。 +3. **イベント発火:** 先ほどの書き込み(VM起動命令)が発生した瞬間、CP は待機していた Node-10 への接続を通じて「更新イベント(Event: PUT, Key: ...vm-a, Value: START)」を即座にプッシュします。 +4. **アクション:** Node-10 は通知を受け取り次第、VMを起動します。 + +**結論:** 「書き込み後の通知」は絶対に必要です。それを効率よくやるのが **Watch API** です。 + +----- + +### 3\. じゃあ Gossip はどこで使うのか? + +「Raft + Watch」で完結しそうに見えますが、10台以上のスケール、特にVM基盤のような動的な環境では **Gossip が以下の「3つの穴」を埋めるために不可欠** になります。 + +#### ① Nodeの死活監視・インベントリ管理(下り方向) + +CPが「Node-10にVMを立てたい」と思ったとき、「そもそもNode-10は生きているのか? IPは? 空きメモリは?」という情報を知る必要があります。 + + * **Gossipの役割:** 各Worker Nodeは、Gossip(SWIMプロトコル)でお互いに、そしてCPに対して「自分は生きてるよ、IPはこれだよ」と喋り続けます。 + * CPはこのGossip情報を聞いて、最新の「ノード一覧リスト(Memberlist)」をメモリ上に保持します。 + +#### ② サービスのディスカバリ(横方向) + +DB基盤の場合、「DBノードA」が「DBノードB」と通信したいことがあります。いちいちCPに聞きに行くと遅いです。 + + * **Gossipの役割:** ノード同士で「私はここにいるよ」とGossipし合うことで、CPを介さずに直接通信相手を見つけられます。 + +#### ③ "Actual State"(現状)の報告(上り方向) + +VMが起動した後、「起動しました」というステータスをどうCPに伝えるか。 + + * **Raftに書く:** 確実ですが、頻繁に変わるステータス(CPU使用率など)を全部Raftに書くとCPがパンクします。 + * **Gossipで流す:** 「VM-Aは起動中、負荷50%」といった情報はGossipに乗せて、**「結果的にCPに伝わればいい」** という扱いにします。 + +----- + +### 設計のまとめ:RaftとGossipの使い分け + +これから作る「汎用管理DB(外部依存型)」は、以下のようなインターフェースを持つことになるでしょう。 + +| アクション | 通信方式 | 具体的なデータ例 | +| :--- | :--- | :--- | +| **命令 (Desired State)**
「こうなってほしい」 | **Raft + Watch**
(強整合性) | ・VMの起動/停止命令
・DBのデータ配置情報の変更
・パスワードや設定変更 | +| **現状 (Actual State)**
「今こうなってます」 | **Gossip**
(結果整合性) | ・ノードの生存確認 (Heartbeat)
・リソース使用率 (CPU/Mem)
・「VM起動完了」などのステータス | +| **通知 (Notification)** | **Watch (HTTP/gRPC Stream)** | ・「新しい命令が来たぞ!」というトリガー | + +#### 実装のアドバイス + +もし「etcdのようなもの」を自作されるなら、**「Serf (Gossip)」と「Raft」をライブラリとして組み込み、その上に「gRPCによるWatch付きのKVS API」を被せる** という構成になります。 + +これができれば、VM基盤は「Watchして、VMを起動して、Gossipでステータスを返すエージェント」を作るだけで済みますし、DB基盤も同様に作れます。非常にスケーラブルで美しい設計です。 diff --git a/chainfire/chainfire-client/Cargo.toml b/chainfire/chainfire-client/Cargo.toml new file mode 100644 index 0000000..ba9dd90 --- /dev/null +++ b/chainfire/chainfire-client/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "chainfire-client" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "Chainfire distributed KVS client library" + +[dependencies] +chainfire-types = { workspace = true } +chainfire-proto = { workspace = true } + +# gRPC +tonic = { workspace = true } + +# Async +tokio = { workspace = true } +tokio-stream = { workspace = true } +futures = { workspace = true } + +# Utilities +tracing = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } + +[lints] +workspace = true diff --git a/chainfire/chainfire-client/src/client.rs b/chainfire/chainfire-client/src/client.rs new file mode 100644 index 0000000..11bb272 --- /dev/null +++ b/chainfire/chainfire-client/src/client.rs @@ -0,0 +1,389 @@ +//! 
Chainfire client implementation + +use crate::error::{ClientError, Result}; +use crate::watch::WatchHandle; +use chainfire_proto::proto::{ + cluster_client::ClusterClient, + compare, + kv_client::KvClient, + request_op, + response_op, + watch_client::WatchClient, + Compare, + DeleteRangeRequest, + PutRequest, + RangeRequest, + RequestOp, + StatusRequest, + TxnRequest, +}; +use tonic::transport::Channel; +use tracing::debug; + +/// Chainfire client +pub struct Client { + /// gRPC channel + channel: Channel, + /// KV client + kv: KvClient, + /// Cluster client + cluster: ClusterClient, +} + +impl Client { + /// Connect to a Chainfire server + pub async fn connect(addr: impl AsRef) -> Result { + let addr = addr.as_ref().to_string(); + debug!(addr = %addr, "Connecting to Chainfire"); + + let channel = Channel::from_shared(addr) + .map_err(|e| ClientError::Connection(e.to_string()))? + .connect() + .await?; + + let kv = KvClient::new(channel.clone()); + let cluster = ClusterClient::new(channel.clone()); + + Ok(Self { + channel, + kv, + cluster, + }) + } + + /// Put a key-value pair + pub async fn put(&mut self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result { + let resp = self + .kv + .put(PutRequest { + key: key.as_ref().to_vec(), + value: value.as_ref().to_vec(), + lease: 0, + prev_kv: false, + }) + .await? + .into_inner(); + + Ok(resp.header.map(|h| h.revision as u64).unwrap_or(0)) + } + + /// Put a key-value pair with string values + pub async fn put_str(&mut self, key: &str, value: &str) -> Result { + self.put(key.as_bytes(), value.as_bytes()).await + } + + /// Get a value by key + pub async fn get(&mut self, key: impl AsRef<[u8]>) -> Result>> { + Ok(self + .get_with_revision(key) + .await? 
+ .map(|(value, _)| value)) + } + + /// Get a value by key along with its current revision + pub async fn get_with_revision( + &mut self, + key: impl AsRef<[u8]>, + ) -> Result, u64)>> { + let resp = self + .kv + .range(RangeRequest { + key: key.as_ref().to_vec(), + range_end: vec![], + limit: 1, + revision: 0, + keys_only: false, + count_only: false, + serializable: false, // default: linearizable read + }) + .await? + .into_inner(); + + Ok(resp.kvs.into_iter().next().map(|kv| (kv.value, kv.mod_revision as u64))) + } + + /// Get a value as string + pub async fn get_str(&mut self, key: &str) -> Result> { + let value = self.get(key.as_bytes()).await?; + Ok(value.map(|v| String::from_utf8_lossy(&v).to_string())) + } + + /// Delete a key + pub async fn delete(&mut self, key: impl AsRef<[u8]>) -> Result { + let resp = self + .kv + .delete(DeleteRangeRequest { + key: key.as_ref().to_vec(), + range_end: vec![], + prev_kv: false, + }) + .await? + .into_inner(); + + Ok(resp.deleted > 0) + } + + /// Get all keys with a prefix + pub async fn get_prefix(&mut self, prefix: impl AsRef<[u8]>) -> Result, Vec)>> { + let prefix = prefix.as_ref(); + let range_end = prefix_end(prefix); + + let resp = self + .kv + .range(RangeRequest { + key: prefix.to_vec(), + range_end, + limit: 0, + revision: 0, + keys_only: false, + count_only: false, + serializable: false, + }) + .await? + .into_inner(); + + Ok(resp.kvs.into_iter().map(|kv| (kv.key, kv.value)).collect()) + } + + /// Scan a prefix returning keys, values, and revisions + pub async fn scan_prefix( + &mut self, + prefix: impl AsRef<[u8]>, + limit: i64, + ) -> Result<(Vec<(Vec, Vec, u64)>, Option>)> { + let prefix = prefix.as_ref(); + let range_end = prefix_end(prefix); + + let resp = self + .kv + .range(RangeRequest { + key: prefix.to_vec(), + range_end, + limit, + revision: 0, + keys_only: false, + count_only: false, + serializable: false, + }) + .await? 
+ .into_inner(); + + let more = resp.more; + let mut kvs: Vec<(Vec, Vec, u64)> = resp + .kvs + .into_iter() + .map(|kv| (kv.key, kv.value, kv.mod_revision as u64)) + .collect(); + let next_key = if more { + kvs.last() + .map(|(k, _, _)| { + let mut nk = k.clone(); + nk.push(0); + nk + }) + } else { + None + }; + + Ok((kvs, next_key)) + } + + /// Scan an arbitrary range [start, end) + pub async fn scan_range( + &mut self, + start: impl AsRef<[u8]>, + end: impl AsRef<[u8]>, + limit: i64, + ) -> Result<(Vec<(Vec, Vec, u64)>, Option>)> { + let resp = self + .kv + .range(RangeRequest { + key: start.as_ref().to_vec(), + range_end: end.as_ref().to_vec(), + limit, + revision: 0, + keys_only: false, + count_only: false, + serializable: false, + }) + .await? + .into_inner(); + + let more = resp.more; + let mut kvs: Vec<(Vec, Vec, u64)> = resp + .kvs + .into_iter() + .map(|kv| (kv.key, kv.value, kv.mod_revision as u64)) + .collect(); + let next_key = if more { + kvs.last() + .map(|(k, _, _)| { + let mut nk = k.clone(); + nk.push(0); + nk + }) + } else { + None + }; + + Ok((kvs, next_key)) + } + + /// Compare-and-swap based on key version + pub async fn compare_and_swap( + &mut self, + key: impl AsRef<[u8]>, + expected_version: u64, + value: impl AsRef<[u8]>, + ) -> Result { + let key_bytes = key.as_ref().to_vec(); + let put_op = RequestOp { + request: Some(request_op::Request::RequestPut(PutRequest { + key: key_bytes.clone(), + value: value.as_ref().to_vec(), + lease: 0, + prev_kv: false, + })), + }; + + // Fetch current value on failure to surface the actual version + let read_on_fail = RequestOp { + request: Some(request_op::Request::RequestRange(RangeRequest { + key: key_bytes.clone(), + range_end: vec![], + limit: 1, + revision: 0, + keys_only: false, + count_only: false, + serializable: true, // within txn, use serializable read + })), + }; + + let compare = Compare { + result: compare::CompareResult::Equal as i32, + target: compare::CompareTarget::Version as i32, + key: 
key_bytes.clone(), + target_union: Some(compare::TargetUnion::Version(expected_version as i64)), + }; + + let resp = self + .kv + .txn(TxnRequest { + compare: vec![compare], + success: vec![put_op], + failure: vec![read_on_fail], + }) + .await? + .into_inner(); + + if resp.succeeded { + let new_version = resp + .header + .as_ref() + .map(|h| h.revision as u64) + .unwrap_or(0); + return Ok(CasOutcome { + success: true, + current_version: new_version, + new_version, + }); + } + + // On failure try to extract the current version from the range response + let current_version = resp + .responses + .into_iter() + .filter_map(|op| match op.response { + Some(response_op::Response::ResponseRange(r)) => r + .kvs + .into_iter() + .next() + .map(|kv| kv.mod_revision as u64), + _ => None, + }) + .next() + .unwrap_or(0); + + Ok(CasOutcome { + success: false, + current_version, + new_version: 0, + }) + } + + /// Watch a key or prefix for changes + pub async fn watch(&mut self, key: impl AsRef<[u8]>) -> Result { + let key = key.as_ref().to_vec(); + let watch_client = WatchClient::new(self.channel.clone()); + WatchHandle::new(watch_client, key, None).await + } + + /// Watch all keys with a prefix + pub async fn watch_prefix(&mut self, prefix: impl AsRef<[u8]>) -> Result { + let prefix = prefix.as_ref().to_vec(); + let range_end = prefix_end(&prefix); + let watch_client = WatchClient::new(self.channel.clone()); + WatchHandle::new(watch_client, prefix, Some(range_end)).await + } + + /// Get cluster status + pub async fn status(&mut self) -> Result { + let resp = self + .cluster + .status(StatusRequest {}) + .await? 
+ .into_inner(); + + Ok(ClusterStatus { + version: resp.version, + leader: resp.leader, + raft_term: resp.raft_term, + }) + } +} + +/// Cluster status +#[derive(Debug, Clone)] +pub struct ClusterStatus { + /// Server version + pub version: String, + /// Current leader ID + pub leader: u64, + /// Current Raft term + pub raft_term: u64, +} + +/// CAS outcome returned by compare_and_swap +#[derive(Debug, Clone)] +pub struct CasOutcome { + /// Whether CAS succeeded + pub success: bool, + /// Observed/current version + pub current_version: u64, + /// New version when succeeded + pub new_version: u64, +} + +/// Calculate prefix end for range queries +fn prefix_end(prefix: &[u8]) -> Vec { + let mut end = prefix.to_vec(); + for i in (0..end.len()).rev() { + if end[i] < 0xff { + end[i] += 1; + end.truncate(i + 1); + return end; + } + } + vec![] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_prefix_end() { + assert_eq!(prefix_end(b"abc"), b"abd"); + assert_eq!(prefix_end(b"/nodes/"), b"/nodes0"); + } +} diff --git a/chainfire/chainfire-client/src/error.rs b/chainfire/chainfire-client/src/error.rs new file mode 100644 index 0000000..9329c3c --- /dev/null +++ b/chainfire/chainfire-client/src/error.rs @@ -0,0 +1,34 @@ +//! 
Client error types + +use thiserror::Error; + +/// Result type for client operations +pub type Result = std::result::Result; + +/// Client error +#[derive(Error, Debug)] +pub enum ClientError { + /// Connection error + #[error("Connection error: {0}")] + Connection(String), + + /// RPC error + #[error("RPC error: {0}")] + Rpc(#[from] tonic::Status), + + /// Transport error + #[error("Transport error: {0}")] + Transport(#[from] tonic::transport::Error), + + /// Key not found + #[error("Key not found: {0}")] + KeyNotFound(String), + + /// Watch error + #[error("Watch error: {0}")] + Watch(String), + + /// Internal error + #[error("Internal error: {0}")] + Internal(String), +} diff --git a/chainfire/chainfire-client/src/lib.rs b/chainfire/chainfire-client/src/lib.rs new file mode 100644 index 0000000..ea39197 --- /dev/null +++ b/chainfire/chainfire-client/src/lib.rs @@ -0,0 +1,34 @@ +//! Chainfire distributed KVS client library +//! +//! This crate provides a client for interacting with Chainfire clusters. +//! +//! # Example +//! +//! ```no_run +//! use chainfire_client::Client; +//! +//! #[tokio::main] +//! async fn main() -> Result<(), Box> { +//! let mut client = Client::connect("http://127.0.0.1:2379").await?; +//! +//! // Put a value +//! client.put("/my/key", "my value").await?; +//! +//! // Get a value +//! if let Some(value) = client.get("/my/key").await? { +//! println!("Got: {}", String::from_utf8_lossy(&value)); +//! } +//! +//! Ok(()) +//! } +//! ``` + +mod client; +mod error; +pub mod node; +mod watch; + +pub use client::{CasOutcome, Client}; +pub use error::{ClientError, Result}; +pub use node::{NodeCapacity, NodeFilter, NodeMetadata}; +pub use watch::WatchHandle; diff --git a/chainfire/chainfire-client/src/node.rs b/chainfire/chainfire-client/src/node.rs new file mode 100644 index 0000000..6a2f7be --- /dev/null +++ b/chainfire/chainfire-client/src/node.rs @@ -0,0 +1,333 @@ +//! Node metadata helpers for Chainfire KVS +//! +//! 
This module provides helpers for storing and retrieving node metadata +//! in the Chainfire distributed KVS. +//! +//! # KVS Key Schema +//! +//! Node metadata is stored with the following key structure: +//! - `/nodes//info` - JSON-encoded NodeMetadata +//! - `/nodes//roles` - JSON-encoded roles (raft_role, gossip_role) +//! - `/nodes//capacity/cpu` - CPU cores (u32) +//! - `/nodes//capacity/memory_gb` - Memory in GB (u32) +//! - `/nodes//labels/` - Custom labels (string) +//! - `/nodes//api_addr` - API address (string) + +use crate::error::Result; +use crate::Client; +use chainfire_types::node::NodeRole; +use chainfire_types::RaftRole; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Node metadata stored in KVS +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NodeMetadata { + /// Unique node identifier + pub id: u64, + /// Human-readable node name + pub name: String, + /// Raft participation role + pub raft_role: RaftRole, + /// Gossip/cluster role + pub gossip_role: NodeRole, + /// API address for client connections + pub api_addr: String, + /// Raft address for inter-node communication (optional for workers) + pub raft_addr: Option, + /// Gossip address for membership protocol + pub gossip_addr: String, + /// Node capacity information + pub capacity: NodeCapacity, + /// Custom labels for node selection + pub labels: HashMap, +} + +/// Node capacity information +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct NodeCapacity { + /// Number of CPU cores + pub cpu_cores: u32, + /// Memory in gigabytes + pub memory_gb: u32, + /// Disk space in gigabytes (optional) + pub disk_gb: Option, +} + +/// Filter for listing nodes +#[derive(Debug, Clone, Default)] +pub struct NodeFilter { + /// Filter by Raft role + pub raft_role: Option, + /// Filter by gossip role + pub gossip_role: Option, + /// Filter by labels (all must match) + pub labels: HashMap, +} + +impl NodeMetadata { + /// Create a new NodeMetadata for a 
control-plane node + pub fn control_plane( + id: u64, + name: impl Into, + api_addr: impl Into, + raft_addr: impl Into, + gossip_addr: impl Into, + ) -> Self { + Self { + id, + name: name.into(), + raft_role: RaftRole::Voter, + gossip_role: NodeRole::ControlPlane, + api_addr: api_addr.into(), + raft_addr: Some(raft_addr.into()), + gossip_addr: gossip_addr.into(), + capacity: NodeCapacity::default(), + labels: HashMap::new(), + } + } + + /// Create a new NodeMetadata for a worker node + pub fn worker( + id: u64, + name: impl Into, + api_addr: impl Into, + gossip_addr: impl Into, + ) -> Self { + Self { + id, + name: name.into(), + raft_role: RaftRole::None, + gossip_role: NodeRole::Worker, + api_addr: api_addr.into(), + raft_addr: None, + gossip_addr: gossip_addr.into(), + capacity: NodeCapacity::default(), + labels: HashMap::new(), + } + } + + /// Set capacity information + pub fn with_capacity(mut self, cpu_cores: u32, memory_gb: u32) -> Self { + self.capacity.cpu_cores = cpu_cores; + self.capacity.memory_gb = memory_gb; + self + } + + /// Add a label + pub fn with_label(mut self, key: impl Into, value: impl Into) -> Self { + self.labels.insert(key.into(), value.into()); + self + } +} + +/// Key prefix for all node metadata +const NODE_PREFIX: &str = "/nodes/"; + +/// Generate the key for node info +fn node_info_key(id: u64) -> String { + format!("{}{}/info", NODE_PREFIX, id) +} + +/// Generate the key for a node label +fn node_label_key(id: u64, label: &str) -> String { + format!("{}{}/labels/{}", NODE_PREFIX, id, label) +} + +/// Register a node in the cluster by storing its metadata in KVS +/// +/// # Arguments +/// +/// * `client` - The Chainfire client +/// * `meta` - Node metadata to register +/// +/// # Returns +/// +/// The revision number of the write operation +pub async fn register_node(client: &mut Client, meta: &NodeMetadata) -> Result { + let key = node_info_key(meta.id); + let value = serde_json::to_string(meta) + .map_err(|e| 
crate::error::ClientError::Internal(e.to_string()))?; + + client.put_str(&key, &value).await +} + +/// Update a specific node attribute +pub async fn update_node_label( + client: &mut Client, + node_id: u64, + label: &str, + value: &str, +) -> Result { + let key = node_label_key(node_id, label); + client.put_str(&key, value).await +} + +/// Get a node's metadata by ID +/// +/// # Arguments +/// +/// * `client` - The Chainfire client +/// * `node_id` - The node ID to look up +/// +/// # Returns +/// +/// The node metadata if found, None otherwise +pub async fn get_node(client: &mut Client, node_id: u64) -> Result> { + let key = node_info_key(node_id); + let value = client.get_str(&key).await?; + + match value { + Some(json) => { + let meta: NodeMetadata = serde_json::from_str(&json) + .map_err(|e| crate::error::ClientError::Internal(e.to_string()))?; + Ok(Some(meta)) + } + None => Ok(None), + } +} + +/// List all registered nodes +/// +/// # Arguments +/// +/// * `client` - The Chainfire client +/// * `filter` - Optional filter criteria +/// +/// # Returns +/// +/// A list of node metadata matching the filter +pub async fn list_nodes(client: &mut Client, filter: &NodeFilter) -> Result> { + let prefix = format!("{}", NODE_PREFIX); + let entries = client.get_prefix(&prefix).await?; + + let mut nodes = Vec::new(); + + for (key, value) in entries { + let key_str = String::from_utf8_lossy(&key); + + // Only process /nodes//info keys + if !key_str.ends_with("/info") { + continue; + } + + let json = String::from_utf8_lossy(&value); + if let Ok(meta) = serde_json::from_str::(&json) { + // Apply filters + if let Some(ref raft_role) = filter.raft_role { + if meta.raft_role != *raft_role { + continue; + } + } + + if let Some(ref gossip_role) = filter.gossip_role { + if meta.gossip_role != *gossip_role { + continue; + } + } + + // Check label filters + let mut labels_match = true; + for (k, v) in &filter.labels { + match meta.labels.get(k) { + Some(node_v) if node_v == v => {} 
+ _ => { + labels_match = false; + break; + } + } + } + + if labels_match { + nodes.push(meta); + } + } + } + + // Sort by node ID for consistent ordering + nodes.sort_by_key(|n| n.id); + + Ok(nodes) +} + +/// Unregister a node from the cluster +/// +/// # Arguments +/// +/// * `client` - The Chainfire client +/// * `node_id` - The node ID to unregister +/// +/// # Returns +/// +/// True if the node was found and deleted +pub async fn unregister_node(client: &mut Client, node_id: u64) -> Result { + let key = node_info_key(node_id); + client.delete(&key).await +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_node_info_key() { + assert_eq!(node_info_key(1), "/nodes/1/info"); + assert_eq!(node_info_key(123), "/nodes/123/info"); + } + + #[test] + fn test_node_label_key() { + assert_eq!(node_label_key(1, "zone"), "/nodes/1/labels/zone"); + } + + #[test] + fn test_control_plane_metadata() { + let meta = NodeMetadata::control_plane( + 1, + "cp-1", + "127.0.0.1:2379", + "127.0.0.1:2380", + "127.0.0.1:2381", + ); + + assert_eq!(meta.id, 1); + assert_eq!(meta.raft_role, RaftRole::Voter); + assert_eq!(meta.gossip_role, NodeRole::ControlPlane); + assert!(meta.raft_addr.is_some()); + } + + #[test] + fn test_worker_metadata() { + let meta = NodeMetadata::worker(100, "worker-1", "127.0.0.1:3379", "127.0.0.1:3381"); + + assert_eq!(meta.id, 100); + assert_eq!(meta.raft_role, RaftRole::None); + assert_eq!(meta.gossip_role, NodeRole::Worker); + assert!(meta.raft_addr.is_none()); + } + + #[test] + fn test_metadata_with_capacity() { + let meta = NodeMetadata::worker(1, "worker", "addr", "gossip") + .with_capacity(8, 32) + .with_label("zone", "us-west-1"); + + assert_eq!(meta.capacity.cpu_cores, 8); + assert_eq!(meta.capacity.memory_gb, 32); + assert_eq!(meta.labels.get("zone"), Some(&"us-west-1".to_string())); + } + + #[test] + fn test_metadata_serialization() { + let meta = NodeMetadata::control_plane(1, "test", "api", "raft", "gossip") + .with_capacity(4, 16) + 
.with_label("env", "prod"); + + let json = serde_json::to_string(&meta).unwrap(); + let deserialized: NodeMetadata = serde_json::from_str(&json).unwrap(); + + assert_eq!(meta.id, deserialized.id); + assert_eq!(meta.raft_role, deserialized.raft_role); + assert_eq!(meta.capacity.cpu_cores, deserialized.capacity.cpu_cores); + } +} diff --git a/chainfire/chainfire-client/src/watch.rs b/chainfire/chainfire-client/src/watch.rs new file mode 100644 index 0000000..efc2b26 --- /dev/null +++ b/chainfire/chainfire-client/src/watch.rs @@ -0,0 +1,143 @@ +//! Watch functionality + +use crate::error::{ClientError, Result}; +use chainfire_proto::proto::{ + watch_client::WatchClient, watch_request, Event, WatchCreateRequest, WatchRequest, +}; +use futures::StreamExt; +use tokio::sync::mpsc; +use tonic::transport::Channel; +use tracing::{debug, warn}; + +/// Event received from a watch +#[derive(Debug, Clone)] +pub struct WatchEvent { + /// Event type (Put or Delete) + pub event_type: EventType, + /// Key that changed + pub key: Vec, + /// New value (for Put events) + pub value: Vec, + /// Revision of the change + pub revision: u64, +} + +/// Type of watch event +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum EventType { + Put, + Delete, +} + +/// Handle to a watch stream +pub struct WatchHandle { + /// Watch ID + watch_id: i64, + /// Event receiver + rx: mpsc::Receiver, +} + +impl WatchHandle { + /// Create a new watch + pub(crate) async fn new( + mut client: WatchClient, + key: Vec, + range_end: Option>, + ) -> Result { + let (tx, rx) = mpsc::channel(64); + let (req_tx, req_rx) = mpsc::channel(16); + + // Send initial create request + let create_req = WatchRequest { + request_union: Some(watch_request::RequestUnion::CreateRequest( + WatchCreateRequest { + key, + range_end: range_end.unwrap_or_default(), + start_revision: 0, + progress_notify: false, + prev_kv: false, + watch_id: 0, + }, + )), + }; + + req_tx + .send(create_req) + .await + .map_err(|_| 
ClientError::Watch("Failed to send create request".into()))?; + + // Create bidirectional stream + let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx); + let mut resp_stream = client.watch(req_stream).await?.into_inner(); + + // Wait for creation confirmation + let first_resp = resp_stream + .next() + .await + .ok_or_else(|| ClientError::Watch("No response from server".into()))? + .map_err(ClientError::Rpc)?; + + if !first_resp.created { + return Err(ClientError::Watch("Watch creation failed".into())); + } + + let watch_id = first_resp.watch_id; + debug!(watch_id, "Watch created"); + + // Spawn task to process events + tokio::spawn(async move { + while let Some(result) = resp_stream.next().await { + match result { + Ok(resp) => { + if resp.canceled { + debug!(watch_id = resp.watch_id, "Watch canceled"); + break; + } + + for event in resp.events { + let watch_event = convert_event(event); + if tx.send(watch_event).await.is_err() { + break; + } + } + } + Err(e) => { + warn!(error = %e, "Watch stream error"); + break; + } + } + } + }); + + Ok(Self { watch_id, rx }) + } + + /// Get the watch ID + pub fn id(&self) -> i64 { + self.watch_id + } + + /// Receive the next event + pub async fn recv(&mut self) -> Option { + self.rx.recv().await + } +} + +fn convert_event(event: Event) -> WatchEvent { + let event_type = if event.r#type == 0 { + EventType::Put + } else { + EventType::Delete + }; + + let (key, value, revision) = event.kv.map(|kv| { + (kv.key, kv.value, kv.mod_revision as u64) + }).unwrap_or_default(); + + WatchEvent { + event_type, + key, + value, + revision, + } +} diff --git a/chainfire/crates/chainfire-api/Cargo.toml b/chainfire/crates/chainfire-api/Cargo.toml new file mode 100644 index 0000000..b4856c5 --- /dev/null +++ b/chainfire/crates/chainfire-api/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "chainfire-api" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "gRPC 
API layer for Chainfire distributed KVS" + +[dependencies] +chainfire-types = { workspace = true } +chainfire-storage = { workspace = true } +chainfire-raft = { workspace = true } +chainfire-watch = { workspace = true } + +# gRPC +tonic = { workspace = true } +prost = { workspace = true } +prost-types = { workspace = true } + +# Async +tokio = { workspace = true } +tokio-stream = { workspace = true } +futures = { workspace = true } +async-trait = { workspace = true } + +# Raft +openraft = { workspace = true } + +# Serialization +bincode = { workspace = true } + +# Utilities +tracing = { workspace = true } + +[build-dependencies] +tonic-build = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-api/build.rs b/chainfire/crates/chainfire-api/build.rs new file mode 100644 index 0000000..1b77c9b --- /dev/null +++ b/chainfire/crates/chainfire-api/build.rs @@ -0,0 +1,19 @@ +fn main() -> Result<(), Box> { + // Compile the protobuf files to OUT_DIR (default location for include_proto!) + tonic_build::configure() + .build_server(true) + .build_client(true) + .compile_protos( + &[ + "../../proto/chainfire.proto", + "../../proto/internal.proto", + ], + &["../../proto"], + )?; + + // Tell cargo to rerun if proto files change + println!("cargo:rerun-if-changed=../../proto/chainfire.proto"); + println!("cargo:rerun-if-changed=../../proto/internal.proto"); + + Ok(()) +} diff --git a/chainfire/crates/chainfire-api/src/cluster_service.rs b/chainfire/crates/chainfire-api/src/cluster_service.rs new file mode 100644 index 0000000..9abf28e --- /dev/null +++ b/chainfire/crates/chainfire-api/src/cluster_service.rs @@ -0,0 +1,216 @@ +//! Cluster management service implementation +//! +//! This service handles cluster membership operations including adding, +//! removing, and listing members. 
+ +use crate::conversions::make_header; +use crate::proto::{ + cluster_server::Cluster, Member, MemberAddRequest, MemberAddResponse, MemberListRequest, + MemberListResponse, MemberRemoveRequest, MemberRemoveResponse, StatusRequest, StatusResponse, +}; +use chainfire_raft::RaftNode; +use openraft::BasicNode; +use std::collections::hash_map::DefaultHasher; +use std::collections::BTreeMap; +use std::hash::{Hash, Hasher}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; +use tonic::{Request, Response, Status}; +use tracing::{debug, info, warn}; + +/// Generate a unique member ID based on timestamp and counter +fn generate_member_id() -> u64 { + static COUNTER: AtomicU64 = AtomicU64::new(0); + let counter = COUNTER.fetch_add(1, Ordering::Relaxed); + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + + let mut hasher = DefaultHasher::new(); + (timestamp, counter, std::process::id()).hash(&mut hasher); + hasher.finish() +} + +/// Cluster service implementation +pub struct ClusterServiceImpl { + /// Raft node + raft: Arc, + /// Cluster ID + cluster_id: u64, + /// Server version + version: String, +} + +impl ClusterServiceImpl { + /// Create a new cluster service + pub fn new(raft: Arc, cluster_id: u64) -> Self { + Self { + raft, + cluster_id, + version: env!("CARGO_PKG_VERSION").to_string(), + } + } + + fn make_header(&self, revision: u64) -> crate::proto::ResponseHeader { + make_header(self.cluster_id, self.raft.id(), revision, 0) + } + + /// Get current members as proto Member list + async fn get_member_list(&self) -> Vec { + self.raft + .membership() + .await + .iter() + .map(|&id| Member { + id, + name: format!("node-{}", id), + peer_urls: vec![], + client_urls: vec![], + is_learner: false, + }) + .collect() + } +} + +#[tonic::async_trait] +impl Cluster for ClusterServiceImpl { + async fn member_add( + &self, + request: Request, + ) -> Result, 
Status> { + let req = request.into_inner(); + debug!(peer_urls = ?req.peer_urls, is_learner = req.is_learner, "Member add request"); + + // Generate new member ID + let member_id = generate_member_id(); + + // Create BasicNode for the new member + let node = BasicNode::default(); + + // Add as learner first (safer for cluster stability) + match self.raft.add_learner(member_id, node, true).await { + Ok(()) => { + info!(member_id, "Added learner node"); + + // If not explicitly a learner, promote to voter + if !req.is_learner { + // Get current membership and add new member + let mut members: BTreeMap = self + .raft + .membership() + .await + .iter() + .map(|&id| (id, BasicNode::default())) + .collect(); + members.insert(member_id, BasicNode::default()); + + if let Err(e) = self.raft.change_membership(members, false).await { + warn!(error = %e, member_id, "Failed to promote learner to voter"); + // Still return success for the learner add + } else { + info!(member_id, "Promoted learner to voter"); + } + } + + let new_member = Member { + id: member_id, + name: String::new(), + peer_urls: req.peer_urls, + client_urls: vec![], + is_learner: req.is_learner, + }; + + Ok(Response::new(MemberAddResponse { + header: Some(self.make_header(0)), + member: Some(new_member), + members: self.get_member_list().await, + })) + } + Err(e) => { + warn!(error = %e, "Failed to add member"); + Err(Status::internal(format!("Failed to add member: {}", e))) + } + } + } + + async fn member_remove( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + debug!(member_id = req.id, "Member remove request"); + + // Get current membership and remove the member + let mut members: BTreeMap = self + .raft + .membership() + .await + .iter() + .map(|&id| (id, BasicNode::default())) + .collect(); + + if !members.contains_key(&req.id) { + return Err(Status::not_found(format!( + "Member {} not found in cluster", + req.id + ))); + } + + members.remove(&req.id); + + match 
self.raft.change_membership(members, false).await { + Ok(()) => { + info!(member_id = req.id, "Removed member from cluster"); + Ok(Response::new(MemberRemoveResponse { + header: Some(self.make_header(0)), + members: self.get_member_list().await, + })) + } + Err(e) => { + warn!(error = %e, member_id = req.id, "Failed to remove member"); + Err(Status::internal(format!("Failed to remove member: {}", e))) + } + } + } + + async fn member_list( + &self, + _request: Request, + ) -> Result, Status> { + debug!("Member list request"); + + Ok(Response::new(MemberListResponse { + header: Some(self.make_header(0)), + members: self.get_member_list().await, + })) + } + + async fn status( + &self, + _request: Request, + ) -> Result, Status> { + debug!("Status request"); + + let leader = self.raft.leader().await; + let term = self.raft.current_term().await; + let is_leader = self.raft.is_leader().await; + + // Get storage info from Raft node + let storage = self.raft.storage(); + let storage_guard = storage.read().await; + let sm = storage_guard.state_machine().read().await; + let revision = sm.current_revision(); + + Ok(Response::new(StatusResponse { + header: Some(self.make_header(revision)), + version: self.version.clone(), + db_size: 0, // TODO: get actual RocksDB size + leader: leader.unwrap_or(0), + raft_index: revision, + raft_term: term, + raft_applied_index: revision, + })) + } +} diff --git a/chainfire/crates/chainfire-api/src/conversions.rs b/chainfire/crates/chainfire-api/src/conversions.rs new file mode 100644 index 0000000..20a67d1 --- /dev/null +++ b/chainfire/crates/chainfire-api/src/conversions.rs @@ -0,0 +1,113 @@ +//! 
Conversions between protobuf types and internal types + +use crate::proto; +use chainfire_types::kv::KvEntry; +use chainfire_types::watch::{WatchEvent, WatchEventType, WatchRequest as InternalWatchRequest}; +use chainfire_types::Revision; + +/// Convert internal KvEntry to proto KeyValue +impl From for proto::KeyValue { + fn from(entry: KvEntry) -> Self { + Self { + key: entry.key, + value: entry.value, + create_revision: entry.create_revision as i64, + mod_revision: entry.mod_revision as i64, + version: entry.version as i64, + lease: entry.lease_id.unwrap_or(0), + } + } +} + +/// Convert proto KeyValue to internal KvEntry +impl From for KvEntry { + fn from(kv: proto::KeyValue) -> Self { + Self { + key: kv.key, + value: kv.value, + create_revision: kv.create_revision as u64, + mod_revision: kv.mod_revision as u64, + version: kv.version as u64, + lease_id: if kv.lease != 0 { Some(kv.lease) } else { None }, + } + } +} + +/// Convert internal WatchEvent to proto Event +impl From for proto::Event { + fn from(event: WatchEvent) -> Self { + Self { + r#type: match event.event_type { + WatchEventType::Put => proto::event::EventType::Put as i32, + WatchEventType::Delete => proto::event::EventType::Delete as i32, + }, + kv: Some(event.kv.into()), + prev_kv: event.prev_kv.map(Into::into), + } + } +} + +/// Convert proto WatchCreateRequest to internal WatchRequest +impl From for InternalWatchRequest { + fn from(req: proto::WatchCreateRequest) -> Self { + Self { + watch_id: req.watch_id, + key: req.key, + range_end: if req.range_end.is_empty() { + None + } else { + Some(req.range_end) + }, + start_revision: if req.start_revision > 0 { + Some(req.start_revision as Revision) + } else { + None + }, + prev_kv: req.prev_kv, + progress_notify: req.progress_notify, + } + } +} + +/// Create a response header +pub fn make_header( + cluster_id: u64, + member_id: u64, + revision: Revision, + raft_term: u64, +) -> proto::ResponseHeader { + proto::ResponseHeader { + cluster_id, + member_id, 
+ revision: revision as i64, + raft_term, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_kv_entry_conversion() { + let entry = KvEntry::new(b"key".to_vec(), b"value".to_vec(), 1); + let proto_kv: proto::KeyValue = entry.clone().into(); + + assert_eq!(proto_kv.key, b"key"); + assert_eq!(proto_kv.value, b"value"); + assert_eq!(proto_kv.create_revision, 1); + + let back: KvEntry = proto_kv.into(); + assert_eq!(back.key, entry.key); + assert_eq!(back.value, entry.value); + } + + #[test] + fn test_watch_event_conversion() { + let kv = KvEntry::new(b"key".to_vec(), b"value".to_vec(), 1); + let event = WatchEvent::put(kv, None); + + let proto_event: proto::Event = event.into(); + assert_eq!(proto_event.r#type, proto::event::EventType::Put as i32); + } +} diff --git a/chainfire/crates/chainfire-api/src/generated/chainfire.internal.rs b/chainfire/crates/chainfire-api/src/generated/chainfire.internal.rs new file mode 100644 index 0000000..67558bb --- /dev/null +++ b/chainfire/crates/chainfire-api/src/generated/chainfire.internal.rs @@ -0,0 +1,566 @@ +// This file is @generated by prost-build. 
+#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct VoteRequest { + /// term is the candidate's term + #[prost(uint64, tag = "1")] + pub term: u64, + /// candidate_id is the candidate requesting the vote + #[prost(uint64, tag = "2")] + pub candidate_id: u64, + /// last_log_index is index of candidate's last log entry + #[prost(uint64, tag = "3")] + pub last_log_index: u64, + /// last_log_term is term of candidate's last log entry + #[prost(uint64, tag = "4")] + pub last_log_term: u64, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct VoteResponse { + /// term is the current term for the voter + #[prost(uint64, tag = "1")] + pub term: u64, + /// vote_granted is true if the candidate received the vote + #[prost(bool, tag = "2")] + pub vote_granted: bool, + /// last_log_id is the voter's last log ID + #[prost(uint64, tag = "3")] + pub last_log_index: u64, + #[prost(uint64, tag = "4")] + pub last_log_term: u64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct AppendEntriesRequest { + /// term is the leader's term + #[prost(uint64, tag = "1")] + pub term: u64, + /// leader_id is the leader's ID + #[prost(uint64, tag = "2")] + pub leader_id: u64, + /// prev_log_index is index of log entry immediately preceding new ones + #[prost(uint64, tag = "3")] + pub prev_log_index: u64, + /// prev_log_term is term of prev_log_index entry + #[prost(uint64, tag = "4")] + pub prev_log_term: u64, + /// entries are log entries to append + #[prost(message, repeated, tag = "5")] + pub entries: ::prost::alloc::vec::Vec, + /// leader_commit is leader's commit index + #[prost(uint64, tag = "6")] + pub leader_commit: u64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct LogEntry { + /// index is the log entry index + #[prost(uint64, tag = "1")] + pub index: u64, + /// term is the term when entry was received + #[prost(uint64, tag = "2")] + pub term: u64, + /// data is the command data + #[prost(bytes = "vec", tag = "3")] + pub data: 
::prost::alloc::vec::Vec, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct AppendEntriesResponse { + /// term is the current term + #[prost(uint64, tag = "1")] + pub term: u64, + /// success is true if follower contained entry matching prevLogIndex + #[prost(bool, tag = "2")] + pub success: bool, + /// conflict_index is the first conflicting index (for optimization) + #[prost(uint64, tag = "3")] + pub conflict_index: u64, + /// conflict_term is the term of the conflicting entry + #[prost(uint64, tag = "4")] + pub conflict_term: u64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct InstallSnapshotRequest { + /// term is the leader's term + #[prost(uint64, tag = "1")] + pub term: u64, + /// leader_id is the leader's ID + #[prost(uint64, tag = "2")] + pub leader_id: u64, + /// last_included_index is the snapshot replaces all entries up through and including this index + #[prost(uint64, tag = "3")] + pub last_included_index: u64, + /// last_included_term is term of last_included_index + #[prost(uint64, tag = "4")] + pub last_included_term: u64, + /// offset is byte offset where chunk is positioned in the snapshot file + #[prost(uint64, tag = "5")] + pub offset: u64, + /// data is raw bytes of the snapshot chunk + #[prost(bytes = "vec", tag = "6")] + pub data: ::prost::alloc::vec::Vec, + /// done is true if this is the last chunk + #[prost(bool, tag = "7")] + pub done: bool, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct InstallSnapshotResponse { + /// term is the current term + #[prost(uint64, tag = "1")] + pub term: u64, +} +/// Generated client implementations. 
+pub mod raft_service_client { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + use tonic::codegen::http::Uri; + /// Internal Raft RPC service for node-to-node communication + #[derive(Debug, Clone)] + pub struct RaftServiceClient { + inner: tonic::client::Grpc, + } + impl RaftServiceClient { + /// Attempt to create a new client by connecting to a given endpoint. + pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl RaftServiceClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + std::marker::Send + 'static, + ::Error: Into + std::marker::Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> RaftServiceClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + , + >>::Error: Into + std::marker::Send + std::marker::Sync, + { + RaftServiceClient::new(InterceptedService::new(inner, interceptor)) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. 
+ #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + /// Vote requests a vote from a peer + pub async fn vote( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/chainfire.internal.RaftService/Vote", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("chainfire.internal.RaftService", "Vote")); + self.inner.unary(req, path, codec).await + } + /// AppendEntries sends log entries to followers + pub async fn append_entries( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/chainfire.internal.RaftService/AppendEntries", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("chainfire.internal.RaftService", "AppendEntries"), + ); + self.inner.unary(req, path, codec).await + } + /// InstallSnapshot sends a snapshot to a follower + 
pub async fn install_snapshot( + &mut self, + request: impl tonic::IntoStreamingRequest< + Message = super::InstallSnapshotRequest, + >, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/chainfire.internal.RaftService/InstallSnapshot", + ); + let mut req = request.into_streaming_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("chainfire.internal.RaftService", "InstallSnapshot"), + ); + self.inner.client_streaming(req, path, codec).await + } + } +} +/// Generated server implementations. +pub mod raft_service_server { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with RaftServiceServer. 
+ #[async_trait] + pub trait RaftService: std::marker::Send + std::marker::Sync + 'static { + /// Vote requests a vote from a peer + async fn vote( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + /// AppendEntries sends log entries to followers + async fn append_entries( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// InstallSnapshot sends a snapshot to a follower + async fn install_snapshot( + &self, + request: tonic::Request>, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + } + /// Internal Raft RPC service for node-to-node communication + #[derive(Debug)] + pub struct RaftServiceServer { + inner: Arc, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + impl RaftServiceServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. 
+ /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> for RaftServiceServer + where + T: RaftService, + B: Body + std::marker::Send + 'static, + B::Error: Into + std::marker::Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + match req.uri().path() { + "/chainfire.internal.RaftService/Vote" => { + #[allow(non_camel_case_types)] + struct VoteSvc(pub Arc); + impl tonic::server::UnaryService + for VoteSvc { + type Response = super::VoteResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::vote(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = VoteSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = 
grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.internal.RaftService/AppendEntries" => { + #[allow(non_camel_case_types)] + struct AppendEntriesSvc(pub Arc); + impl< + T: RaftService, + > tonic::server::UnaryService + for AppendEntriesSvc { + type Response = super::AppendEntriesResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::append_entries(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = AppendEntriesSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.internal.RaftService/InstallSnapshot" => { + #[allow(non_camel_case_types)] + struct InstallSnapshotSvc(pub Arc); + impl< + T: RaftService, + > tonic::server::ClientStreamingService< + super::InstallSnapshotRequest, + > for InstallSnapshotSvc { + type Response = super::InstallSnapshotResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request< + tonic::Streaming, + >, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::install_snapshot(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = 
self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = InstallSnapshotSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.client_streaming(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => { + Box::pin(async move { + let mut response = http::Response::new(empty_body()); + let headers = response.headers_mut(); + headers + .insert( + tonic::Status::GRPC_STATUS, + (tonic::Code::Unimplemented as i32).into(), + ); + headers + .insert( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ); + Ok(response) + }) + } + } + } + } + impl Clone for RaftServiceServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + /// Generated gRPC service name + pub const SERVICE_NAME: &str = "chainfire.internal.RaftService"; + impl tonic::server::NamedService for RaftServiceServer { + const NAME: &'static str = SERVICE_NAME; + } +} diff --git a/chainfire/crates/chainfire-api/src/generated/chainfire.v1.rs b/chainfire/crates/chainfire-api/src/generated/chainfire.v1.rs new file mode 100644 index 0000000..a46504f --- /dev/null +++ b/chainfire/crates/chainfire-api/src/generated/chainfire.v1.rs @@ -0,0 +1,1817 @@ +// This file is @generated by prost-build. 
+/// Response header included in all responses +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct ResponseHeader { + /// cluster_id is the ID of the cluster + #[prost(uint64, tag = "1")] + pub cluster_id: u64, + /// member_id is the ID of the responding member + #[prost(uint64, tag = "2")] + pub member_id: u64, + /// revision is the key-value store revision + #[prost(int64, tag = "3")] + pub revision: i64, + /// raft_term is the current Raft term + #[prost(uint64, tag = "4")] + pub raft_term: u64, +} +/// Key-value pair +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct KeyValue { + /// key is the key in bytes + #[prost(bytes = "vec", tag = "1")] + pub key: ::prost::alloc::vec::Vec, + /// create_revision is the revision of last creation + #[prost(int64, tag = "2")] + pub create_revision: i64, + /// mod_revision is the revision of last modification + #[prost(int64, tag = "3")] + pub mod_revision: i64, + /// version is the version of the key + #[prost(int64, tag = "4")] + pub version: i64, + /// value is the value held by the key + #[prost(bytes = "vec", tag = "5")] + pub value: ::prost::alloc::vec::Vec, + /// lease is the ID of the lease attached to the key + #[prost(int64, tag = "6")] + pub lease: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RangeRequest { + /// key is the first key for the range + #[prost(bytes = "vec", tag = "1")] + pub key: ::prost::alloc::vec::Vec, + /// range_end is the upper bound on the requested range + #[prost(bytes = "vec", tag = "2")] + pub range_end: ::prost::alloc::vec::Vec, + /// limit is a limit on the number of keys returned + #[prost(int64, tag = "3")] + pub limit: i64, + /// revision is the point-in-time of the store to use + #[prost(int64, tag = "4")] + pub revision: i64, + /// keys_only when set returns only the keys and not the values + #[prost(bool, tag = "5")] + pub keys_only: bool, + /// count_only when set returns only the count of the keys + #[prost(bool, tag = "6")] + pub 
count_only: bool, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RangeResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// kvs is the list of key-value pairs matched by the range request + #[prost(message, repeated, tag = "2")] + pub kvs: ::prost::alloc::vec::Vec, + /// more indicates if there are more keys to return + #[prost(bool, tag = "3")] + pub more: bool, + /// count is set to the number of keys within the range + #[prost(int64, tag = "4")] + pub count: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PutRequest { + /// key is the key to put + #[prost(bytes = "vec", tag = "1")] + pub key: ::prost::alloc::vec::Vec, + /// value is the value to put + #[prost(bytes = "vec", tag = "2")] + pub value: ::prost::alloc::vec::Vec, + /// lease is the lease ID to attach to the key + #[prost(int64, tag = "3")] + pub lease: i64, + /// prev_kv when set returns the previous key-value pair + #[prost(bool, tag = "4")] + pub prev_kv: bool, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PutResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// prev_kv is the key-value pair before the put + #[prost(message, optional, tag = "2")] + pub prev_kv: ::core::option::Option, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct DeleteRangeRequest { + /// key is the first key to delete + #[prost(bytes = "vec", tag = "1")] + pub key: ::prost::alloc::vec::Vec, + /// range_end is the key following the last key to delete + #[prost(bytes = "vec", tag = "2")] + pub range_end: ::prost::alloc::vec::Vec, + /// prev_kv when set returns deleted key-value pairs + #[prost(bool, tag = "3")] + pub prev_kv: bool, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct DeleteRangeResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// deleted is the number of keys deleted + #[prost(int64, tag = "2")] + pub deleted: i64, + 
/// prev_kvs holds the deleted key-value pairs + #[prost(message, repeated, tag = "3")] + pub prev_kvs: ::prost::alloc::vec::Vec, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct TxnRequest { + /// compare is a list of predicates + #[prost(message, repeated, tag = "1")] + pub compare: ::prost::alloc::vec::Vec, + /// success is a list of operations to apply if all comparisons succeed + #[prost(message, repeated, tag = "2")] + pub success: ::prost::alloc::vec::Vec, + /// failure is a list of operations to apply if any comparison fails + #[prost(message, repeated, tag = "3")] + pub failure: ::prost::alloc::vec::Vec, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct TxnResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// succeeded is set to true if all comparisons evaluated to true + #[prost(bool, tag = "2")] + pub succeeded: bool, + /// responses is a list of responses corresponding to the results + #[prost(message, repeated, tag = "3")] + pub responses: ::prost::alloc::vec::Vec, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Compare { + #[prost(enumeration = "compare::CompareResult", tag = "1")] + pub result: i32, + #[prost(enumeration = "compare::CompareTarget", tag = "2")] + pub target: i32, + #[prost(bytes = "vec", tag = "3")] + pub key: ::prost::alloc::vec::Vec, + #[prost(oneof = "compare::TargetUnion", tags = "4, 5, 6, 7")] + pub target_union: ::core::option::Option, +} +/// Nested message and enum types in `Compare`. +pub mod compare { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum CompareResult { + Equal = 0, + Greater = 1, + Less = 2, + NotEqual = 3, + } + impl CompareResult { + /// String value of the enum field names used in the ProtoBuf definition. 
+ /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Self::Equal => "EQUAL", + Self::Greater => "GREATER", + Self::Less => "LESS", + Self::NotEqual => "NOT_EQUAL", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "EQUAL" => Some(Self::Equal), + "GREATER" => Some(Self::Greater), + "LESS" => Some(Self::Less), + "NOT_EQUAL" => Some(Self::NotEqual), + _ => None, + } + } + } + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum CompareTarget { + Version = 0, + Create = 1, + Mod = 2, + Value = 3, + } + impl CompareTarget { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Self::Version => "VERSION", + Self::Create => "CREATE", + Self::Mod => "MOD", + Self::Value => "VALUE", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "VERSION" => Some(Self::Version), + "CREATE" => Some(Self::Create), + "MOD" => Some(Self::Mod), + "VALUE" => Some(Self::Value), + _ => None, + } + } + } + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum TargetUnion { + #[prost(int64, tag = "4")] + Version(i64), + #[prost(int64, tag = "5")] + CreateRevision(i64), + #[prost(int64, tag = "6")] + ModRevision(i64), + #[prost(bytes, tag = "7")] + Value(::prost::alloc::vec::Vec), + } +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RequestOp { + #[prost(oneof = "request_op::Request", tags = "1, 2, 3")] + pub request: ::core::option::Option, +} +/// Nested message and enum types in `RequestOp`. +pub mod request_op { + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum Request { + #[prost(message, tag = "1")] + RequestRange(super::RangeRequest), + #[prost(message, tag = "2")] + RequestPut(super::PutRequest), + #[prost(message, tag = "3")] + RequestDeleteRange(super::DeleteRangeRequest), + } +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ResponseOp { + #[prost(oneof = "response_op::Response", tags = "1, 2, 3")] + pub response: ::core::option::Option, +} +/// Nested message and enum types in `ResponseOp`. +pub mod response_op { + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum Response { + #[prost(message, tag = "1")] + ResponseRange(super::RangeResponse), + #[prost(message, tag = "2")] + ResponsePut(super::PutResponse), + #[prost(message, tag = "3")] + ResponseDeleteRange(super::DeleteRangeResponse), + } +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct WatchRequest { + #[prost(oneof = "watch_request::RequestUnion", tags = "1, 2, 3")] + pub request_union: ::core::option::Option, +} +/// Nested message and enum types in `WatchRequest`. 
+pub mod watch_request { + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum RequestUnion { + #[prost(message, tag = "1")] + CreateRequest(super::WatchCreateRequest), + #[prost(message, tag = "2")] + CancelRequest(super::WatchCancelRequest), + #[prost(message, tag = "3")] + ProgressRequest(super::WatchProgressRequest), + } +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct WatchCreateRequest { + /// key is the key to watch + #[prost(bytes = "vec", tag = "1")] + pub key: ::prost::alloc::vec::Vec, + /// range_end is the end of the range to watch + #[prost(bytes = "vec", tag = "2")] + pub range_end: ::prost::alloc::vec::Vec, + /// start_revision is an optional revision to start watching from + #[prost(int64, tag = "3")] + pub start_revision: i64, + /// progress_notify is set to true to enable progress notifications + #[prost(bool, tag = "4")] + pub progress_notify: bool, + /// prev_kv when set includes previous key-value in events + #[prost(bool, tag = "5")] + pub prev_kv: bool, + /// watch_id is the user-provided watch ID (0 for server-assigned) + #[prost(int64, tag = "6")] + pub watch_id: i64, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct WatchCancelRequest { + /// watch_id is the watch ID to cancel + #[prost(int64, tag = "1")] + pub watch_id: i64, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct WatchProgressRequest {} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct WatchResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// watch_id is the watch ID for this response + #[prost(int64, tag = "2")] + pub watch_id: i64, + /// created is set to true if this response is for a create request + #[prost(bool, tag = "3")] + pub created: bool, + /// canceled is set to true if the watch was canceled + #[prost(bool, tag = "4")] + pub canceled: bool, + /// compact_revision is the minimum revision the watcher may receive + #[prost(int64, tag = "5")] + pub compact_revision: 
i64, + /// cancel_reason indicates the reason for cancellation + #[prost(string, tag = "6")] + pub cancel_reason: ::prost::alloc::string::String, + /// events is the list of events in this response + #[prost(message, repeated, tag = "11")] + pub events: ::prost::alloc::vec::Vec, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Event { + /// type is the kind of event + #[prost(enumeration = "event::EventType", tag = "1")] + pub r#type: i32, + /// kv is the KeyValue affected by the event + #[prost(message, optional, tag = "2")] + pub kv: ::core::option::Option, + /// prev_kv is the KeyValue prior to the event + #[prost(message, optional, tag = "3")] + pub prev_kv: ::core::option::Option, +} +/// Nested message and enum types in `Event`. +pub mod event { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum EventType { + Put = 0, + Delete = 1, + } + impl EventType { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Self::Put => "PUT", + Self::Delete => "DELETE", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "PUT" => Some(Self::Put), + "DELETE" => Some(Self::Delete), + _ => None, + } + } + } +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Member { + /// ID is the member ID + #[prost(uint64, tag = "1")] + pub id: u64, + /// name is the human-readable name + #[prost(string, tag = "2")] + pub name: ::prost::alloc::string::String, + /// peer_urls are URLs for Raft communication + #[prost(string, repeated, tag = "3")] + pub peer_urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// client_urls are URLs for client communication + #[prost(string, repeated, tag = "4")] + pub client_urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// is_learner indicates if member is a learner + #[prost(bool, tag = "5")] + pub is_learner: bool, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MemberAddRequest { + /// peer_urls are the URLs to reach the new member + #[prost(string, repeated, tag = "1")] + pub peer_urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// is_learner indicates if the member is a learner + #[prost(bool, tag = "2")] + pub is_learner: bool, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MemberAddResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// member is the member information for the added member + #[prost(message, optional, tag = "2")] + pub member: ::core::option::Option, + /// members is the list of all members after adding + #[prost(message, repeated, tag = "3")] + pub members: ::prost::alloc::vec::Vec, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct MemberRemoveRequest { + /// ID is the member ID to remove + #[prost(uint64, tag = "1")] + pub id: u64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MemberRemoveResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// members is 
the list of all members after removing + #[prost(message, repeated, tag = "2")] + pub members: ::prost::alloc::vec::Vec, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct MemberListRequest {} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MemberListResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// members is the list of all members + #[prost(message, repeated, tag = "2")] + pub members: ::prost::alloc::vec::Vec, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct StatusRequest {} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StatusResponse { + #[prost(message, optional, tag = "1")] + pub header: ::core::option::Option, + /// version is the version of the server + #[prost(string, tag = "2")] + pub version: ::prost::alloc::string::String, + /// db_size is the size of the database + #[prost(int64, tag = "3")] + pub db_size: i64, + /// leader is the member ID of the current leader + #[prost(uint64, tag = "4")] + pub leader: u64, + /// raft_index is the current Raft committed index + #[prost(uint64, tag = "5")] + pub raft_index: u64, + /// raft_term is the current Raft term + #[prost(uint64, tag = "6")] + pub raft_term: u64, + /// raft_applied_index is the current Raft applied index + #[prost(uint64, tag = "7")] + pub raft_applied_index: u64, +} +/// Generated client implementations. +pub mod kv_client { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + use tonic::codegen::http::Uri; + /// Key-Value service + #[derive(Debug, Clone)] + pub struct KvClient { + inner: tonic::client::Grpc, + } + impl KvClient { + /// Attempt to create a new client by connecting to a given endpoint. 
+ pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl KvClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + std::marker::Send + 'static, + ::Error: Into + std::marker::Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> KvClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + , + >>::Error: Into + std::marker::Send + std::marker::Sync, + { + KvClient::new(InterceptedService::new(inner, interceptor)) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. 
+ /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + /// Range gets the keys in the range from the key-value store + pub async fn range( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static("/chainfire.v1.KV/Range"); + let mut req = request.into_request(); + req.extensions_mut().insert(GrpcMethod::new("chainfire.v1.KV", "Range")); + self.inner.unary(req, path, codec).await + } + /// Put puts the given key into the key-value store + pub async fn put( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static("/chainfire.v1.KV/Put"); + let mut req = request.into_request(); + req.extensions_mut().insert(GrpcMethod::new("chainfire.v1.KV", "Put")); + self.inner.unary(req, path, codec).await + } + /// Delete deletes the given range from the key-value store + pub async fn delete( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static("/chainfire.v1.KV/Delete"); + let mut req = request.into_request(); + req.extensions_mut().insert(GrpcMethod::new("chainfire.v1.KV", "Delete")); + self.inner.unary(req, 
path, codec).await + } + /// Txn processes multiple requests in a single transaction + pub async fn txn( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static("/chainfire.v1.KV/Txn"); + let mut req = request.into_request(); + req.extensions_mut().insert(GrpcMethod::new("chainfire.v1.KV", "Txn")); + self.inner.unary(req, path, codec).await + } + } +} +/// Generated client implementations. +pub mod watch_client { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + use tonic::codegen::http::Uri; + /// Watch service + #[derive(Debug, Clone)] + pub struct WatchClient { + inner: tonic::client::Grpc, + } + impl WatchClient { + /// Attempt to create a new client by connecting to a given endpoint. 
+ pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl WatchClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + std::marker::Send + 'static, + ::Error: Into + std::marker::Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> WatchClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + , + >>::Error: Into + std::marker::Send + std::marker::Sync, + { + WatchClient::new(InterceptedService::new(inner, interceptor)) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. 
+ /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + /// Watch watches for events happening or that have happened + pub async fn watch( + &mut self, + request: impl tonic::IntoStreamingRequest, + ) -> std::result::Result< + tonic::Response>, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static("/chainfire.v1.Watch/Watch"); + let mut req = request.into_streaming_request(); + req.extensions_mut().insert(GrpcMethod::new("chainfire.v1.Watch", "Watch")); + self.inner.streaming(req, path, codec).await + } + } +} +/// Generated client implementations. +pub mod cluster_client { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + use tonic::codegen::http::Uri; + /// Cluster management service + #[derive(Debug, Clone)] + pub struct ClusterClient { + inner: tonic::client::Grpc, + } + impl ClusterClient { + /// Attempt to create a new client by connecting to a given endpoint. 
+ pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl ClusterClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + std::marker::Send + 'static, + ::Error: Into + std::marker::Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> ClusterClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + , + >>::Error: Into + std::marker::Send + std::marker::Sync, + { + ClusterClient::new(InterceptedService::new(inner, interceptor)) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. 
+ /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + /// MemberAdd adds a member into the cluster + pub async fn member_add( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/chainfire.v1.Cluster/MemberAdd", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("chainfire.v1.Cluster", "MemberAdd")); + self.inner.unary(req, path, codec).await + } + /// MemberRemove removes an existing member from the cluster + pub async fn member_remove( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/chainfire.v1.Cluster/MemberRemove", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("chainfire.v1.Cluster", "MemberRemove")); + self.inner.unary(req, path, codec).await + } + /// MemberList lists all the members in the cluster + pub async fn member_list( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/chainfire.v1.Cluster/MemberList", + ); + let 
mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("chainfire.v1.Cluster", "MemberList")); + self.inner.unary(req, path, codec).await + } + /// Status gets the status of the cluster + pub async fn status( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/chainfire.v1.Cluster/Status", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("chainfire.v1.Cluster", "Status")); + self.inner.unary(req, path, codec).await + } + } +} +/// Generated server implementations. +pub mod kv_server { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with KvServer. 
+ #[async_trait] + pub trait Kv: std::marker::Send + std::marker::Sync + 'static { + /// Range gets the keys in the range from the key-value store + async fn range( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + /// Put puts the given key into the key-value store + async fn put( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + /// Delete deletes the given range from the key-value store + async fn delete( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Txn processes multiple requests in a single transaction + async fn txn( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + } + /// Key-Value service + #[derive(Debug)] + pub struct KvServer { + inner: Arc, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + impl KvServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. 
+ #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> for KvServer + where + T: Kv, + B: Body + std::marker::Send + 'static, + B::Error: Into + std::marker::Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + match req.uri().path() { + "/chainfire.v1.KV/Range" => { + #[allow(non_camel_case_types)] + struct RangeSvc(pub Arc); + impl tonic::server::UnaryService + for RangeSvc { + type Response = super::RangeResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::range(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = RangeSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + 
accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.v1.KV/Put" => { + #[allow(non_camel_case_types)] + struct PutSvc(pub Arc); + impl tonic::server::UnaryService + for PutSvc { + type Response = super::PutResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::put(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = PutSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.v1.KV/Delete" => { + #[allow(non_camel_case_types)] + struct DeleteSvc(pub Arc); + impl tonic::server::UnaryService + for DeleteSvc { + type Response = super::DeleteRangeResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::delete(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = 
self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = DeleteSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.v1.KV/Txn" => { + #[allow(non_camel_case_types)] + struct TxnSvc(pub Arc); + impl tonic::server::UnaryService + for TxnSvc { + type Response = super::TxnResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::txn(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = TxnSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => { + Box::pin(async move { + let mut response = http::Response::new(empty_body()); + let headers = response.headers_mut(); + headers + .insert( + tonic::Status::GRPC_STATUS, + 
(tonic::Code::Unimplemented as i32).into(), + ); + headers + .insert( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ); + Ok(response) + }) + } + } + } + } + impl Clone for KvServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + /// Generated gRPC service name + pub const SERVICE_NAME: &str = "chainfire.v1.KV"; + impl tonic::server::NamedService for KvServer { + const NAME: &'static str = SERVICE_NAME; + } +} +/// Generated server implementations. +pub mod watch_server { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with WatchServer. + #[async_trait] + pub trait Watch: std::marker::Send + std::marker::Sync + 'static { + /// Server streaming response type for the Watch method. 
+ type WatchStream: tonic::codegen::tokio_stream::Stream< + Item = std::result::Result, + > + + std::marker::Send + + 'static; + /// Watch watches for events happening or that have happened + async fn watch( + &self, + request: tonic::Request>, + ) -> std::result::Result, tonic::Status>; + } + /// Watch service + #[derive(Debug)] + pub struct WatchServer { + inner: Arc, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + impl WatchServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. 
+ /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> for WatchServer + where + T: Watch, + B: Body + std::marker::Send + 'static, + B::Error: Into + std::marker::Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + match req.uri().path() { + "/chainfire.v1.Watch/Watch" => { + #[allow(non_camel_case_types)] + struct WatchSvc(pub Arc); + impl tonic::server::StreamingService + for WatchSvc { + type Response = super::WatchResponse; + type ResponseStream = T::WatchStream; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request< + tonic::Streaming, + >, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::watch(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = WatchSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.streaming(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => { + Box::pin(async move { + let mut response = http::Response::new(empty_body()); + let headers = 
response.headers_mut(); + headers + .insert( + tonic::Status::GRPC_STATUS, + (tonic::Code::Unimplemented as i32).into(), + ); + headers + .insert( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ); + Ok(response) + }) + } + } + } + } + impl Clone for WatchServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + /// Generated gRPC service name + pub const SERVICE_NAME: &str = "chainfire.v1.Watch"; + impl tonic::server::NamedService for WatchServer { + const NAME: &'static str = SERVICE_NAME; + } +} +/// Generated server implementations. +pub mod cluster_server { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with ClusterServer. 
+ #[async_trait] + pub trait Cluster: std::marker::Send + std::marker::Sync + 'static { + /// MemberAdd adds a member into the cluster + async fn member_add( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// MemberRemove removes an existing member from the cluster + async fn member_remove( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// MemberList lists all the members in the cluster + async fn member_list( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Status gets the status of the cluster + async fn status( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + } + /// Cluster management service + #[derive(Debug)] + pub struct ClusterServer { + inner: Arc, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + impl ClusterServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. 
+ #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> for ClusterServer + where + T: Cluster, + B: Body + std::marker::Send + 'static, + B::Error: Into + std::marker::Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + match req.uri().path() { + "/chainfire.v1.Cluster/MemberAdd" => { + #[allow(non_camel_case_types)] + struct MemberAddSvc(pub Arc); + impl tonic::server::UnaryService + for MemberAddSvc { + type Response = super::MemberAddResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::member_add(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = MemberAddSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + 
.apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.v1.Cluster/MemberRemove" => { + #[allow(non_camel_case_types)] + struct MemberRemoveSvc(pub Arc); + impl< + T: Cluster, + > tonic::server::UnaryService + for MemberRemoveSvc { + type Response = super::MemberRemoveResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::member_remove(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = MemberRemoveSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.v1.Cluster/MemberList" => { + #[allow(non_camel_case_types)] + struct MemberListSvc(pub Arc); + impl< + T: Cluster, + > tonic::server::UnaryService + for MemberListSvc { + type Response = super::MemberListResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::member_list(&inner, request).await + }; + 
Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = MemberListSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/chainfire.v1.Cluster/Status" => { + #[allow(non_camel_case_types)] + struct StatusSvc(pub Arc); + impl tonic::server::UnaryService + for StatusSvc { + type Response = super::StatusResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::status(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = StatusSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => { + Box::pin(async move { + let mut 
response = http::Response::new(empty_body()); + let headers = response.headers_mut(); + headers + .insert( + tonic::Status::GRPC_STATUS, + (tonic::Code::Unimplemented as i32).into(), + ); + headers + .insert( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ); + Ok(response) + }) + } + } + } + } + impl Clone for ClusterServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + /// Generated gRPC service name + pub const SERVICE_NAME: &str = "chainfire.v1.Cluster"; + impl tonic::server::NamedService for ClusterServer { + const NAME: &'static str = SERVICE_NAME; + } +} diff --git a/chainfire/crates/chainfire-api/src/generated/mod.rs b/chainfire/crates/chainfire-api/src/generated/mod.rs new file mode 100644 index 0000000..63dd46a --- /dev/null +++ b/chainfire/crates/chainfire-api/src/generated/mod.rs @@ -0,0 +1,13 @@ +//! Generated protobuf code +//! +//! This module contains the code generated by tonic-build from the proto files. + +pub mod chainfire { + pub mod v1 { + tonic::include_proto!("chainfire.v1"); + } + + pub mod internal { + tonic::include_proto!("chainfire.internal"); + } +} diff --git a/chainfire/crates/chainfire-api/src/internal_service.rs b/chainfire/crates/chainfire-api/src/internal_service.rs new file mode 100644 index 0000000..a1f2208 --- /dev/null +++ b/chainfire/crates/chainfire-api/src/internal_service.rs @@ -0,0 +1,242 @@ +//! Internal Raft RPC service implementation +//! +//! This service handles Raft protocol messages between nodes in the cluster. +//! It bridges the gRPC layer with the OpenRaft implementation. 
+ +use crate::internal_proto::{ + raft_service_server::RaftService, AppendEntriesRequest, AppendEntriesResponse, + InstallSnapshotRequest, InstallSnapshotResponse, VoteRequest, VoteResponse, +}; +use chainfire_raft::{Raft, TypeConfig}; +use chainfire_types::NodeId; +use openraft::BasicNode; +use std::sync::Arc; +use tonic::{Request, Response, Status, Streaming}; +use tracing::{debug, trace, warn}; + +/// Internal Raft RPC service implementation +/// +/// This service handles Raft protocol messages between nodes. +pub struct RaftServiceImpl { + /// Reference to the Raft instance + raft: Arc, +} + +impl RaftServiceImpl { + /// Create a new Raft service with a Raft instance + pub fn new(raft: Arc) -> Self { + Self { raft } + } +} + +#[tonic::async_trait] +impl RaftService for RaftServiceImpl { + async fn vote( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + trace!( + term = req.term, + candidate = req.candidate_id, + "Vote request received" + ); + + // Convert proto request to openraft request + let vote_req = openraft::raft::VoteRequest { + vote: openraft::Vote::new(req.term, req.candidate_id), + last_log_id: if req.last_log_index > 0 { + Some(openraft::LogId::new( + openraft::CommittedLeaderId::new(req.last_log_term, 0), + req.last_log_index, + )) + } else { + None + }, + }; + + // Forward to Raft node + let result = self.raft.vote(vote_req).await; + + match result { + Ok(resp) => { + trace!(term = resp.vote.leader_id().term, granted = resp.vote_granted, "Vote response"); + Ok(Response::new(VoteResponse { + term: resp.vote.leader_id().term, + vote_granted: resp.vote_granted, + last_log_index: resp.last_log_id.map(|id| id.index).unwrap_or(0), + last_log_term: resp.last_log_id.map(|id| id.leader_id.term).unwrap_or(0), + })) + } + Err(e) => { + warn!(error = %e, "Vote request failed"); + Err(Status::internal(e.to_string())) + } + } + } + + async fn append_entries( + &self, + request: Request, + ) -> Result, Status> { + let req 
= request.into_inner(); + trace!( + term = req.term, + leader = req.leader_id, + entries = req.entries.len(), + "AppendEntries request received" + ); + + // Convert proto entries to openraft entries + let entries: Vec> = req + .entries + .into_iter() + .map(|e| { + let payload = if e.data.is_empty() { + openraft::EntryPayload::Blank + } else { + // Deserialize the command from the entry data + match bincode::deserialize(&e.data) { + Ok(cmd) => openraft::EntryPayload::Normal(cmd), + Err(_) => openraft::EntryPayload::Blank, + } + }; + + openraft::Entry { + log_id: openraft::LogId::new( + openraft::CommittedLeaderId::new(e.term, 0), + e.index, + ), + payload, + } + }) + .collect(); + + let prev_log_id = if req.prev_log_index > 0 { + Some(openraft::LogId::new( + openraft::CommittedLeaderId::new(req.prev_log_term, 0), + req.prev_log_index, + )) + } else { + None + }; + + let leader_commit = if req.leader_commit > 0 { + Some(openraft::LogId::new( + openraft::CommittedLeaderId::new(req.term, 0), + req.leader_commit, + )) + } else { + None + }; + + let append_req = openraft::raft::AppendEntriesRequest { + vote: openraft::Vote::new_committed(req.term, req.leader_id), + prev_log_id, + entries, + leader_commit, + }; + + let result = self.raft.append_entries(append_req).await; + + match result { + Ok(resp) => { + let (success, conflict_index, conflict_term) = match resp { + openraft::raft::AppendEntriesResponse::Success => (true, 0, 0), + openraft::raft::AppendEntriesResponse::PartialSuccess(log_id) => { + // Partial success - some entries were accepted + let index = log_id.map(|l| l.index).unwrap_or(0); + (true, index, 0) + } + openraft::raft::AppendEntriesResponse::HigherVote(vote) => { + (false, 0, vote.leader_id().term) + } + openraft::raft::AppendEntriesResponse::Conflict => (false, 0, 0), + }; + + trace!(success, "AppendEntries response"); + Ok(Response::new(AppendEntriesResponse { + term: req.term, + success, + conflict_index, + conflict_term, + })) + } + Err(e) => { + 
warn!(error = %e, "AppendEntries request failed"); + Err(Status::internal(e.to_string())) + } + } + } + + async fn install_snapshot( + &self, + request: Request>, + ) -> Result, Status> { + let mut stream = request.into_inner(); + debug!("InstallSnapshot stream started"); + + // Collect all chunks + let mut term = 0; + let mut leader_id = 0; + let mut last_log_index = 0; + let mut last_log_term = 0; + let mut data = Vec::new(); + + while let Some(chunk) = stream.message().await? { + term = chunk.term; + leader_id = chunk.leader_id; + last_log_index = chunk.last_included_index; + last_log_term = chunk.last_included_term; + data.extend_from_slice(&chunk.data); + + if chunk.done { + break; + } + } + + debug!(term, size = data.len(), "InstallSnapshot completed"); + + // Create snapshot metadata + let last_log_id = if last_log_index > 0 { + Some(openraft::LogId::new( + openraft::CommittedLeaderId::new(last_log_term, 0), + last_log_index, + )) + } else { + None + }; + + let meta = openraft::SnapshotMeta { + last_log_id, + last_membership: openraft::StoredMembership::new( + None, + openraft::Membership::::new(vec![], None), + ), + snapshot_id: format!("{}-{}", term, last_log_index), + }; + + let snapshot_req = openraft::raft::InstallSnapshotRequest { + vote: openraft::Vote::new_committed(term, leader_id), + meta, + offset: 0, + data, + done: true, + }; + + let result = self.raft.install_snapshot(snapshot_req).await; + + match result { + Ok(resp) => { + debug!(term = resp.vote.leader_id().term, "InstallSnapshot response"); + Ok(Response::new(InstallSnapshotResponse { + term: resp.vote.leader_id().term, + })) + } + Err(e) => { + warn!(error = %e, "InstallSnapshot request failed"); + Err(Status::internal(e.to_string())) + } + } + } +} diff --git a/chainfire/crates/chainfire-api/src/kv_service.rs b/chainfire/crates/chainfire-api/src/kv_service.rs new file mode 100644 index 0000000..15808ee --- /dev/null +++ b/chainfire/crates/chainfire-api/src/kv_service.rs @@ -0,0 +1,285 @@ 
+//! KV service implementation + +use crate::conversions::make_header; +use crate::proto::{ + compare, kv_server::Kv, DeleteRangeRequest, DeleteRangeResponse, PutRequest, PutResponse, + RangeRequest, RangeResponse, ResponseOp, TxnRequest, TxnResponse, +}; +use chainfire_raft::RaftNode; +use chainfire_types::command::RaftCommand; +use std::sync::Arc; +use tonic::{Request, Response, Status}; +use tracing::{debug, trace}; + +/// KV service implementation +pub struct KvServiceImpl { + /// Raft node for consensus + raft: Arc, + /// Cluster ID + cluster_id: u64, +} + +impl KvServiceImpl { + /// Create a new KV service + pub fn new(raft: Arc, cluster_id: u64) -> Self { + Self { raft, cluster_id } + } + + /// Create a response header + fn make_header(&self, revision: u64) -> crate::proto::ResponseHeader { + make_header( + self.cluster_id, + self.raft.id(), + revision, + 0, // TODO: get actual term + ) + } +} + +#[tonic::async_trait] +impl Kv for KvServiceImpl { + async fn range( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + trace!(key = ?String::from_utf8_lossy(&req.key), serializable = req.serializable, "Range request"); + + // For linearizable reads (serializable=false), ensure we're reading consistent state + // by verifying leadership/log commit status through Raft + if !req.serializable { + self.raft + .linearizable_read() + .await + .map_err(|e| Status::unavailable(format!("linearizable read failed: {}", e)))?; + } + + // Get storage from Raft node + let storage = self.raft.storage(); + let storage_guard = storage.read().await; + let sm = storage_guard.state_machine().read().await; + + let entries = if req.range_end.is_empty() { + // Single key lookup + sm.kv() + .get(&req.key) + .map_err(|e| Status::internal(e.to_string()))? + .into_iter() + .collect() + } else { + // Range scan + sm.kv() + .range(&req.key, Some(&req.range_end)) + .map_err(|e| Status::internal(e.to_string()))? 
+ }; + + let revision = sm.current_revision(); + let kvs: Vec<_> = entries.into_iter().map(Into::into).collect(); + let count = kvs.len() as i64; + + Ok(Response::new(RangeResponse { + header: Some(self.make_header(revision)), + kvs, + more: false, + count, + })) + } + + async fn put(&self, request: Request) -> Result, Status> { + let req = request.into_inner(); + debug!(key = ?String::from_utf8_lossy(&req.key), "Put request"); + + let command = RaftCommand::Put { + key: req.key, + value: req.value, + lease_id: if req.lease != 0 { Some(req.lease) } else { None }, + prev_kv: req.prev_kv, + }; + + let response = self + .raft + .write(command) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + Ok(Response::new(PutResponse { + header: Some(self.make_header(response.revision)), + prev_kv: response.prev_kv.map(Into::into), + })) + } + + async fn delete( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + debug!(key = ?String::from_utf8_lossy(&req.key), "Delete request"); + + let command = if req.range_end.is_empty() { + RaftCommand::Delete { + key: req.key, + prev_kv: req.prev_kv, + } + } else { + RaftCommand::DeleteRange { + start: req.key, + end: req.range_end, + prev_kv: req.prev_kv, + } + }; + + let response = self + .raft + .write(command) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + Ok(Response::new(DeleteRangeResponse { + header: Some(self.make_header(response.revision)), + deleted: response.deleted as i64, + prev_kvs: response.prev_kvs.into_iter().map(Into::into).collect(), + })) + } + + async fn txn(&self, request: Request) -> Result, Status> { + let req = request.into_inner(); + debug!("Txn request with {} comparisons", req.compare.len()); + + // Convert protobuf types to internal types + let compare: Vec<_> = req + .compare + .into_iter() + .map(|c| { + use chainfire_types::command::{ + Compare, CompareResult as InternalResult, CompareTarget as InternalTarget, + }; + + let result = match 
compare::CompareResult::try_from(c.result) { + Ok(compare::CompareResult::Equal) => InternalResult::Equal, + Ok(compare::CompareResult::NotEqual) => InternalResult::NotEqual, + Ok(compare::CompareResult::Greater) => InternalResult::Greater, + Ok(compare::CompareResult::Less) => InternalResult::Less, + Err(_) => InternalResult::Equal, + }; + + let target = match c.target_union { + Some(compare::TargetUnion::Version(v)) => InternalTarget::Version(v as u64), + Some(compare::TargetUnion::CreateRevision(v)) => { + InternalTarget::CreateRevision(v as u64) + } + Some(compare::TargetUnion::ModRevision(v)) => { + InternalTarget::ModRevision(v as u64) + } + Some(compare::TargetUnion::Value(v)) => InternalTarget::Value(v), + None => InternalTarget::Version(0), + }; + + Compare { + key: c.key, + target, + result, + } + }) + .collect(); + + let success = convert_ops(&req.success); + let failure = convert_ops(&req.failure); + + let command = RaftCommand::Txn { + compare, + success, + failure, + }; + + let response = self + .raft + .write(command) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + // Convert txn_responses to proto ResponseOp + let responses = convert_txn_responses(&response.txn_responses, response.revision); + + Ok(Response::new(TxnResponse { + header: Some(self.make_header(response.revision)), + succeeded: response.succeeded, + responses, + })) + } +} + +/// Convert internal TxnOpResponse to proto ResponseOp +fn convert_txn_responses( + responses: &[chainfire_types::command::TxnOpResponse], + revision: u64, +) -> Vec { + use crate::proto::response_op::Response as ProtoResponse; + use chainfire_types::command::TxnOpResponse; + + responses + .iter() + .map(|resp| { + let response = match resp { + TxnOpResponse::Put { prev_kv } => ProtoResponse::ResponsePut(PutResponse { + header: Some(make_header(0, 0, revision, 0)), + prev_kv: prev_kv.clone().map(Into::into), + }), + TxnOpResponse::Delete { deleted, prev_kvs } => { + 
ProtoResponse::ResponseDeleteRange(DeleteRangeResponse { + header: Some(make_header(0, 0, revision, 0)), + deleted: *deleted as i64, + prev_kvs: prev_kvs.iter().cloned().map(Into::into).collect(), + }) + } + TxnOpResponse::Range { kvs, count, more } => { + ProtoResponse::ResponseRange(RangeResponse { + header: Some(make_header(0, 0, revision, 0)), + kvs: kvs.iter().cloned().map(Into::into).collect(), + count: *count as i64, + more: *more, + }) + } + }; + ResponseOp { + response: Some(response), + } + }) + .collect() +} + +fn convert_ops( + ops: &[crate::proto::RequestOp], +) -> Vec { + use chainfire_types::command::TxnOp; + + ops.iter() + .filter_map(|op| { + op.request.as_ref().map(|req| match req { + crate::proto::request_op::Request::RequestPut(put) => TxnOp::Put { + key: put.key.clone(), + value: put.value.clone(), + lease_id: if put.lease != 0 { Some(put.lease) } else { None }, + }, + crate::proto::request_op::Request::RequestDeleteRange(del) => { + if del.range_end.is_empty() { + TxnOp::Delete { + key: del.key.clone(), + } + } else { + TxnOp::DeleteRange { + start: del.key.clone(), + end: del.range_end.clone(), + } + } + } + crate::proto::request_op::Request::RequestRange(range) => TxnOp::Range { + key: range.key.clone(), + range_end: range.range_end.clone(), + limit: range.limit, + keys_only: range.keys_only, + count_only: range.count_only, + } + }) + }) + .collect() +} diff --git a/chainfire/crates/chainfire-api/src/lease_service.rs b/chainfire/crates/chainfire-api/src/lease_service.rs new file mode 100644 index 0000000..f0a897c --- /dev/null +++ b/chainfire/crates/chainfire-api/src/lease_service.rs @@ -0,0 +1,194 @@ +//! 
Lease service implementation + +use crate::conversions::make_header; +use crate::proto::{ + lease_server::Lease, LeaseGrantRequest, LeaseGrantResponse, LeaseKeepAliveRequest, + LeaseKeepAliveResponse, LeaseLeasesRequest, LeaseLeasesResponse, LeaseRevokeRequest, + LeaseRevokeResponse, LeaseStatus, LeaseTimeToLiveRequest, LeaseTimeToLiveResponse, +}; +use chainfire_raft::RaftNode; +use chainfire_types::command::RaftCommand; +use std::pin::Pin; +use std::sync::Arc; +use tokio::sync::mpsc; +use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; +use tonic::{Request, Response, Status, Streaming}; +use tracing::{debug, warn}; + +/// Lease service implementation +pub struct LeaseServiceImpl { + /// Raft node for consensus + raft: Arc, + /// Cluster ID + cluster_id: u64, +} + +impl LeaseServiceImpl { + /// Create a new Lease service + pub fn new(raft: Arc, cluster_id: u64) -> Self { + Self { raft, cluster_id } + } + + /// Create a response header + fn make_header(&self, revision: u64) -> crate::proto::ResponseHeader { + make_header(self.cluster_id, self.raft.id(), revision, 0) + } +} + +#[tonic::async_trait] +impl Lease for LeaseServiceImpl { + async fn lease_grant( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + debug!(id = req.id, ttl = req.ttl, "LeaseGrant request"); + + let command = RaftCommand::LeaseGrant { + id: req.id, + ttl: req.ttl, + }; + + let response = self + .raft + .write(command) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + Ok(Response::new(LeaseGrantResponse { + header: Some(self.make_header(response.revision)), + id: response.lease_id.unwrap_or(0), + ttl: response.lease_ttl.unwrap_or(0), + error: String::new(), + })) + } + + async fn lease_revoke( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + debug!(id = req.id, "LeaseRevoke request"); + + let command = RaftCommand::LeaseRevoke { id: req.id }; + + let response = self + .raft + .write(command) 
+ .await + .map_err(|e| Status::internal(e.to_string()))?; + + Ok(Response::new(LeaseRevokeResponse { + header: Some(self.make_header(response.revision)), + })) + } + + type LeaseKeepAliveStream = + Pin> + Send>>; + + async fn lease_keep_alive( + &self, + request: Request>, + ) -> Result, Status> { + let mut stream = request.into_inner(); + let raft = Arc::clone(&self.raft); + let cluster_id = self.cluster_id; + + let (tx, rx) = mpsc::channel(16); + + tokio::spawn(async move { + while let Some(result) = stream.next().await { + match result { + Ok(req) => { + debug!(id = req.id, "LeaseKeepAlive request"); + + let command = RaftCommand::LeaseRefresh { id: req.id }; + + match raft.write(command).await { + Ok(response) => { + let resp = LeaseKeepAliveResponse { + header: Some(make_header( + cluster_id, + raft.id(), + response.revision, + 0, + )), + id: response.lease_id.unwrap_or(req.id), + ttl: response.lease_ttl.unwrap_or(0), + }; + if tx.send(Ok(resp)).await.is_err() { + break; + } + } + Err(e) => { + warn!("LeaseKeepAlive failed: {}", e); + if tx.send(Err(Status::internal(e.to_string()))).await.is_err() { + break; + } + } + } + } + Err(e) => { + warn!("LeaseKeepAlive stream error: {}", e); + break; + } + } + } + }); + + Ok(Response::new(Box::pin(ReceiverStream::new(rx)))) + } + + async fn lease_time_to_live( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + debug!(id = req.id, "LeaseTimeToLive request"); + + // Read directly from state machine (this is a read operation) + let storage = self.raft.storage(); + let storage_guard = storage.read().await; + let sm = storage_guard.state_machine().read().await; + + let leases = sm.leases(); + match leases.time_to_live(req.id) { + Some((ttl, granted_ttl, keys)) => Ok(Response::new(LeaseTimeToLiveResponse { + header: Some(self.make_header(sm.current_revision())), + id: req.id, + ttl, + granted_ttl, + keys: if req.keys { keys } else { vec![] }, + })), + None => 
Ok(Response::new(LeaseTimeToLiveResponse { + header: Some(self.make_header(sm.current_revision())), + id: req.id, + ttl: -1, + granted_ttl: 0, + keys: vec![], + })), + } + } + + async fn lease_leases( + &self, + _request: Request, + ) -> Result, Status> { + debug!("LeaseLeases request"); + + // Read directly from state machine + let storage = self.raft.storage(); + let storage_guard = storage.read().await; + let sm = storage_guard.state_machine().read().await; + + let leases = sm.leases(); + let lease_ids = leases.list(); + + let statuses: Vec = lease_ids.into_iter().map(|id| LeaseStatus { id }).collect(); + + Ok(Response::new(LeaseLeasesResponse { + header: Some(self.make_header(sm.current_revision())), + leases: statuses, + })) + } +} diff --git a/chainfire/crates/chainfire-api/src/lib.rs b/chainfire/crates/chainfire-api/src/lib.rs new file mode 100644 index 0000000..06a71ad --- /dev/null +++ b/chainfire/crates/chainfire-api/src/lib.rs @@ -0,0 +1,29 @@ +//! gRPC API layer for Chainfire distributed KVS +//! +//! This crate provides: +//! - Generated protobuf types +//! - gRPC service implementations +//! 
- Client and server components + +pub mod generated; +pub mod kv_service; +pub mod lease_service; +pub mod watch_service; +pub mod cluster_service; +pub mod internal_service; +pub mod raft_client; +pub mod conversions; + +// Re-export generated types +pub use generated::chainfire::v1 as proto; +pub use generated::chainfire::internal as internal_proto; + +// Re-export services +pub use kv_service::KvServiceImpl; +pub use lease_service::LeaseServiceImpl; +pub use watch_service::WatchServiceImpl; +pub use cluster_service::ClusterServiceImpl; +pub use internal_service::RaftServiceImpl; + +// Re-export Raft client and config +pub use raft_client::{GrpcRaftClient, RetryConfig}; diff --git a/chainfire/crates/chainfire-api/src/raft_client.rs b/chainfire/crates/chainfire-api/src/raft_client.rs new file mode 100644 index 0000000..cf8c024 --- /dev/null +++ b/chainfire/crates/chainfire-api/src/raft_client.rs @@ -0,0 +1,428 @@ +//! gRPC client for Raft RPC +//! +//! This module provides a gRPC-based implementation of RaftRpcClient +//! for node-to-node Raft communication with retry and backoff support. + +use crate::internal_proto::{ + raft_service_client::RaftServiceClient, AppendEntriesRequest as ProtoAppendEntriesRequest, + InstallSnapshotRequest as ProtoInstallSnapshotRequest, LogEntry as ProtoLogEntry, + VoteRequest as ProtoVoteRequest, +}; +use chainfire_raft::network::{RaftNetworkError, RaftRpcClient}; +use chainfire_raft::TypeConfig; +use chainfire_types::NodeId; +use openraft::raft::{ + AppendEntriesRequest, AppendEntriesResponse, InstallSnapshotRequest, InstallSnapshotResponse, + VoteRequest, VoteResponse, +}; +use openraft::{CommittedLeaderId, LogId, Vote}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::RwLock; +use tonic::transport::Channel; +use tracing::{debug, error, trace, warn}; + +/// Configuration for RPC retry behavior with exponential backoff. 
+#[derive(Debug, Clone)] +pub struct RetryConfig { + /// Initial timeout for RPC calls (default: 500ms) + pub initial_timeout: Duration, + /// Maximum timeout after backoff (default: 30s) + pub max_timeout: Duration, + /// Maximum number of retry attempts (default: 3) + pub max_retries: u32, + /// Backoff multiplier between retries (default: 2.0) + pub backoff_multiplier: f64, +} + +impl Default for RetryConfig { + fn default() -> Self { + Self { + initial_timeout: Duration::from_millis(500), + max_timeout: Duration::from_secs(30), + max_retries: 3, + backoff_multiplier: 2.0, + } + } +} + +impl RetryConfig { + /// Create a new RetryConfig with custom values + pub fn new( + initial_timeout: Duration, + max_timeout: Duration, + max_retries: u32, + backoff_multiplier: f64, + ) -> Self { + Self { + initial_timeout, + max_timeout, + max_retries, + backoff_multiplier, + } + } + + /// Calculate timeout for a given retry attempt (0-indexed) + fn timeout_for_attempt(&self, attempt: u32) -> Duration { + let multiplier = self.backoff_multiplier.powi(attempt as i32); + let timeout_millis = (self.initial_timeout.as_millis() as f64 * multiplier) as u64; + let timeout = Duration::from_millis(timeout_millis); + timeout.min(self.max_timeout) + } +} + +/// gRPC-based Raft RPC client with retry support +pub struct GrpcRaftClient { + /// Cached gRPC clients per node + clients: Arc>>>, + /// Node address mapping + node_addrs: Arc>>, + /// Retry configuration + retry_config: RetryConfig, +} + +impl GrpcRaftClient { + /// Create a new gRPC Raft client with default retry config + pub fn new() -> Self { + Self { + clients: Arc::new(RwLock::new(HashMap::new())), + node_addrs: Arc::new(RwLock::new(HashMap::new())), + retry_config: RetryConfig::default(), + } + } + + /// Create a new gRPC Raft client with custom retry config + pub fn new_with_retry(retry_config: RetryConfig) -> Self { + Self { + clients: Arc::new(RwLock::new(HashMap::new())), + node_addrs: 
Arc::new(RwLock::new(HashMap::new())), + retry_config, + } + } + + /// Add or update a node's address + pub async fn add_node(&self, id: NodeId, addr: String) { + debug!(node_id = id, addr = %addr, "Adding node address"); + self.node_addrs.write().await.insert(id, addr); + } + + /// Remove a node + pub async fn remove_node(&self, id: NodeId) { + self.node_addrs.write().await.remove(&id); + self.clients.write().await.remove(&id); + } + + /// Get or create a gRPC client for the target node + async fn get_client(&self, target: NodeId) -> Result, RaftNetworkError> { + // Check cache first + { + let clients = self.clients.read().await; + if let Some(client) = clients.get(&target) { + return Ok(client.clone()); + } + } + + // Get address + let addr = { + let addrs = self.node_addrs.read().await; + addrs.get(&target).cloned() + }; + + let addr = addr.ok_or(RaftNetworkError::NodeNotFound(target))?; + + // Create new connection + let endpoint = format!("http://{}", addr); + trace!(target = target, endpoint = %endpoint, "Connecting to node"); + + let channel = Channel::from_shared(endpoint.clone()) + .map_err(|e| RaftNetworkError::ConnectionFailed { + node_id: target, + reason: e.to_string(), + })? 
+ .connect() + .await + .map_err(|e| RaftNetworkError::ConnectionFailed { + node_id: target, + reason: e.to_string(), + })?; + + let client = RaftServiceClient::new(channel); + + // Cache the client + self.clients.write().await.insert(target, client.clone()); + + Ok(client) + } + + /// Invalidate cached client for a node (e.g., on connection failure) + async fn invalidate_client(&self, target: NodeId) { + self.clients.write().await.remove(&target); + } + + /// Execute an async operation with retry and exponential backoff + async fn with_retry( + &self, + target: NodeId, + rpc_name: &str, + mut operation: F, + ) -> Result + where + F: FnMut() -> Fut, + Fut: std::future::Future>, + { + let mut last_error = None; + + for attempt in 0..=self.retry_config.max_retries { + let timeout = self.retry_config.timeout_for_attempt(attempt); + + trace!( + target = target, + rpc = rpc_name, + attempt = attempt, + timeout_ms = timeout.as_millis(), + "Attempting RPC" + ); + + match tokio::time::timeout(timeout, operation()).await { + Ok(Ok(result)) => return Ok(result), + Ok(Err(e)) => { + warn!( + target = target, + rpc = rpc_name, + attempt = attempt, + error = %e, + "RPC failed" + ); + // Invalidate cached client on failure + self.invalidate_client(target).await; + last_error = Some(e); + } + Err(_) => { + warn!( + target = target, + rpc = rpc_name, + attempt = attempt, + timeout_ms = timeout.as_millis(), + "RPC timed out" + ); + // Invalidate cached client on timeout + self.invalidate_client(target).await; + last_error = Some(RaftNetworkError::RpcFailed(format!( + "{} timed out after {}ms", + rpc_name, + timeout.as_millis() + ))); + } + } + + // Wait before retry (backoff delay) + if attempt < self.retry_config.max_retries { + let backoff_delay = self.retry_config.timeout_for_attempt(attempt); + tokio::time::sleep(backoff_delay).await; + } + } + + Err(last_error.unwrap_or_else(|| { + RaftNetworkError::RpcFailed(format!( + "{} failed after {} retries", + rpc_name, 
self.retry_config.max_retries + )) + })) + } +} + +impl Default for GrpcRaftClient { + fn default() -> Self { + Self::new() + } +} + +#[async_trait::async_trait] +impl RaftRpcClient for GrpcRaftClient { + async fn vote( + &self, + target: NodeId, + req: VoteRequest, + ) -> Result, RaftNetworkError> { + trace!(target = target, term = req.vote.leader_id().term, "Sending vote request"); + + self.with_retry(target, "vote", || async { + let mut client = self.get_client(target).await?; + + // Convert to proto request + let proto_req = ProtoVoteRequest { + term: req.vote.leader_id().term, + candidate_id: req.vote.leader_id().node_id, + last_log_index: req.last_log_id.map(|id| id.index).unwrap_or(0), + last_log_term: req.last_log_id.map(|id| id.leader_id.term).unwrap_or(0), + }; + + let response = client + .vote(proto_req) + .await + .map_err(|e| RaftNetworkError::RpcFailed(e.to_string()))?; + + let resp = response.into_inner(); + + // Convert from proto response + let last_log_id = if resp.last_log_index > 0 { + Some(LogId::new( + CommittedLeaderId::new(resp.last_log_term, 0), + resp.last_log_index, + )) + } else { + None + }; + + Ok(VoteResponse { + vote: Vote::new(resp.term, target), + vote_granted: resp.vote_granted, + last_log_id, + }) + }) + .await + } + + async fn append_entries( + &self, + target: NodeId, + req: AppendEntriesRequest, + ) -> Result, RaftNetworkError> { + trace!( + target = target, + entries = req.entries.len(), + "Sending append entries" + ); + + // Clone entries once for potential retries + let entries_data: Vec<(u64, u64, Vec)> = req + .entries + .iter() + .map(|e| { + let data = match &e.payload { + openraft::EntryPayload::Blank => vec![], + openraft::EntryPayload::Normal(cmd) => { + bincode::serialize(cmd).unwrap_or_default() + } + openraft::EntryPayload::Membership(_) => vec![], + }; + (e.log_id.index, e.log_id.leader_id.term, data) + }) + .collect(); + + let term = req.vote.leader_id().term; + let leader_id = req.vote.leader_id().node_id; + 
let prev_log_index = req.prev_log_id.map(|id| id.index).unwrap_or(0); + let prev_log_term = req.prev_log_id.map(|id| id.leader_id.term).unwrap_or(0); + let leader_commit = req.leader_commit.map(|id| id.index).unwrap_or(0); + + self.with_retry(target, "append_entries", || { + let entries_data = entries_data.clone(); + async move { + let mut client = self.get_client(target).await?; + + let entries: Vec = entries_data + .into_iter() + .map(|(index, term, data)| ProtoLogEntry { index, term, data }) + .collect(); + + let proto_req = ProtoAppendEntriesRequest { + term, + leader_id, + prev_log_index, + prev_log_term, + entries, + leader_commit, + }; + + let response = client + .append_entries(proto_req) + .await + .map_err(|e| RaftNetworkError::RpcFailed(e.to_string()))?; + + let resp = response.into_inner(); + + // Convert response + if resp.success { + Ok(AppendEntriesResponse::Success) + } else if resp.conflict_term > 0 { + Ok(AppendEntriesResponse::HigherVote(Vote::new( + resp.conflict_term, + target, + ))) + } else { + Ok(AppendEntriesResponse::Conflict) + } + } + }) + .await + } + + async fn install_snapshot( + &self, + target: NodeId, + req: InstallSnapshotRequest, + ) -> Result, RaftNetworkError> { + debug!( + target = target, + last_log_id = ?req.meta.last_log_id, + data_len = req.data.len(), + "Sending install snapshot" + ); + + let term = req.vote.leader_id().term; + let leader_id = req.vote.leader_id().node_id; + let last_included_index = req.meta.last_log_id.map(|id| id.index).unwrap_or(0); + let last_included_term = req.meta.last_log_id.map(|id| id.leader_id.term).unwrap_or(0); + let offset = req.offset; + let data = req.data.clone(); + let done = req.done; + + let result = self + .with_retry(target, "install_snapshot", || { + let data = data.clone(); + async move { + let mut client = self.get_client(target).await?; + + let proto_req = ProtoInstallSnapshotRequest { + term, + leader_id, + last_included_index, + last_included_term, + offset, + data, + done, + 
}; + + // Send as stream (single item) + let stream = tokio_stream::once(proto_req); + + let response = client + .install_snapshot(stream) + .await + .map_err(|e| RaftNetworkError::RpcFailed(e.to_string()))?; + + let resp = response.into_inner(); + + Ok(InstallSnapshotResponse { + vote: Vote::new(resp.term, target), + }) + } + }) + .await; + + // Log error for install_snapshot failures + if let Err(ref e) = result { + error!( + target = target, + last_log_id = ?req.meta.last_log_id, + data_len = req.data.len(), + error = %e, + "install_snapshot failed after retries" + ); + } + + result + } +} diff --git a/chainfire/crates/chainfire-api/src/watch_service.rs b/chainfire/crates/chainfire-api/src/watch_service.rs new file mode 100644 index 0000000..71aec81 --- /dev/null +++ b/chainfire/crates/chainfire-api/src/watch_service.rs @@ -0,0 +1,157 @@ +//! Watch service implementation + +use crate::conversions::make_header; +use crate::proto::{ + watch_server::Watch, WatchRequest, WatchResponse, +}; +use chainfire_watch::{WatchRegistry, WatchStream}; +use std::pin::Pin; +use std::sync::Arc; +use tokio::sync::mpsc; +use tokio_stream::{wrappers::ReceiverStream, StreamExt}; +use tonic::{Request, Response, Status, Streaming}; +use tracing::{debug, warn}; + +/// Watch service implementation +pub struct WatchServiceImpl { + /// Watch registry + registry: Arc, + /// Cluster ID + cluster_id: u64, + /// Member ID + member_id: u64, +} + +impl WatchServiceImpl { + /// Create a new watch service + pub fn new(registry: Arc, cluster_id: u64, member_id: u64) -> Self { + Self { + registry, + cluster_id, + member_id, + } + } + + fn make_header(&self, revision: u64) -> crate::proto::ResponseHeader { + make_header(self.cluster_id, self.member_id, revision, 0) + } +} + +#[tonic::async_trait] +impl Watch for WatchServiceImpl { + type WatchStream = Pin> + Send>>; + + async fn watch( + &self, + request: Request>, + ) -> Result, Status> { + let mut in_stream = request.into_inner(); + let registry = 
Arc::clone(&self.registry); + let cluster_id = self.cluster_id; + let member_id = self.member_id; + + // Channel for sending responses back to client + let (tx, rx) = mpsc::channel(128); + let tx_for_events = tx.clone(); + + // Channel for watch events + let (event_tx, mut event_rx) = mpsc::channel::(128); + + // Spawn task to handle the bidirectional stream + tokio::spawn(async move { + let mut stream = WatchStream::new(Arc::clone(®istry), { + let event_tx = event_tx.clone(); + let (watch_tx, mut watch_rx) = mpsc::channel(64); + + // Forward internal watch responses to proto responses + tokio::spawn(async move { + while let Some(resp) = watch_rx.recv().await { + let proto_resp = internal_to_proto_response(resp, cluster_id, member_id); + if event_tx.send(proto_resp).await.is_err() { + break; + } + } + }); + + watch_tx + }); + + while let Some(result) = in_stream.next().await { + match result { + Ok(req) => { + if let Some(request_union) = req.request_union { + let response = match request_union { + crate::proto::watch_request::RequestUnion::CreateRequest(create) => { + let internal_req: chainfire_types::watch::WatchRequest = + create.into(); + let resp = stream.create_watch(internal_req); + internal_to_proto_response(resp, cluster_id, member_id) + } + crate::proto::watch_request::RequestUnion::CancelRequest(cancel) => { + let resp = stream.cancel_watch(cancel.watch_id); + internal_to_proto_response(resp, cluster_id, member_id) + } + crate::proto::watch_request::RequestUnion::ProgressRequest(_) => { + // Send progress notification + WatchResponse { + header: Some(make_header( + cluster_id, + member_id, + registry.current_revision(), + 0, + )), + watch_id: 0, + created: false, + canceled: false, + compact_revision: 0, + cancel_reason: String::new(), + events: vec![], + } + } + }; + + if tx.send(Ok(response)).await.is_err() { + break; + } + } + } + Err(e) => { + warn!(error = %e, "Watch stream error"); + break; + } + } + } + + debug!(watches = stream.watch_count(), 
"Watch stream closed"); + // Stream cleanup happens in WatchStream::drop + }); + + // Spawn task to forward watch events + tokio::spawn(async move { + while let Some(response) = event_rx.recv().await { + if tx_for_events.send(Ok(response)).await.is_err() { + break; + } + } + }); + + let output_stream = ReceiverStream::new(rx); + Ok(Response::new(Box::pin(output_stream))) + } +} + +fn internal_to_proto_response( + resp: chainfire_types::watch::WatchResponse, + cluster_id: u64, + member_id: u64, +) -> WatchResponse { + WatchResponse { + header: Some(make_header(cluster_id, member_id, resp.compact_revision, 0)), + watch_id: resp.watch_id, + created: resp.created, + canceled: resp.canceled, + compact_revision: resp.compact_revision as i64, + cancel_reason: String::new(), + events: resp.events.into_iter().map(Into::into).collect(), + } +} diff --git a/chainfire/crates/chainfire-core/Cargo.toml b/chainfire/crates/chainfire-core/Cargo.toml new file mode 100644 index 0000000..e516053 --- /dev/null +++ b/chainfire/crates/chainfire-core/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "chainfire-core" +version.workspace = true +edition.workspace = true +license.workspace = true +description = "Embeddable distributed cluster library with Raft consensus and SWIM gossip" +rust-version.workspace = true + +[dependencies] +# Internal crates +chainfire-types = { workspace = true } +# Note: chainfire-storage, chainfire-raft, chainfire-gossip, chainfire-watch +# will be added as implementation progresses +# chainfire-storage = { workspace = true } +# chainfire-raft = { workspace = true } +# chainfire-gossip = { workspace = true } +# chainfire-watch = { workspace = true } + +# Async runtime +tokio = { workspace = true } +tokio-stream = { workspace = true } +futures = { workspace = true } +async-trait = { workspace = true } + +# Utilities +thiserror = { workspace = true } +tracing = { workspace = true } +bytes = { workspace = true } +parking_lot = { workspace = true } +dashmap = { 
workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["test-util"] } +tempfile = { workspace = true } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-core/src/builder.rs b/chainfire/crates/chainfire-core/src/builder.rs new file mode 100644 index 0000000..5aaae56 --- /dev/null +++ b/chainfire/crates/chainfire-core/src/builder.rs @@ -0,0 +1,221 @@ +//! Builder pattern for cluster creation + +use std::net::SocketAddr; +use std::path::PathBuf; +use std::sync::Arc; + +use chainfire_types::node::NodeRole; +use chainfire_types::RaftRole; + +use crate::callbacks::{ClusterEventHandler, KvEventHandler}; +use crate::cluster::Cluster; +use crate::config::{ClusterConfig, MemberConfig, StorageBackendConfig, TimeoutConfig}; +use crate::error::{ClusterError, Result}; +use crate::events::EventDispatcher; + +/// Builder for creating a Chainfire cluster instance +/// +/// # Example +/// +/// ```ignore +/// use chainfire_core::ClusterBuilder; +/// +/// let cluster = ClusterBuilder::new(1) +/// .name("node-1") +/// .gossip_addr("0.0.0.0:7946".parse()?) +/// .raft_addr("0.0.0.0:2380".parse()?) 
+/// .bootstrap(true) +/// .build() +/// .await?; +/// ``` +pub struct ClusterBuilder { + config: ClusterConfig, + cluster_handlers: Vec>, + kv_handlers: Vec>, +} + +impl ClusterBuilder { + /// Create a new cluster builder with the given node ID + pub fn new(node_id: u64) -> Self { + Self { + config: ClusterConfig { + node_id, + ..Default::default() + }, + cluster_handlers: Vec::new(), + kv_handlers: Vec::new(), + } + } + + /// Set the node name + pub fn name(mut self, name: impl Into) -> Self { + self.config.node_name = name.into(); + self + } + + /// Set the node role (ControlPlane or Worker) + pub fn role(mut self, role: NodeRole) -> Self { + self.config.node_role = role; + self + } + + /// Set the Raft participation role (Voter, Learner, or None) + pub fn raft_role(mut self, role: RaftRole) -> Self { + self.config.raft_role = role; + self + } + + /// Set the API listen address + pub fn api_addr(mut self, addr: SocketAddr) -> Self { + self.config.api_addr = Some(addr); + self + } + + /// Set the Raft listen address (for control plane nodes) + pub fn raft_addr(mut self, addr: SocketAddr) -> Self { + self.config.raft_addr = Some(addr); + self + } + + /// Set the gossip listen address + pub fn gossip_addr(mut self, addr: SocketAddr) -> Self { + self.config.gossip_addr = addr; + self + } + + /// Set the storage backend + pub fn storage(mut self, backend: StorageBackendConfig) -> Self { + self.config.storage = backend; + self + } + + /// Set the data directory (convenience method for RocksDB storage) + pub fn data_dir(mut self, path: impl Into) -> Self { + self.config.storage = StorageBackendConfig::RocksDb { path: path.into() }; + self + } + + /// Use in-memory storage + pub fn memory_storage(mut self) -> Self { + self.config.storage = StorageBackendConfig::Memory; + self + } + + /// Add initial cluster members (for bootstrap) + pub fn initial_members(mut self, members: Vec) -> Self { + self.config.initial_members = members; + self + } + + /// Add a single initial 
member + pub fn add_member(mut self, member: MemberConfig) -> Self { + self.config.initial_members.push(member); + self + } + + /// Enable cluster bootstrap (first node) + pub fn bootstrap(mut self, bootstrap: bool) -> Self { + self.config.bootstrap = bootstrap; + self + } + + /// Set the cluster ID + pub fn cluster_id(mut self, id: u64) -> Self { + self.config.cluster_id = id; + self + } + + /// Enable gRPC API server + pub fn with_grpc_api(mut self, enabled: bool) -> Self { + self.config.enable_grpc_api = enabled; + self + } + + /// Set timeout configuration + pub fn timeouts(mut self, timeouts: TimeoutConfig) -> Self { + self.config.timeouts = timeouts; + self + } + + /// Register a cluster event handler + /// + /// Multiple handlers can be registered. They will all be called + /// when cluster events occur. + pub fn on_cluster_event(mut self, handler: H) -> Self + where + H: ClusterEventHandler + 'static, + { + self.cluster_handlers.push(Arc::new(handler)); + self + } + + /// Register a cluster event handler (Arc version) + pub fn on_cluster_event_arc(mut self, handler: Arc) -> Self { + self.cluster_handlers.push(handler); + self + } + + /// Register a KV event handler + /// + /// Multiple handlers can be registered. They will all be called + /// when KV events occur. 
+ pub fn on_kv_event(mut self, handler: H) -> Self + where + H: KvEventHandler + 'static, + { + self.kv_handlers.push(Arc::new(handler)); + self + } + + /// Register a KV event handler (Arc version) + pub fn on_kv_event_arc(mut self, handler: Arc) -> Self { + self.kv_handlers.push(handler); + self + } + + /// Validate the configuration + fn validate(&self) -> Result<()> { + if self.config.node_id == 0 { + return Err(ClusterError::Config("node_id must be non-zero".into())); + } + + if self.config.node_name.is_empty() { + return Err(ClusterError::Config("node_name is required".into())); + } + + // Raft-participating nodes need a Raft address + if self.config.raft_role.participates_in_raft() && self.config.raft_addr.is_none() { + return Err(ClusterError::Config( + "raft_addr is required for Raft-participating nodes".into(), + )); + } + + Ok(()) + } + + /// Build the cluster instance + /// + /// This initializes the storage backend, Raft (if applicable), and gossip. + pub async fn build(self) -> Result { + self.validate()?; + + // Create event dispatcher with registered handlers + let mut event_dispatcher = EventDispatcher::new(); + for handler in self.cluster_handlers { + event_dispatcher.add_cluster_handler(handler); + } + for handler in self.kv_handlers { + event_dispatcher.add_kv_handler(handler); + } + + // Create the cluster + let cluster = Cluster::new(self.config, event_dispatcher); + + // TODO: Initialize storage backend + // TODO: Initialize Raft if role participates + // TODO: Initialize gossip + // TODO: Start background tasks + + Ok(cluster) + } +} diff --git a/chainfire/crates/chainfire-core/src/callbacks.rs b/chainfire/crates/chainfire-core/src/callbacks.rs new file mode 100644 index 0000000..1dcf8a1 --- /dev/null +++ b/chainfire/crates/chainfire-core/src/callbacks.rs @@ -0,0 +1,103 @@ +//! 
Callback traits for cluster events + +use async_trait::async_trait; + +use chainfire_types::node::NodeInfo; + +use crate::kvs::KvEntry; + +/// Handler for cluster lifecycle events +/// +/// Implement this trait to receive notifications about cluster membership +/// and leadership changes. +#[async_trait] +pub trait ClusterEventHandler: Send + Sync { + /// Called when a node joins the cluster + async fn on_node_joined(&self, _node: &NodeInfo) {} + + /// Called when a node leaves the cluster + async fn on_node_left(&self, _node_id: u64, _reason: LeaveReason) {} + + /// Called when leadership changes + async fn on_leader_changed(&self, _old_leader: Option, _new_leader: u64) {} + + /// Called when this node becomes leader + async fn on_became_leader(&self) {} + + /// Called when this node loses leadership + async fn on_lost_leadership(&self) {} + + /// Called when cluster membership changes + async fn on_membership_changed(&self, _members: &[NodeInfo]) {} + + /// Called when a network partition is detected + async fn on_partition_detected(&self, _reachable: &[u64], _unreachable: &[u64]) {} + + /// Called when cluster is ready (initial leader elected, etc.) + async fn on_cluster_ready(&self) {} +} + +/// Handler for KV store events +/// +/// Implement this trait to receive notifications about key-value changes. 
+#[async_trait] +pub trait KvEventHandler: Send + Sync { + /// Called when a key is created or updated + async fn on_key_changed( + &self, + _namespace: &str, + _key: &[u8], + _value: &[u8], + _revision: u64, + ) { + } + + /// Called when a key is deleted + async fn on_key_deleted(&self, _namespace: &str, _key: &[u8], _revision: u64) {} + + /// Called when multiple keys with a prefix are changed + async fn on_prefix_changed(&self, _namespace: &str, _prefix: &[u8], _entries: &[KvEntry]) {} +} + +/// Reason for node departure from the cluster +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LeaveReason { + /// Node left gracefully + Graceful, + + /// Node timed out (failed to respond) + Timeout, + + /// Network partition detected + NetworkPartition, + + /// Node was explicitly evicted + Evicted, + + /// Unknown reason + Unknown, +} + +impl std::fmt::Display for LeaveReason { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LeaveReason::Graceful => write!(f, "graceful"), + LeaveReason::Timeout => write!(f, "timeout"), + LeaveReason::NetworkPartition => write!(f, "network_partition"), + LeaveReason::Evicted => write!(f, "evicted"), + LeaveReason::Unknown => write!(f, "unknown"), + } + } +} + +/// A no-op event handler for when callbacks are not needed +pub struct NoOpClusterEventHandler; + +#[async_trait] +impl ClusterEventHandler for NoOpClusterEventHandler {} + +/// A no-op KV event handler +pub struct NoOpKvEventHandler; + +#[async_trait] +impl KvEventHandler for NoOpKvEventHandler {} diff --git a/chainfire/crates/chainfire-core/src/cluster.rs b/chainfire/crates/chainfire-core/src/cluster.rs new file mode 100644 index 0000000..774a5ad --- /dev/null +++ b/chainfire/crates/chainfire-core/src/cluster.rs @@ -0,0 +1,282 @@ +//! 
Cluster management + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use parking_lot::RwLock; +use tokio::sync::broadcast; + +use chainfire_types::node::NodeInfo; + +use crate::config::ClusterConfig; +use crate::error::{ClusterError, Result}; +use crate::events::EventDispatcher; +use crate::kvs::{Kv, KvHandle}; + +/// Current state of the cluster +#[derive(Debug, Clone)] +pub struct ClusterState { + /// Whether this node is the leader + pub is_leader: bool, + + /// Current leader's node ID + pub leader_id: Option, + + /// Current term (Raft) + pub term: u64, + + /// All known cluster members + pub members: Vec, + + /// Whether the cluster is ready (initial leader elected) + pub ready: bool, +} + +impl Default for ClusterState { + fn default() -> Self { + Self { + is_leader: false, + leader_id: None, + term: 0, + members: Vec::new(), + ready: false, + } + } +} + +/// Main cluster instance +/// +/// This is the primary interface for interacting with a Chainfire cluster. +/// It manages Raft consensus, gossip membership, and the distributed KV store. 
+pub struct Cluster { + /// Node configuration + config: ClusterConfig, + + /// Current cluster state + state: Arc>, + + /// KV store + kv: Arc, + + /// Event dispatcher + event_dispatcher: Arc, + + /// Shutdown flag + shutdown: AtomicBool, + + /// Shutdown signal sender + shutdown_tx: broadcast::Sender<()>, +} + +impl Cluster { + /// Create a new cluster instance + pub(crate) fn new( + config: ClusterConfig, + event_dispatcher: EventDispatcher, + ) -> Self { + let (shutdown_tx, _) = broadcast::channel(1); + + Self { + config, + state: Arc::new(RwLock::new(ClusterState::default())), + kv: Arc::new(Kv::new()), + event_dispatcher: Arc::new(event_dispatcher), + shutdown: AtomicBool::new(false), + shutdown_tx, + } + } + + /// Get this node's ID + pub fn node_id(&self) -> u64 { + self.config.node_id + } + + /// Get this node's name + pub fn node_name(&self) -> &str { + &self.config.node_name + } + + /// Get a handle for interacting with the cluster + /// + /// Handles are lightweight and can be cloned freely. + pub fn handle(&self) -> ClusterHandle { + ClusterHandle { + node_id: self.config.node_id, + state: self.state.clone(), + kv: self.kv.clone(), + shutdown_tx: self.shutdown_tx.clone(), + } + } + + /// Get the KV store interface + pub fn kv(&self) -> &Arc { + &self.kv + } + + /// Get current cluster state + pub fn state(&self) -> ClusterState { + self.state.read().clone() + } + + /// Check if this node is the leader + pub fn is_leader(&self) -> bool { + self.state.read().is_leader + } + + /// Get current leader ID + pub fn leader(&self) -> Option { + self.state.read().leader_id + } + + /// Get all cluster members + pub fn members(&self) -> Vec { + self.state.read().members.clone() + } + + /// Check if the cluster is ready + pub fn is_ready(&self) -> bool { + self.state.read().ready + } + + /// Join an existing cluster + /// + /// Connects to seed nodes and joins the cluster. 
+ pub async fn join(&self, _seed_addrs: &[std::net::SocketAddr]) -> Result<()> { + // TODO: Implement cluster joining via gossip + Ok(()) + } + + /// Leave the cluster gracefully + pub async fn leave(&self) -> Result<()> { + // TODO: Implement graceful leave + self.shutdown(); + Ok(()) + } + + /// Add a new node to the cluster (leader only) + pub async fn add_node(&self, _node: NodeInfo, _as_learner: bool) -> Result<()> { + if !self.is_leader() { + return Err(ClusterError::NotLeader { + leader_id: self.leader(), + }); + } + + // TODO: Implement node addition via Raft + Ok(()) + } + + /// Remove a node from the cluster (leader only) + pub async fn remove_node(&self, _node_id: u64) -> Result<()> { + if !self.is_leader() { + return Err(ClusterError::NotLeader { + leader_id: self.leader(), + }); + } + + // TODO: Implement node removal via Raft + Ok(()) + } + + /// Promote a learner to voter (leader only) + pub async fn promote_learner(&self, _node_id: u64) -> Result<()> { + if !self.is_leader() { + return Err(ClusterError::NotLeader { + leader_id: self.leader(), + }); + } + + // TODO: Implement learner promotion via Raft + Ok(()) + } + + /// Run the cluster (blocks until shutdown) + pub async fn run(self) -> Result<()> { + self.run_until_shutdown(std::future::pending()).await + } + + /// Run with graceful shutdown signal + pub async fn run_until_shutdown(self, shutdown_signal: F) -> Result<()> + where + F: std::future::Future, + { + let mut shutdown_rx = self.shutdown_tx.subscribe(); + + tokio::select! 
{ + _ = shutdown_signal => { + tracing::info!("Received shutdown signal"); + } + _ = shutdown_rx.recv() => { + tracing::info!("Received internal shutdown"); + } + } + + // TODO: Cleanup resources + + Ok(()) + } + + /// Trigger shutdown + pub fn shutdown(&self) { + self.shutdown.store(true, Ordering::SeqCst); + let _ = self.shutdown_tx.send(()); + } + + /// Check if shutdown was requested + pub fn is_shutting_down(&self) -> bool { + self.shutdown.load(Ordering::SeqCst) + } + + /// Get the event dispatcher + pub(crate) fn event_dispatcher(&self) -> &Arc { + &self.event_dispatcher + } +} + +/// Lightweight handle for cluster operations +/// +/// This handle can be cloned and passed around cheaply. It provides +/// access to cluster state and the KV store without owning the cluster. +#[derive(Clone)] +pub struct ClusterHandle { + node_id: u64, + state: Arc>, + kv: Arc, + shutdown_tx: broadcast::Sender<()>, +} + +impl ClusterHandle { + /// Get this node's ID + pub fn node_id(&self) -> u64 { + self.node_id + } + + /// Get a KV handle + pub fn kv(&self) -> KvHandle { + KvHandle::new(self.kv.clone()) + } + + /// Check if this node is the leader + pub fn is_leader(&self) -> bool { + self.state.read().is_leader + } + + /// Get current leader ID + pub fn leader(&self) -> Option { + self.state.read().leader_id + } + + /// Get all cluster members + pub fn members(&self) -> Vec { + self.state.read().members.clone() + } + + /// Get current cluster state + pub fn state(&self) -> ClusterState { + self.state.read().clone() + } + + /// Trigger cluster shutdown + pub fn shutdown(&self) { + let _ = self.shutdown_tx.send(()); + } +} diff --git a/chainfire/crates/chainfire-core/src/config.rs b/chainfire/crates/chainfire-core/src/config.rs new file mode 100644 index 0000000..c206d4f --- /dev/null +++ b/chainfire/crates/chainfire-core/src/config.rs @@ -0,0 +1,162 @@ +//! 
Configuration types for chainfire-core + +use std::net::SocketAddr; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +use chainfire_types::node::NodeRole; +use chainfire_types::RaftRole; + +// Forward declaration - will be implemented in chainfire-storage +// For now, use a placeholder trait +use async_trait::async_trait; + +/// Storage backend trait for pluggable storage +#[async_trait] +pub trait StorageBackend: Send + Sync { + /// Get a value by key + async fn get(&self, key: &[u8]) -> std::io::Result>>; + /// Put a value + async fn put(&self, key: &[u8], value: &[u8]) -> std::io::Result<()>; + /// Delete a key + async fn delete(&self, key: &[u8]) -> std::io::Result; +} + +/// Configuration for a cluster node +#[derive(Debug, Clone)] +pub struct ClusterConfig { + /// Unique node ID + pub node_id: u64, + + /// Human-readable node name + pub node_name: String, + + /// Node role (ControlPlane or Worker) + pub node_role: NodeRole, + + /// Raft participation role (Voter, Learner, or None) + pub raft_role: RaftRole, + + /// API listen address for client connections + pub api_addr: Option, + + /// Raft listen address for peer-to-peer Raft communication + pub raft_addr: Option, + + /// Gossip listen address for membership discovery + pub gossip_addr: SocketAddr, + + /// Storage backend configuration + pub storage: StorageBackendConfig, + + /// Initial cluster members for bootstrap + pub initial_members: Vec, + + /// Whether to bootstrap the cluster (first node) + pub bootstrap: bool, + + /// Cluster ID + pub cluster_id: u64, + + /// Enable gRPC API server + pub enable_grpc_api: bool, + + /// Timeouts + pub timeouts: TimeoutConfig, +} + +impl Default for ClusterConfig { + fn default() -> Self { + Self { + node_id: 0, + node_name: String::new(), + node_role: NodeRole::ControlPlane, + raft_role: RaftRole::Voter, + api_addr: None, + raft_addr: None, + gossip_addr: "0.0.0.0:7946".parse().unwrap(), + storage: StorageBackendConfig::Memory, + 
initial_members: Vec::new(), + bootstrap: false, + cluster_id: 1, + enable_grpc_api: false, + timeouts: TimeoutConfig::default(), + } + } +} + +/// Storage backend configuration +#[derive(Clone)] +pub enum StorageBackendConfig { + /// In-memory storage (for testing/simple deployments) + Memory, + + /// RocksDB storage + RocksDb { + /// Data directory path + path: PathBuf, + }, + + /// Custom storage backend + Custom(Arc), +} + +impl std::fmt::Debug for StorageBackendConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + StorageBackendConfig::Memory => write!(f, "Memory"), + StorageBackendConfig::RocksDb { path } => { + f.debug_struct("RocksDb").field("path", path).finish() + } + StorageBackendConfig::Custom(_) => write!(f, "Custom(...)"), + } + } +} + +/// Configuration for a cluster member +#[derive(Debug, Clone)] +pub struct MemberConfig { + /// Node ID + pub id: u64, + + /// Node name + pub name: String, + + /// Raft address + pub raft_addr: String, + + /// Client API address + pub client_addr: String, +} + +/// Timeout configuration +#[derive(Debug, Clone)] +pub struct TimeoutConfig { + /// Raft heartbeat interval + pub heartbeat_interval: Duration, + + /// Raft election timeout range (min) + pub election_timeout_min: Duration, + + /// Raft election timeout range (max) + pub election_timeout_max: Duration, + + /// Connection timeout + pub connection_timeout: Duration, + + /// Request timeout + pub request_timeout: Duration, +} + +impl Default for TimeoutConfig { + fn default() -> Self { + Self { + heartbeat_interval: Duration::from_millis(150), + election_timeout_min: Duration::from_millis(300), + election_timeout_max: Duration::from_millis(600), + connection_timeout: Duration::from_secs(5), + request_timeout: Duration::from_secs(10), + } + } +} diff --git a/chainfire/crates/chainfire-core/src/error.rs b/chainfire/crates/chainfire-core/src/error.rs new file mode 100644 index 0000000..0fcc21e --- /dev/null +++ 
b/chainfire/crates/chainfire-core/src/error.rs @@ -0,0 +1,78 @@ +//! Error types for chainfire-core + +use thiserror::Error; + +/// Result type for chainfire-core operations +pub type Result = std::result::Result; + +/// Errors that can occur in cluster operations +#[derive(Debug, Error)] +pub enum ClusterError { + /// Storage operation failed + #[error("storage error: {0}")] + Storage(String), + + /// Raft consensus error + #[error("raft error: {0}")] + Raft(String), + + /// Gossip protocol error + #[error("gossip error: {0}")] + Gossip(String), + + /// Network error + #[error("network error: {0}")] + Network(String), + + /// Configuration error + #[error("configuration error: {0}")] + Config(String), + + /// Not the leader - write operations must go to leader + #[error("not the leader, current leader is: {leader_id:?}")] + NotLeader { + /// Current leader's node ID, if known + leader_id: Option, + }, + + /// Key not found + #[error("key not found")] + KeyNotFound, + + /// Compare-and-swap version mismatch + #[error("version mismatch: expected {expected}, got {actual}")] + VersionMismatch { + /// Expected version + expected: u64, + /// Actual version + actual: u64, + }, + + /// Cluster not initialized + #[error("cluster not initialized")] + NotInitialized, + + /// Node already exists in cluster + #[error("node {0} already exists in cluster")] + NodeExists(u64), + + /// Node not found in cluster + #[error("node {0} not found in cluster")] + NodeNotFound(u64), + + /// Operation timed out + #[error("operation timed out")] + Timeout, + + /// Cluster is shutting down + #[error("cluster is shutting down")] + ShuttingDown, + + /// Internal error + #[error("internal error: {0}")] + Internal(String), + + /// IO error + #[error("io error: {0}")] + Io(#[from] std::io::Error), +} diff --git a/chainfire/crates/chainfire-core/src/events.rs b/chainfire/crates/chainfire-core/src/events.rs new file mode 100644 index 0000000..1395e67 --- /dev/null +++ 
b/chainfire/crates/chainfire-core/src/events.rs @@ -0,0 +1,198 @@ +//! Event types and dispatcher + +use std::sync::Arc; + +use tokio::sync::broadcast; + +use chainfire_types::node::NodeInfo; + +use crate::callbacks::{ClusterEventHandler, KvEventHandler, LeaveReason}; + +/// Cluster-level events +#[derive(Debug, Clone)] +pub enum ClusterEvent { + /// A node joined the cluster + NodeJoined(NodeInfo), + + /// A node left the cluster + NodeLeft { + /// The node ID that left + node_id: u64, + /// Why the node left + reason: LeaveReason, + }, + + /// Leadership changed + LeaderChanged { + /// Previous leader (None if no previous leader) + old: Option, + /// New leader + new: u64, + }, + + /// This node became the leader + BecameLeader, + + /// This node lost leadership + LostLeadership, + + /// Cluster membership changed + MembershipChanged(Vec), + + /// Network partition detected + PartitionDetected { + /// Nodes that are reachable + reachable: Vec, + /// Nodes that are unreachable + unreachable: Vec, + }, + + /// Cluster is ready + ClusterReady, +} + +/// KV store events +#[derive(Debug, Clone)] +pub enum KvEvent { + /// A key was created or updated + KeyChanged { + /// Namespace of the key + namespace: String, + /// The key that changed + key: Vec, + /// New value + value: Vec, + /// Revision number + revision: u64, + }, + + /// A key was deleted + KeyDeleted { + /// Namespace of the key + namespace: String, + /// The key that was deleted + key: Vec, + /// Revision number + revision: u64, + }, +} + +/// Event dispatcher that manages callbacks and event broadcasting +pub struct EventDispatcher { + cluster_handlers: Vec>, + kv_handlers: Vec>, + event_tx: broadcast::Sender, +} + +impl EventDispatcher { + /// Create a new event dispatcher + pub fn new() -> Self { + let (event_tx, _) = broadcast::channel(1024); + Self { + cluster_handlers: Vec::new(), + kv_handlers: Vec::new(), + event_tx, + } + } + + /// Add a cluster event handler + pub fn add_cluster_handler(&mut self, 
handler: Arc) { + self.cluster_handlers.push(handler); + } + + /// Add a KV event handler + pub fn add_kv_handler(&mut self, handler: Arc) { + self.kv_handlers.push(handler); + } + + /// Get a subscriber for cluster events + pub fn subscribe(&self) -> broadcast::Receiver { + self.event_tx.subscribe() + } + + /// Dispatch a cluster event to all handlers + pub async fn dispatch_cluster_event(&self, event: ClusterEvent) { + // Broadcast to channel subscribers + let _ = self.event_tx.send(event.clone()); + + // Call registered handlers + match &event { + ClusterEvent::NodeJoined(node) => { + for handler in &self.cluster_handlers { + handler.on_node_joined(node).await; + } + } + ClusterEvent::NodeLeft { node_id, reason } => { + for handler in &self.cluster_handlers { + handler.on_node_left(*node_id, *reason).await; + } + } + ClusterEvent::LeaderChanged { old, new } => { + for handler in &self.cluster_handlers { + handler.on_leader_changed(*old, *new).await; + } + } + ClusterEvent::BecameLeader => { + for handler in &self.cluster_handlers { + handler.on_became_leader().await; + } + } + ClusterEvent::LostLeadership => { + for handler in &self.cluster_handlers { + handler.on_lost_leadership().await; + } + } + ClusterEvent::MembershipChanged(members) => { + for handler in &self.cluster_handlers { + handler.on_membership_changed(members).await; + } + } + ClusterEvent::PartitionDetected { + reachable, + unreachable, + } => { + for handler in &self.cluster_handlers { + handler.on_partition_detected(reachable, unreachable).await; + } + } + ClusterEvent::ClusterReady => { + for handler in &self.cluster_handlers { + handler.on_cluster_ready().await; + } + } + } + } + + /// Dispatch a KV event to all handlers + pub async fn dispatch_kv_event(&self, event: KvEvent) { + match &event { + KvEvent::KeyChanged { + namespace, + key, + value, + revision, + } => { + for handler in &self.kv_handlers { + handler + .on_key_changed(namespace, key, value, *revision) + .await; + } + } + 
KvEvent::KeyDeleted { + namespace, + key, + revision, + } => { + for handler in &self.kv_handlers { + handler.on_key_deleted(namespace, key, *revision).await; + } + } + } + } +} + +impl Default for EventDispatcher { + fn default() -> Self { + Self::new() + } +} diff --git a/chainfire/crates/chainfire-core/src/kvs.rs b/chainfire/crates/chainfire-core/src/kvs.rs new file mode 100644 index 0000000..b4ffbac --- /dev/null +++ b/chainfire/crates/chainfire-core/src/kvs.rs @@ -0,0 +1,290 @@ +//! Key-Value store abstraction + +use std::sync::Arc; +use std::time::Duration; + +use dashmap::DashMap; + +use crate::error::{ClusterError, Result}; + +/// KV store interface +/// +/// Provides access to distributed key-value storage with namespace isolation. +pub struct Kv { + namespaces: DashMap>, + default_namespace: Arc, +} + +impl Kv { + /// Create a new KV store + pub(crate) fn new() -> Self { + let default_namespace = Arc::new(KvNamespace::new("default".to_string())); + Self { + namespaces: DashMap::new(), + default_namespace, + } + } + + /// Get or create a namespace + pub fn namespace(&self, name: &str) -> Arc { + if name == "default" { + return self.default_namespace.clone(); + } + + self.namespaces + .entry(name.to_string()) + .or_insert_with(|| Arc::new(KvNamespace::new(name.to_string()))) + .clone() + } + + /// Get the default namespace + pub fn default_namespace(&self) -> &Arc { + &self.default_namespace + } + + // Convenience methods on default namespace + + /// Get a value by key from the default namespace + pub async fn get(&self, key: impl AsRef<[u8]>) -> Result>> { + self.default_namespace.get(key).await + } + + /// Put a value in the default namespace + pub async fn put(&self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result { + self.default_namespace.put(key, value).await + } + + /// Delete a key from the default namespace + pub async fn delete(&self, key: impl AsRef<[u8]>) -> Result { + self.default_namespace.delete(key).await + } + + /// 
Compare-and-swap in the default namespace + pub async fn compare_and_swap( + &self, + key: impl AsRef<[u8]>, + expected_version: u64, + value: impl AsRef<[u8]>, + ) -> Result { + self.default_namespace + .compare_and_swap(key, expected_version, value) + .await + } +} + +/// KV namespace for data isolation +pub struct KvNamespace { + name: String, + // TODO: Add storage backend and raft reference +} + +impl KvNamespace { + pub(crate) fn new(name: String) -> Self { + Self { name } + } + + /// Get the namespace name + pub fn name(&self) -> &str { + &self.name + } + + /// Get a value by key + pub async fn get(&self, _key: impl AsRef<[u8]>) -> Result>> { + // TODO: Implement with storage backend + Ok(None) + } + + /// Get with revision + pub async fn get_with_revision( + &self, + _key: impl AsRef<[u8]>, + ) -> Result, u64)>> { + // TODO: Implement with storage backend + Ok(None) + } + + /// Put a value (goes through Raft if available) + pub async fn put(&self, _key: impl AsRef<[u8]>, _value: impl AsRef<[u8]>) -> Result { + // TODO: Implement with Raft + Ok(0) + } + + /// Put with options + pub async fn put_with_options( + &self, + _key: impl AsRef<[u8]>, + _value: impl AsRef<[u8]>, + _options: KvOptions, + ) -> Result { + // TODO: Implement with Raft + Ok(KvPutResult { + revision: 0, + prev_value: None, + }) + } + + /// Delete a key + pub async fn delete(&self, _key: impl AsRef<[u8]>) -> Result { + // TODO: Implement with Raft + Ok(false) + } + + /// Compare-and-swap + pub async fn compare_and_swap( + &self, + _key: impl AsRef<[u8]>, + expected_version: u64, + _value: impl AsRef<[u8]>, + ) -> Result { + // TODO: Implement with storage backend + Err(ClusterError::VersionMismatch { + expected: expected_version, + actual: 0, + }) + } + + /// Scan keys with prefix + pub async fn scan_prefix( + &self, + _prefix: impl AsRef<[u8]>, + _limit: u32, + ) -> Result> { + // TODO: Implement with storage backend + Ok(Vec::new()) + } + + /// Scan keys in a range + pub async fn 
scan_range( + &self, + _start: impl AsRef<[u8]>, + _end: impl AsRef<[u8]>, + _limit: u32, + ) -> Result> { + // TODO: Implement with storage backend + Ok(Vec::new()) + } + + /// Get with specified consistency level + pub async fn get_with_consistency( + &self, + _key: impl AsRef<[u8]>, + _consistency: ReadConsistency, + ) -> Result>> { + // TODO: Implement with consistency options + Ok(None) + } +} + +/// Options for KV operations +#[derive(Debug, Clone, Default)] +pub struct KvOptions { + /// Lease ID for TTL-based expiration + pub lease_id: Option, + + /// Return previous value + pub prev_kv: bool, + + /// Time-to-live for the key + pub ttl: Option, +} + +/// Result of a put operation +#[derive(Debug, Clone)] +pub struct KvPutResult { + /// New revision after the put + pub revision: u64, + + /// Previous value, if requested and existed + pub prev_value: Option>, +} + +/// A key-value entry with metadata +#[derive(Debug, Clone)] +pub struct KvEntry { + /// The key + pub key: Vec, + + /// The value + pub value: Vec, + + /// Revision when the key was created + pub create_revision: u64, + + /// Revision when the key was last modified + pub mod_revision: u64, + + /// Version number (increments on each update) + pub version: u64, + + /// Lease ID if the key is attached to a lease + pub lease_id: Option, +} + +/// Result of a compare-and-swap operation +#[derive(Debug, Clone)] +pub enum CasResult { + /// CAS succeeded, contains new revision + Success(u64), + + /// CAS failed due to version mismatch + Conflict { + /// Expected version + expected: u64, + /// Actual version found + actual: u64, + }, + + /// Key did not exist + NotFound, +} + +/// Read consistency level +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub enum ReadConsistency { + /// Read from local storage (may be stale) + Local, + + /// Read from any node, but verify with leader's committed index + Serializable, + + /// Read only from leader (linearizable, strongest guarantee) + #[default] + 
Linearizable, +} + +/// Lightweight handle for KV operations +#[derive(Clone)] +pub struct KvHandle { + kv: Arc, +} + +impl KvHandle { + pub(crate) fn new(kv: Arc) -> Self { + Self { kv } + } + + /// Get the underlying KV store + pub fn inner(&self) -> &Arc { + &self.kv + } + + /// Get a value by key + pub async fn get(&self, key: impl AsRef<[u8]>) -> Result>> { + self.kv.get(key).await + } + + /// Put a value + pub async fn put(&self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result { + self.kv.put(key, value).await + } + + /// Delete a key + pub async fn delete(&self, key: impl AsRef<[u8]>) -> Result { + self.kv.delete(key).await + } + + /// Get a namespace + pub fn namespace(&self, name: &str) -> Arc { + self.kv.namespace(name) + } +} diff --git a/chainfire/crates/chainfire-core/src/lib.rs b/chainfire/crates/chainfire-core/src/lib.rs new file mode 100644 index 0000000..e924c25 --- /dev/null +++ b/chainfire/crates/chainfire-core/src/lib.rs @@ -0,0 +1,58 @@ +//! Chainfire Core - Embeddable distributed cluster library +//! +//! This crate provides cluster management, distributed KVS, and event callbacks +//! for embedding Raft consensus and SWIM gossip into applications. +//! +//! # Example +//! +//! ```ignore +//! use chainfire_core::{ClusterBuilder, ClusterEventHandler}; +//! use std::net::SocketAddr; +//! +//! struct MyHandler; +//! +//! impl ClusterEventHandler for MyHandler { +//! async fn on_leader_changed(&self, old: Option, new: u64) { +//! println!("Leader changed: {:?} -> {}", old, new); +//! } +//! } +//! +//! #[tokio::main] +//! async fn main() -> Result<(), Box> { +//! let cluster = ClusterBuilder::new(1) +//! .name("node-1") +//! .gossip_addr("0.0.0.0:7946".parse()?) +//! .raft_addr("0.0.0.0:2380".parse()?) +//! .on_cluster_event(MyHandler) +//! .build() +//! .await?; +//! +//! // Use the KVS +//! cluster.kv().put("key", b"value").await?; +//! +//! Ok(()) +//! } +//! 
``` + +pub mod builder; +pub mod callbacks; +pub mod cluster; +pub mod config; +pub mod error; +pub mod events; +pub mod kvs; + +// Re-exports from chainfire-types +pub use chainfire_types::{ + node::{NodeId, NodeInfo, NodeRole}, + RaftRole, +}; + +// Re-exports from this crate +pub use builder::ClusterBuilder; +pub use callbacks::{ClusterEventHandler, KvEventHandler, LeaveReason}; +pub use cluster::{Cluster, ClusterHandle, ClusterState}; +pub use config::{ClusterConfig, StorageBackend, StorageBackendConfig}; +pub use error::{ClusterError, Result}; +pub use events::{ClusterEvent, EventDispatcher, KvEvent}; +pub use kvs::{CasResult, Kv, KvEntry, KvHandle, KvNamespace, KvOptions, ReadConsistency}; diff --git a/chainfire/crates/chainfire-gossip/Cargo.toml b/chainfire/crates/chainfire-gossip/Cargo.toml new file mode 100644 index 0000000..5901925 --- /dev/null +++ b/chainfire/crates/chainfire-gossip/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "chainfire-gossip" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "Gossip/SWIM protocol integration for Chainfire distributed KVS" + +[dependencies] +chainfire-types = { workspace = true } + +# Gossip (SWIM protocol) +foca = { workspace = true } + +# Async +tokio = { workspace = true } +futures = { workspace = true } + +# Serialization +serde = { workspace = true } +bincode = { workspace = true } + +# Utilities +tracing = { workspace = true } +parking_lot = { workspace = true } +dashmap = { workspace = true } +bytes = { workspace = true } +thiserror = { workspace = true } +rand = "0.9" + +[dev-dependencies] +tokio = { workspace = true, features = ["rt-multi-thread", "macros", "time"] } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-gossip/src/agent.rs b/chainfire/crates/chainfire-gossip/src/agent.rs new file mode 100644 index 0000000..5f04ef3 --- /dev/null +++ b/chainfire/crates/chainfire-gossip/src/agent.rs @@ -0,0 +1,214 @@ +//! 
Gossip agent with UDP transport + +use crate::broadcast::ActualStateBroadcast; +use crate::identity::GossipId; +use crate::membership::{MembershipChange, MembershipState}; +use crate::runtime::GossipRuntime; +use crate::GossipError; +use foca::{Config as FocaConfig, Foca, NoCustomBroadcast, PostcardCodec, Timer}; +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use rand::rngs::SmallRng; +use rand::SeedableRng; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; +use tokio::net::UdpSocket; +use tokio::sync::mpsc; +use tracing::{error, info, trace, warn}; + +/// Default gossip configuration +pub fn default_config() -> FocaConfig { + FocaConfig::simple() +} + +/// Gossip agent managing the SWIM protocol +pub struct GossipAgent { + /// Our identity + identity: GossipId, + /// UDP socket for gossip + socket: Arc, + /// Membership state + membership: Arc, + /// Actual state broadcast handler + broadcast: Arc, + /// Channel for receiving membership changes + membership_rx: mpsc::Receiver, + /// Channel for receiving outgoing packets + outgoing_rx: mpsc::Receiver<(SocketAddr, Vec)>, + /// Channel for receiving timer events + timer_rx: mpsc::Receiver<(Timer, Duration)>, + /// Foca instance + foca: Foca, + /// Runtime for callbacks + runtime: GossipRuntime, +} + +impl GossipAgent { + /// Create a new gossip agent + pub async fn new(identity: GossipId, config: FocaConfig) -> Result { + let socket = UdpSocket::bind(identity.addr) + .await + .map_err(|e| GossipError::BindFailed(e.to_string()))?; + + info!(addr = %identity.addr, node_id = identity.node_id, "Gossip agent bound"); + + let (outgoing_tx, outgoing_rx) = mpsc::channel(1024); + let (timer_tx, timer_rx) = mpsc::channel(256); + let (membership_tx, membership_rx) = mpsc::channel(256); + + let runtime = GossipRuntime::new(outgoing_tx, timer_tx, membership_tx); + + let rng = SmallRng::from_os_rng(); + let foca = Foca::new(identity.clone(), config, rng, PostcardCodec); + + Ok(Self { + 
identity, + socket: Arc::new(socket), + membership: Arc::new(MembershipState::new()), + broadcast: Arc::new(ActualStateBroadcast::new()), + membership_rx, + outgoing_rx, + timer_rx, + foca, + runtime, + }) + } + + /// Get the identity + pub fn identity(&self) -> &GossipId { + &self.identity + } + + /// Get the membership state + pub fn membership(&self) -> &Arc { + &self.membership + } + + /// Get the broadcast handler + pub fn broadcast(&self) -> &Arc { + &self.broadcast + } + + /// Announce to a known cluster member to join + pub fn announce(&mut self, addr: SocketAddr) -> Result<(), GossipError> { + // Create a probe identity for the target + let probe = GossipId::worker(0, addr); + self.foca + .announce(probe, &mut self.runtime) + .map_err(|e| GossipError::JoinFailed(format!("{:?}", e)))?; + + info!(addr = %addr, "Announced to cluster"); + Ok(()) + } + + /// Get current members + pub fn members(&self) -> Vec { + self.foca.iter_members().map(|m| m.id().clone()).collect() + } + + /// Run the gossip agent + pub async fn run(&mut self) -> Result<(), GossipError> { + let mut buf = vec![0u8; 65536]; + let mut timer_handles = FuturesUnordered::new(); + + info!(identity = %self.identity, "Starting gossip agent"); + + loop { + tokio::select! 
{ + // Handle incoming UDP packets + result = self.socket.recv_from(&mut buf) => { + match result { + Ok((len, addr)) => { + trace!(from = %addr, len, "Received gossip packet"); + if let Err(e) = self.foca.handle_data(&buf[..len], &mut self.runtime) { + warn!(error = ?e, "Failed to handle gossip data"); + } + } + Err(e) => { + error!(error = %e, "Failed to receive UDP packet"); + } + } + } + + // Send outgoing packets + Some((addr, data)) = self.outgoing_rx.recv() => { + trace!(to = %addr, len = data.len(), "Sending gossip packet"); + if let Err(e) = self.socket.send_to(&data, addr).await { + warn!(error = %e, to = %addr, "Failed to send UDP packet"); + } + } + + // Schedule timers + Some((timer, duration)) = self.timer_rx.recv() => { + let timer_clone = timer.clone(); + timer_handles.push(async move { + tokio::time::sleep(duration).await; + timer_clone + }); + } + + // Fire timers + Some(timer) = timer_handles.next() => { + if let Err(e) = self.foca.handle_timer(timer, &mut self.runtime) { + warn!(error = ?e, "Failed to handle timer"); + } + } + + // Handle membership changes + Some(change) = self.membership_rx.recv() => { + // Also remove state on member down + if let MembershipChange::MemberDown(ref id) = change { + self.broadcast.remove_state(id.node_id); + } + self.membership.handle_change(change); + } + } + } + } + + /// Run the agent with graceful shutdown + pub async fn run_until_shutdown( + mut self, + mut shutdown: tokio::sync::broadcast::Receiver<()>, + ) -> Result<(), GossipError> { + tokio::select! 
{ + result = self.run() => result, + _ = shutdown.recv() => { + info!("Gossip agent shutting down"); + Ok(()) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chainfire_types::node::NodeRole; + + async fn create_test_agent(port: u16) -> GossipAgent { + let id = GossipId::new( + port as u64, + format!("127.0.0.1:{}", port).parse().unwrap(), + NodeRole::Worker, + ); + GossipAgent::new(id, default_config()).await.unwrap() + } + + #[tokio::test] + async fn test_agent_creation() { + let agent = create_test_agent(15000).await; + assert_eq!(agent.identity().node_id, 15000); + } + + #[tokio::test] + async fn test_membership_empty() { + let agent = create_test_agent(15001).await; + assert_eq!(agent.membership().count(), 0); + } + + // Note: Full gossip tests require multiple agents communicating + // which is complex to set up in unit tests. Integration tests + // would be more appropriate for testing actual gossip behavior. +} diff --git a/chainfire/crates/chainfire-gossip/src/broadcast.rs b/chainfire/crates/chainfire-gossip/src/broadcast.rs new file mode 100644 index 0000000..94e1f14 --- /dev/null +++ b/chainfire/crates/chainfire-gossip/src/broadcast.rs @@ -0,0 +1,210 @@ +//! Custom broadcast handler for actual state propagation + +use chainfire_types::NodeId; +use dashmap::DashMap; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::time::SystemTime; +use tracing::debug; + +/// Actual state data broadcast via gossip +/// +/// This is the "Actual State" mentioned in the design - things like +/// CPU usage, memory, running tasks, etc. that are eventually consistent. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ActualState { + /// Node ID this state is from + pub node_id: NodeId, + /// Timestamp when this state was generated + pub timestamp: u64, + /// CPU usage percentage (0-100) + pub cpu_usage: f32, + /// Memory usage percentage (0-100) + pub memory_usage: f32, + /// Disk usage percentage (0-100) + pub disk_usage: f32, + /// Custom status fields (e.g., "vm-a" -> "running") + pub status: HashMap, +} + +impl ActualState { + /// Create a new actual state + pub fn new(node_id: NodeId) -> Self { + let timestamp = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + + Self { + node_id, + timestamp, + cpu_usage: 0.0, + memory_usage: 0.0, + disk_usage: 0.0, + status: HashMap::new(), + } + } + + /// Set CPU usage + pub fn with_cpu(mut self, usage: f32) -> Self { + self.cpu_usage = usage; + self + } + + /// Set memory usage + pub fn with_memory(mut self, usage: f32) -> Self { + self.memory_usage = usage; + self + } + + /// Set disk usage + pub fn with_disk(mut self, usage: f32) -> Self { + self.disk_usage = usage; + self + } + + /// Add a status entry + pub fn with_status(mut self, key: impl Into, value: impl Into) -> Self { + self.status.insert(key.into(), value.into()); + self + } + + /// Update timestamp to now + pub fn touch(&mut self) { + self.timestamp = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + } +} + +/// Broadcast handler for actual state propagation +pub struct ActualStateBroadcast { + /// Current node's actual state + local_state: RwLock>, + /// Collected states from other nodes + cluster_state: DashMap, +} + +impl ActualStateBroadcast { + /// Create a new broadcast handler + pub fn new() -> Self { + Self { + local_state: RwLock::new(None), + cluster_state: DashMap::new(), + } + } + + /// Set the local node's state + pub fn set_local_state(&self, state: ActualState) { + *self.local_state.write() = Some(state); + } + + /// Get 
the local node's state + pub fn local_state(&self) -> Option { + self.local_state.read().clone() + } + + /// Get a node's state + pub fn get_state(&self, node_id: NodeId) -> Option { + self.cluster_state.get(&node_id).map(|r| r.clone()) + } + + /// Get all cluster states + pub fn all_states(&self) -> Vec { + self.cluster_state.iter().map(|r| r.clone()).collect() + } + + /// Remove a node's state (on member down) + pub fn remove_state(&self, node_id: NodeId) { + self.cluster_state.remove(&node_id); + } +} + +impl Default for ActualStateBroadcast { + fn default() -> Self { + Self::new() + } +} + +impl ActualStateBroadcast { + /// Receive and process state from another node + /// Returns true if the state was newer and accepted + pub fn receive_state(&self, state: ActualState) -> bool { + let node_id = state.node_id; + + // Check if we should update + if let Some(existing) = self.cluster_state.get(&node_id) { + if existing.timestamp >= state.timestamp { + return false; // Stale data + } + } + + debug!(node_id, timestamp = state.timestamp, "Received actual state"); + self.cluster_state.insert(node_id, state); + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_actual_state_creation() { + let state = ActualState::new(1) + .with_cpu(50.0) + .with_memory(75.0) + .with_status("vm-1", "running"); + + assert_eq!(state.node_id, 1); + assert_eq!(state.cpu_usage, 50.0); + assert_eq!(state.memory_usage, 75.0); + assert_eq!(state.status.get("vm-1"), Some(&"running".to_string())); + } + + #[test] + fn test_receive_state() { + let handler = ActualStateBroadcast::new(); + + // Receive first state + let state1 = ActualState::new(1).with_cpu(50.0); + let result = handler.receive_state(state1.clone()); + assert!(result); // Should accept + + // Receive newer state + let mut state2 = ActualState::new(1).with_cpu(60.0); + state2.timestamp = state1.timestamp + 1; + let result = handler.receive_state(state2.clone()); + assert!(result); // Should accept + + // 
Receive older state + let mut state3 = ActualState::new(1).with_cpu(40.0); + state3.timestamp = state1.timestamp - 1; + let result = handler.receive_state(state3); + assert!(!result); // Should reject stale data + + // Verify final state + let stored = handler.get_state(1).unwrap(); + assert_eq!(stored.cpu_usage, 60.0); + } + + #[test] + fn test_cluster_state_collection() { + let handler = ActualStateBroadcast::new(); + + handler + .cluster_state + .insert(1, ActualState::new(1).with_cpu(50.0)); + handler + .cluster_state + .insert(2, ActualState::new(2).with_cpu(60.0)); + + let states = handler.all_states(); + assert_eq!(states.len(), 2); + + handler.remove_state(1); + assert_eq!(handler.all_states().len(), 1); + } +} diff --git a/chainfire/crates/chainfire-gossip/src/identity.rs b/chainfire/crates/chainfire-gossip/src/identity.rs new file mode 100644 index 0000000..9413cba --- /dev/null +++ b/chainfire/crates/chainfire-gossip/src/identity.rs @@ -0,0 +1,147 @@ +//! Node identity for the gossip protocol + +use chainfire_types::node::{NodeId, NodeRole}; +use foca::Identity; +use serde::{Deserialize, Serialize}; +use std::net::SocketAddr; + +/// Node identity for the SWIM gossip protocol +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct GossipId { + /// Unique node identifier + pub node_id: NodeId, + /// UDP address for gossip + pub addr: SocketAddr, + /// Incarnation number - bumped on rejoin to distinguish old/new instances + pub incarnation: u64, + /// Node role + pub role: NodeRole, +} + +impl GossipId { + /// Create a new gossip identity + pub fn new(node_id: NodeId, addr: SocketAddr, role: NodeRole) -> Self { + Self { + node_id, + addr, + incarnation: 0, + role, + } + } + + /// Create a Control Plane node identity + pub fn control_plane(node_id: NodeId, addr: SocketAddr) -> Self { + Self::new(node_id, addr, NodeRole::ControlPlane) + } + + /// Create a Worker node identity + pub fn worker(node_id: NodeId, addr: SocketAddr) -> Self 
{
+        Self::new(node_id, addr, NodeRole::Worker)
+    }
+
+    /// Check if this node is a Control Plane node
+    pub fn is_control_plane(&self) -> bool {
+        self.role == NodeRole::ControlPlane
+    }
+
+    /// Check if this node is a Worker node
+    pub fn is_worker(&self) -> bool {
+        self.role == NodeRole::Worker
+    }
+}
+
+impl Identity for GossipId {
+    type Addr = SocketAddr;
+
+    fn addr(&self) -> SocketAddr {
+        self.addr
+    }
+
+    fn renew(&self) -> Option<Self> {
+        // Create new identity with bumped incarnation
+        Some(Self {
+            incarnation: self.incarnation + 1,
+            ..self.clone()
+        })
+    }
+
+    fn win_addr_conflict(&self, other: &Self) -> bool {
+        // Higher incarnation wins, tie-break by node_id
+        match self.incarnation.cmp(&other.incarnation) {
+            std::cmp::Ordering::Greater => true,
+            std::cmp::Ordering::Less => false,
+            std::cmp::Ordering::Equal => self.node_id > other.node_id,
+        }
+    }
+}
+
+impl std::fmt::Display for GossipId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}@{}:{}",
+            self.node_id,
+            self.addr,
+            self.incarnation
+        )
+    }
+}
+
+impl PartialOrd for GossipId {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for GossipId {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // First compare by node_id, then by incarnation
+        match self.node_id.cmp(&other.node_id) {
+            std::cmp::Ordering::Equal => self.incarnation.cmp(&other.incarnation),
+            other => other,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_identity_creation() {
+        let id = GossipId::control_plane(1, "127.0.0.1:5000".parse().unwrap());
+
+        assert_eq!(id.node_id, 1);
+        assert_eq!(id.incarnation, 0);
+        assert!(id.is_control_plane());
+    }
+
+    #[test]
+    fn test_identity_renew() {
+        let id = GossipId::worker(1, "127.0.0.1:5000".parse().unwrap());
+        let renewed = id.renew().unwrap();
+
+        assert_eq!(renewed.node_id, id.node_id);
+        assert_eq!(renewed.addr, id.addr);
+        assert_eq!(renewed.incarnation,
1); + } + + #[test] + fn test_identity_ordering() { + let id1 = GossipId::new(1, "127.0.0.1:5000".parse().unwrap(), NodeRole::Worker); + let id2 = GossipId::new(2, "127.0.0.1:5001".parse().unwrap(), NodeRole::Worker); + let id1_renewed = id1.renew().unwrap(); + + assert!(id1 < id2); + assert!(id1 < id1_renewed); + } + + #[test] + fn test_serialization() { + let id = GossipId::control_plane(42, "192.168.1.1:5000".parse().unwrap()); + let bytes = bincode::serialize(&id).unwrap(); + let restored: GossipId = bincode::deserialize(&bytes).unwrap(); + + assert_eq!(id, restored); + } +} diff --git a/chainfire/crates/chainfire-gossip/src/lib.rs b/chainfire/crates/chainfire-gossip/src/lib.rs new file mode 100644 index 0000000..c8617b2 --- /dev/null +++ b/chainfire/crates/chainfire-gossip/src/lib.rs @@ -0,0 +1,40 @@ +//! Gossip/SWIM protocol integration for Chainfire distributed KVS +//! +//! This crate provides: +//! - Node identity for SWIM protocol +//! - Gossip agent with UDP transport +//! - Membership management +//! - Actual state broadcast + +pub mod agent; +pub mod broadcast; +pub mod identity; +pub mod membership; +pub mod runtime; + +pub use agent::GossipAgent; +pub use broadcast::ActualState; +pub use identity::GossipId; +pub use membership::MembershipChange; +pub use runtime::GossipRuntime; + +use thiserror::Error; + +/// Gossip protocol errors +#[derive(Error, Debug)] +pub enum GossipError { + #[error("Failed to bind to address: {0}")] + BindFailed(String), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("Serialization error: {0}")] + Serialization(String), + + #[error("Invalid identity: {0}")] + InvalidIdentity(String), + + #[error("Join failed: {0}")] + JoinFailed(String), +} diff --git a/chainfire/crates/chainfire-gossip/src/membership.rs b/chainfire/crates/chainfire-gossip/src/membership.rs new file mode 100644 index 0000000..e0e42de --- /dev/null +++ b/chainfire/crates/chainfire-gossip/src/membership.rs @@ -0,0 +1,141 @@ +//! 
Membership state management
+
+use crate::identity::GossipId;
+use chainfire_types::NodeId;
+use dashmap::DashMap;
+use std::net::SocketAddr;
+use tracing::debug;
+
+/// Membership change event
+#[derive(Debug, Clone)]
+pub enum MembershipChange {
+    /// A member joined or became reachable
+    MemberUp(GossipId),
+    /// A member left or became unreachable
+    MemberDown(GossipId),
+}
+
+/// Manages cluster membership state
+pub struct MembershipState {
+    /// Known members
+    members: DashMap<NodeId, GossipId>,
+}
+
+impl MembershipState {
+    /// Create a new membership state
+    pub fn new() -> Self {
+        Self {
+            members: DashMap::new(),
+        }
+    }
+
+    /// Handle a membership change
+    pub fn handle_change(&self, change: MembershipChange) {
+        match change {
+            MembershipChange::MemberUp(id) => {
+                debug!(node_id = id.node_id, addr = %id.addr, "Adding member");
+                self.members.insert(id.node_id, id);
+            }
+            MembershipChange::MemberDown(id) => {
+                debug!(node_id = id.node_id, "Removing member");
+                self.members.remove(&id.node_id);
+            }
+        }
+    }
+
+    /// Get a member by node ID
+    pub fn get(&self, node_id: NodeId) -> Option<GossipId> {
+        self.members.get(&node_id).map(|r| r.clone())
+    }
+
+    /// Get all members
+    pub fn all(&self) -> Vec<GossipId> {
+        self.members.iter().map(|r| r.clone()).collect()
+    }
+
+    /// Get member count
+    pub fn count(&self) -> usize {
+        self.members.len()
+    }
+
+    /// Check if a node is a member
+    pub fn contains(&self, node_id: NodeId) -> bool {
+        self.members.contains_key(&node_id)
+    }
+
+    /// Get all member addresses
+    pub fn addresses(&self) -> Vec<SocketAddr> {
+        self.members.iter().map(|r| r.addr).collect()
+    }
+
+    /// Get all control plane members
+    pub fn control_plane_members(&self) -> Vec<GossipId> {
+        self.members
+            .iter()
+            .filter(|r| r.is_control_plane())
+            .map(|r| r.clone())
+            .collect()
+    }
+
+    /// Get all worker members
+    pub fn worker_members(&self) -> Vec<GossipId> {
+        self.members
+            .iter()
+            .filter(|r| r.is_worker())
+            .map(|r| r.clone())
+            .collect()
+    }
+}
+
+impl Default for MembershipState {
+    fn default()
-> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chainfire_types::node::NodeRole; + + fn create_id(node_id: NodeId, role: NodeRole) -> GossipId { + GossipId::new( + node_id, + format!("127.0.0.1:{}", 5000 + node_id).parse().unwrap(), + role, + ) + } + + #[test] + fn test_membership_changes() { + let state = MembershipState::new(); + + let id1 = create_id(1, NodeRole::ControlPlane); + let id2 = create_id(2, NodeRole::Worker); + + state.handle_change(MembershipChange::MemberUp(id1.clone())); + state.handle_change(MembershipChange::MemberUp(id2.clone())); + + assert_eq!(state.count(), 2); + assert!(state.contains(1)); + assert!(state.contains(2)); + + state.handle_change(MembershipChange::MemberDown(id1)); + assert_eq!(state.count(), 1); + assert!(!state.contains(1)); + } + + #[test] + fn test_role_filtering() { + let state = MembershipState::new(); + + state.handle_change(MembershipChange::MemberUp(create_id(1, NodeRole::ControlPlane))); + state.handle_change(MembershipChange::MemberUp(create_id(2, NodeRole::ControlPlane))); + state.handle_change(MembershipChange::MemberUp(create_id(3, NodeRole::Worker))); + state.handle_change(MembershipChange::MemberUp(create_id(4, NodeRole::Worker))); + state.handle_change(MembershipChange::MemberUp(create_id(5, NodeRole::Worker))); + + assert_eq!(state.control_plane_members().len(), 2); + assert_eq!(state.worker_members().len(), 3); + } +} diff --git a/chainfire/crates/chainfire-gossip/src/runtime.rs b/chainfire/crates/chainfire-gossip/src/runtime.rs new file mode 100644 index 0000000..b814b03 --- /dev/null +++ b/chainfire/crates/chainfire-gossip/src/runtime.rs @@ -0,0 +1,131 @@ +//! 
Foca runtime implementation + +use crate::identity::GossipId; +use crate::membership::MembershipChange; +use foca::{Notification, Runtime, Timer}; +use std::net::SocketAddr; +use std::time::Duration; +use tokio::sync::mpsc; +use tracing::{debug, trace}; + +/// Foca runtime implementation for async operation +pub struct GossipRuntime { + /// Channel for outgoing UDP packets + outgoing_tx: mpsc::Sender<(SocketAddr, Vec)>, + /// Channel for timer scheduling + timer_tx: mpsc::Sender<(Timer, Duration)>, + /// Channel for membership updates + membership_tx: mpsc::Sender, +} + +impl GossipRuntime { + /// Create a new gossip runtime + pub fn new( + outgoing_tx: mpsc::Sender<(SocketAddr, Vec)>, + timer_tx: mpsc::Sender<(Timer, Duration)>, + membership_tx: mpsc::Sender, + ) -> Self { + Self { + outgoing_tx, + timer_tx, + membership_tx, + } + } +} + +impl Runtime for GossipRuntime { + fn notify(&mut self, notification: Notification) { + match notification { + Notification::MemberUp(id) => { + debug!(node_id = id.node_id, addr = %id.addr, "Member up"); + let _ = self + .membership_tx + .try_send(MembershipChange::MemberUp(id.clone())); + } + Notification::MemberDown(id) => { + debug!(node_id = id.node_id, addr = %id.addr, "Member down"); + let _ = self + .membership_tx + .try_send(MembershipChange::MemberDown(id.clone())); + } + Notification::Idle => { + trace!("Gossip idle"); + } + Notification::Rejoin(id) => { + debug!(node_id = id.node_id, "Member rejoined"); + let _ = self + .membership_tx + .try_send(MembershipChange::MemberUp(id.clone())); + } + Notification::Active => { + trace!("Gossip active"); + } + Notification::Defunct => { + trace!("Member defunct"); + } + Notification::Rename(old, new) => { + debug!(old = old.node_id, new = new.node_id, "Member renamed"); + // Treat as down/up sequence + let _ = self + .membership_tx + .try_send(MembershipChange::MemberDown(old.clone())); + let _ = self + .membership_tx + .try_send(MembershipChange::MemberUp(new.clone())); + } + 
} + } + + fn send_to(&mut self, to: GossipId, data: &[u8]) { + trace!(to = %to.addr, len = data.len(), "Sending gossip packet"); + let _ = self.outgoing_tx.try_send((to.addr, data.to_vec())); + } + + fn submit_after(&mut self, event: Timer, after: Duration) { + trace!(?event, ?after, "Scheduling timer"); + let _ = self.timer_tx.try_send((event, after)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chainfire_types::node::NodeRole; + + #[tokio::test] + async fn test_runtime_notifications() { + let (outgoing_tx, _) = mpsc::channel(10); + let (timer_tx, _) = mpsc::channel(10); + let (membership_tx, mut membership_rx) = mpsc::channel(10); + + let mut runtime = GossipRuntime::new(outgoing_tx, timer_tx, membership_tx); + + let id = GossipId::new(1, "127.0.0.1:5000".parse().unwrap(), NodeRole::Worker); + + runtime.notify(Notification::MemberUp(&id)); + let change = membership_rx.try_recv().unwrap(); + assert!(matches!(change, MembershipChange::MemberUp(_))); + + runtime.notify(Notification::MemberDown(&id)); + let change = membership_rx.try_recv().unwrap(); + assert!(matches!(change, MembershipChange::MemberDown(_))); + } + + #[tokio::test] + async fn test_runtime_send() { + let (outgoing_tx, mut outgoing_rx) = mpsc::channel(10); + let (timer_tx, _) = mpsc::channel(10); + let (membership_tx, _) = mpsc::channel(10); + + let mut runtime = GossipRuntime::new(outgoing_tx, timer_tx, membership_tx); + + let id = GossipId::new(1, "127.0.0.1:5000".parse().unwrap(), NodeRole::Worker); + let data = b"test data"; + + runtime.send_to(id.clone(), data); + + let (recv_addr, recv_data) = outgoing_rx.try_recv().unwrap(); + assert_eq!(recv_addr, id.addr); + assert_eq!(recv_data, data); + } +} diff --git a/chainfire/crates/chainfire-proto/Cargo.toml b/chainfire/crates/chainfire-proto/Cargo.toml new file mode 100644 index 0000000..bf25386 --- /dev/null +++ b/chainfire/crates/chainfire-proto/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "chainfire-proto" +version.workspace = 
true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "Protocol buffer definitions for Chainfire (client-safe, no storage deps)" + +[dependencies] +tonic = { workspace = true } +prost = { workspace = true } +prost-types = { workspace = true } +tokio = { workspace = true } +tokio-stream = { workspace = true } + +[build-dependencies] +tonic-build = { workspace = true } +protoc-bin-vendored = "3" + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-proto/build.rs b/chainfire/crates/chainfire-proto/build.rs new file mode 100644 index 0000000..4be5ee5 --- /dev/null +++ b/chainfire/crates/chainfire-proto/build.rs @@ -0,0 +1,12 @@ +fn main() -> Result<(), Box> { + let protoc_path = protoc_bin_vendored::protoc_bin_path()?; + std::env::set_var("PROTOC", protoc_path); + + tonic_build::configure() + .build_server(false) + .build_client(true) + .compile_protos(&["../../proto/chainfire.proto"], &["../../proto"])?; + + println!("cargo:rerun-if-changed=../../proto/chainfire.proto"); + Ok(()) +} diff --git a/chainfire/crates/chainfire-proto/src/lib.rs b/chainfire/crates/chainfire-proto/src/lib.rs new file mode 100644 index 0000000..f024215 --- /dev/null +++ b/chainfire/crates/chainfire-proto/src/lib.rs @@ -0,0 +1,7 @@ +//! Lightweight protocol buffer definitions for Chainfire (client-side) +//! Generates client stubs only (no storage/backend dependencies). + +// Generated client stubs live under the `proto` module to mirror chainfire-api's re-exports. 
+pub mod proto { + include!(concat!(env!("OUT_DIR"), "/chainfire.v1.rs")); +} diff --git a/chainfire/crates/chainfire-raft/Cargo.toml b/chainfire/crates/chainfire-raft/Cargo.toml new file mode 100644 index 0000000..b2f4f7a --- /dev/null +++ b/chainfire/crates/chainfire-raft/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "chainfire-raft" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "OpenRaft integration for Chainfire distributed KVS" + +[dependencies] +chainfire-types = { workspace = true } +chainfire-storage = { workspace = true } + +# Raft +openraft = { workspace = true } + +# Async +tokio = { workspace = true } +async-trait = { workspace = true } +futures = { workspace = true } + +# Serialization +serde = { workspace = true } +bincode = { workspace = true } + +# Utilities +tracing = { workspace = true } +parking_lot = { workspace = true } +dashmap = { workspace = true } +bytes = { workspace = true } +thiserror = { workspace = true } +anyhow = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-raft/src/config.rs b/chainfire/crates/chainfire-raft/src/config.rs new file mode 100644 index 0000000..7ac8937 --- /dev/null +++ b/chainfire/crates/chainfire-raft/src/config.rs @@ -0,0 +1,79 @@ +//! 
OpenRaft type configuration for Chainfire + +use chainfire_types::command::{RaftCommand, RaftResponse}; +use chainfire_types::NodeId; +use openraft::BasicNode; +use std::io::Cursor; + +// Use the declare_raft_types macro for OpenRaft 0.9 +// NodeId defaults to u64, which matches our chainfire_types::NodeId +openraft::declare_raft_types!( + /// OpenRaft type configuration for Chainfire + pub TypeConfig: + D = RaftCommand, + R = RaftResponse, + Node = BasicNode, +); + +/// Request data type - commands submitted to Raft +pub type Request = RaftCommand; + +/// Response data type - responses from state machine +pub type Response = RaftResponse; + +/// Log ID type +pub type LogId = openraft::LogId; + +/// Vote type +pub type Vote = openraft::Vote; + +/// Snapshot meta type (uses NodeId and Node separately) +pub type SnapshotMeta = openraft::SnapshotMeta; + +/// Membership type (uses NodeId and Node separately) +pub type Membership = openraft::Membership; + +/// Stored membership type +pub type StoredMembership = openraft::StoredMembership; + +/// Entry type +pub type Entry = openraft::Entry; + +/// Leader ID type +pub type LeaderId = openraft::LeaderId; + +/// Committed Leader ID type +pub type CommittedLeaderId = openraft::CommittedLeaderId; + +/// Raft configuration builder +pub fn default_config() -> openraft::Config { + openraft::Config { + cluster_name: "chainfire".into(), + heartbeat_interval: 150, + election_timeout_min: 300, + election_timeout_max: 600, + install_snapshot_timeout: 400, + max_payload_entries: 300, + replication_lag_threshold: 1000, + snapshot_policy: openraft::SnapshotPolicy::LogsSinceLast(5000), + snapshot_max_chunk_size: 3 * 1024 * 1024, // 3MB + max_in_snapshot_log_to_keep: 1000, + purge_batch_size: 256, + enable_tick: true, + enable_heartbeat: true, + enable_elect: true, + ..Default::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_config() { + let config = default_config(); + 
assert_eq!(config.cluster_name, "chainfire"); + assert!(config.heartbeat_interval < config.election_timeout_min); + } +} diff --git a/chainfire/crates/chainfire-raft/src/lib.rs b/chainfire/crates/chainfire-raft/src/lib.rs new file mode 100644 index 0000000..87dc63e --- /dev/null +++ b/chainfire/crates/chainfire-raft/src/lib.rs @@ -0,0 +1,20 @@ +//! OpenRaft integration for Chainfire distributed KVS +//! +//! This crate provides: +//! - TypeConfig for OpenRaft +//! - Network implementation for Raft RPC +//! - Storage adapters +//! - Raft node management + +pub mod config; +pub mod network; +pub mod node; +pub mod storage; + +pub use config::TypeConfig; +pub use network::{NetworkFactory, RaftNetworkError}; +pub use node::RaftNode; +pub use storage::RaftStorage; + +/// Raft type alias with our configuration +pub type Raft = openraft::Raft; diff --git a/chainfire/crates/chainfire-raft/src/network.rs b/chainfire/crates/chainfire-raft/src/network.rs new file mode 100644 index 0000000..1ccfab8 --- /dev/null +++ b/chainfire/crates/chainfire-raft/src/network.rs @@ -0,0 +1,316 @@ +//! Network implementation for Raft RPC +//! +//! This module provides network adapters for OpenRaft to communicate between nodes. 
+ +use crate::config::TypeConfig; +use chainfire_types::NodeId; +use openraft::error::{InstallSnapshotError, NetworkError, RaftError, RPCError, StreamingError, Fatal}; +use openraft::network::{RPCOption, RaftNetwork, RaftNetworkFactory}; +use openraft::raft::{ + AppendEntriesRequest, AppendEntriesResponse, InstallSnapshotRequest, InstallSnapshotResponse, + SnapshotResponse, VoteRequest, VoteResponse, +}; +use openraft::BasicNode; +use std::collections::HashMap; +use std::sync::Arc; +use thiserror::Error; +use tokio::sync::RwLock; +use tracing::{debug, trace}; + +/// Network error type +#[derive(Error, Debug)] +pub enum RaftNetworkError { + #[error("Connection failed to node {node_id}: {reason}")] + ConnectionFailed { node_id: NodeId, reason: String }, + + #[error("RPC failed: {0}")] + RpcFailed(String), + + #[error("Timeout")] + Timeout, + + #[error("Node {0} not found")] + NodeNotFound(NodeId), +} + +/// Trait for sending Raft RPCs +/// This will be implemented by the gRPC client in chainfire-api +#[async_trait::async_trait] +pub trait RaftRpcClient: Send + Sync + 'static { + async fn vote( + &self, + target: NodeId, + req: VoteRequest, + ) -> Result, RaftNetworkError>; + + async fn append_entries( + &self, + target: NodeId, + req: AppendEntriesRequest, + ) -> Result, RaftNetworkError>; + + async fn install_snapshot( + &self, + target: NodeId, + req: InstallSnapshotRequest, + ) -> Result, RaftNetworkError>; +} + +/// Factory for creating network connections to Raft peers +pub struct NetworkFactory { + /// RPC client for sending requests + client: Arc, + /// Node address mapping + nodes: Arc>>, +} + +impl NetworkFactory { + /// Create a new network factory + pub fn new(client: Arc) -> Self { + Self { + client, + nodes: Arc::new(RwLock::new(HashMap::new())), + } + } + + /// Add or update a node's address + pub async fn add_node(&self, id: NodeId, node: BasicNode) { + let mut nodes = self.nodes.write().await; + nodes.insert(id, node); + } + + /// Remove a node + pub 
async fn remove_node(&self, id: NodeId) { + let mut nodes = self.nodes.write().await; + nodes.remove(&id); + } +} + +impl RaftNetworkFactory for NetworkFactory { + type Network = NetworkConnection; + + async fn new_client(&mut self, target: NodeId, node: &BasicNode) -> Self::Network { + // Update our node map + self.nodes.write().await.insert(target, node.clone()); + + NetworkConnection { + target, + node: node.clone(), + client: Arc::clone(&self.client), + } + } +} + +/// A connection to a single Raft peer +pub struct NetworkConnection { + target: NodeId, + node: BasicNode, + client: Arc, +} + +/// Convert our network error to OpenRaft's RPCError +fn to_rpc_error(e: RaftNetworkError) -> RPCError> { + RPCError::Network(NetworkError::new(&e)) +} + +/// Convert our network error to OpenRaft's RPCError with InstallSnapshotError +fn to_snapshot_rpc_error(e: RaftNetworkError) -> RPCError> { + RPCError::Network(NetworkError::new(&e)) +} + +impl RaftNetwork for NetworkConnection { + async fn vote( + &mut self, + req: VoteRequest, + _option: RPCOption, + ) -> Result< + VoteResponse, + RPCError>, + > { + trace!(target = self.target, "Sending vote request"); + + self.client + .vote(self.target, req) + .await + .map_err(to_rpc_error) + } + + async fn append_entries( + &mut self, + req: AppendEntriesRequest, + _option: RPCOption, + ) -> Result< + AppendEntriesResponse, + RPCError>, + > { + trace!( + target = self.target, + entries = req.entries.len(), + "Sending append entries" + ); + + self.client + .append_entries(self.target, req) + .await + .map_err(to_rpc_error) + } + + async fn install_snapshot( + &mut self, + req: InstallSnapshotRequest, + _option: RPCOption, + ) -> Result< + InstallSnapshotResponse, + RPCError>, + > { + debug!( + target = self.target, + last_log_id = ?req.meta.last_log_id, + "Sending install snapshot" + ); + + self.client + .install_snapshot(self.target, req) + .await + .map_err(to_snapshot_rpc_error) + } + + async fn full_snapshot( + &mut self, + 
vote: openraft::Vote, + snapshot: openraft::Snapshot, + _cancel: impl std::future::Future + Send + 'static, + _option: RPCOption, + ) -> Result< + SnapshotResponse, + StreamingError>, + > { + // For simplicity, send snapshot in one chunk + // In production, you'd want to chunk large snapshots + let req = InstallSnapshotRequest { + vote, + meta: snapshot.meta.clone(), + offset: 0, + data: snapshot.snapshot.into_inner(), + done: true, + }; + + debug!( + target = self.target, + last_log_id = ?snapshot.meta.last_log_id, + "Sending full snapshot" + ); + + let resp = self + .client + .install_snapshot(self.target, req) + .await + .map_err(|e| StreamingError::Network(NetworkError::new(&e)))?; + + Ok(SnapshotResponse { vote: resp.vote }) + } +} + +/// In-memory RPC client for testing +#[cfg(test)] +pub mod test_client { + use super::*; + use std::collections::HashMap; + use tokio::sync::mpsc; + + /// A simple in-memory RPC client for testing + pub struct InMemoryRpcClient { + /// Channel senders to each node + channels: Arc>>>, + } + + pub enum RpcMessage { + Vote( + VoteRequest, + tokio::sync::oneshot::Sender>, + ), + AppendEntries( + AppendEntriesRequest, + tokio::sync::oneshot::Sender>, + ), + InstallSnapshot( + InstallSnapshotRequest, + tokio::sync::oneshot::Sender>, + ), + } + + impl InMemoryRpcClient { + pub fn new() -> Self { + Self { + channels: Arc::new(RwLock::new(HashMap::new())), + } + } + + pub async fn register(&self, id: NodeId, tx: mpsc::Sender) { + self.channels.write().await.insert(id, tx); + } + } + + #[async_trait::async_trait] + impl RaftRpcClient for InMemoryRpcClient { + async fn vote( + &self, + target: NodeId, + req: VoteRequest, + ) -> Result, RaftNetworkError> { + let channels = self.channels.read().await; + let tx = channels + .get(&target) + .ok_or(RaftNetworkError::NodeNotFound(target))?; + + let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); + tx.send(RpcMessage::Vote(req, resp_tx)) + .await + .map_err(|_| 
RaftNetworkError::RpcFailed("Channel closed".into()))?; + + resp_rx + .await + .map_err(|_| RaftNetworkError::RpcFailed("Response channel closed".into())) + } + + async fn append_entries( + &self, + target: NodeId, + req: AppendEntriesRequest, + ) -> Result, RaftNetworkError> { + let channels = self.channels.read().await; + let tx = channels + .get(&target) + .ok_or(RaftNetworkError::NodeNotFound(target))?; + + let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); + tx.send(RpcMessage::AppendEntries(req, resp_tx)) + .await + .map_err(|_| RaftNetworkError::RpcFailed("Channel closed".into()))?; + + resp_rx + .await + .map_err(|_| RaftNetworkError::RpcFailed("Response channel closed".into())) + } + + async fn install_snapshot( + &self, + target: NodeId, + req: InstallSnapshotRequest, + ) -> Result, RaftNetworkError> { + let channels = self.channels.read().await; + let tx = channels + .get(&target) + .ok_or(RaftNetworkError::NodeNotFound(target))?; + + let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); + tx.send(RpcMessage::InstallSnapshot(req, resp_tx)) + .await + .map_err(|_| RaftNetworkError::RpcFailed("Channel closed".into()))?; + + resp_rx + .await + .map_err(|_| RaftNetworkError::RpcFailed("Response channel closed".into())) + } + } +} diff --git a/chainfire/crates/chainfire-raft/src/node.rs b/chainfire/crates/chainfire-raft/src/node.rs new file mode 100644 index 0000000..e051ea4 --- /dev/null +++ b/chainfire/crates/chainfire-raft/src/node.rs @@ -0,0 +1,326 @@ +//! Raft node management +//! +//! This module provides the high-level API for managing a Raft node. 
+ +use crate::config::{default_config, TypeConfig}; +use crate::network::{NetworkFactory, RaftRpcClient}; +use crate::storage::RaftStorage; +use crate::Raft; +use chainfire_storage::RocksStore; +use chainfire_types::command::{RaftCommand, RaftResponse}; +use chainfire_types::error::RaftError; +use chainfire_types::NodeId; +use openraft::{BasicNode, Config}; +use std::collections::BTreeMap; +use std::sync::Arc; +use tokio::sync::RwLock; +use tracing::{debug, info}; + +/// A Raft node instance +pub struct RaftNode { + /// Node ID + id: NodeId, + /// OpenRaft instance (wrapped in Arc for sharing) + raft: Arc, + /// Storage + storage: Arc>, + /// Network factory + network: Arc>, + /// Configuration + config: Arc, +} + +impl RaftNode { + /// Create a new Raft node + pub async fn new( + id: NodeId, + store: RocksStore, + rpc_client: Arc, + ) -> Result { + let config = Arc::new(default_config()); + + // Create storage wrapper for local access + let storage = + RaftStorage::new(store.clone()).map_err(|e| RaftError::Internal(e.to_string()))?; + let storage = Arc::new(RwLock::new(storage)); + + let network = NetworkFactory::new(Arc::clone(&rpc_client)); + + // Create log storage and state machine (they share the same underlying store) + let log_storage = RaftStorage::new(store.clone()) + .map_err(|e| RaftError::Internal(e.to_string()))?; + let state_machine = RaftStorage::new(store) + .map_err(|e| RaftError::Internal(e.to_string()))?; + + // Create Raft instance with separate log storage and state machine + let raft = Arc::new( + Raft::new( + id, + config.clone(), + network, + log_storage, + state_machine, + ) + .await + .map_err(|e| RaftError::Internal(e.to_string()))?, + ); + + info!(node_id = id, "Created Raft node"); + + Ok(Self { + id, + raft, + storage, + network: Arc::new(RwLock::new(NetworkFactory::new(rpc_client))), + config, + }) + } + + /// Get the node ID + pub fn id(&self) -> NodeId { + self.id + } + + /// Get the Raft instance (reference) + pub fn raft(&self) 
-> &Raft { + &self.raft + } + + /// Get the Raft instance (Arc clone for sharing) + pub fn raft_arc(&self) -> Arc { + Arc::clone(&self.raft) + } + + /// Get the storage + pub fn storage(&self) -> &Arc> { + &self.storage + } + + /// Initialize a single-node cluster + pub async fn initialize(&self) -> Result<(), RaftError> { + let mut nodes = BTreeMap::new(); + nodes.insert(self.id, BasicNode::default()); + + self.raft + .initialize(nodes) + .await + .map_err(|e| RaftError::Internal(e.to_string()))?; + + info!(node_id = self.id, "Initialized single-node cluster"); + Ok(()) + } + + /// Initialize a multi-node cluster + pub async fn initialize_cluster( + &self, + members: BTreeMap, + ) -> Result<(), RaftError> { + self.raft + .initialize(members) + .await + .map_err(|e| RaftError::Internal(e.to_string()))?; + + info!(node_id = self.id, "Initialized multi-node cluster"); + Ok(()) + } + + /// Add a learner node + pub async fn add_learner( + &self, + id: NodeId, + node: BasicNode, + blocking: bool, + ) -> Result<(), RaftError> { + self.raft + .add_learner(id, node, blocking) + .await + .map_err(|e| RaftError::Internal(e.to_string()))?; + + info!(node_id = id, "Added learner"); + Ok(()) + } + + /// Change cluster membership + pub async fn change_membership( + &self, + members: BTreeMap, + retain: bool, + ) -> Result<(), RaftError> { + let member_ids: std::collections::BTreeSet<_> = members.keys().cloned().collect(); + + self.raft + .change_membership(member_ids, retain) + .await + .map_err(|e| RaftError::Internal(e.to_string()))?; + + info!(?members, "Changed membership"); + Ok(()) + } + + /// Submit a write request (goes through Raft consensus) + pub async fn write(&self, cmd: RaftCommand) -> Result { + let response = self + .raft + .client_write(cmd) + .await + .map_err(|e| match e { + openraft::error::RaftError::APIError( + openraft::error::ClientWriteError::ForwardToLeader(fwd) + ) => RaftError::NotLeader { + leader_id: fwd.leader_id, + }, + _ => 
RaftError::ProposalFailed(e.to_string()), + })?; + + Ok(response.data) + } + + /// Read from the state machine (linearizable read) + pub async fn linearizable_read(&self) -> Result<(), RaftError> { + self.raft + .ensure_linearizable() + .await + .map_err(|e| RaftError::Internal(e.to_string()))?; + + Ok(()) + } + + /// Get current leader ID + pub async fn leader(&self) -> Option { + let metrics = self.raft.metrics().borrow().clone(); + metrics.current_leader + } + + /// Check if this node is the leader + pub async fn is_leader(&self) -> bool { + self.leader().await == Some(self.id) + } + + /// Get current term + pub async fn current_term(&self) -> u64 { + let metrics = self.raft.metrics().borrow().clone(); + metrics.current_term + } + + /// Get cluster membership + pub async fn membership(&self) -> Vec { + let metrics = self.raft.metrics().borrow().clone(); + metrics + .membership_config + .membership() + .voter_ids() + .collect() + } + + /// Shutdown the node + pub async fn shutdown(&self) -> Result<(), RaftError> { + self.raft + .shutdown() + .await + .map_err(|e| RaftError::Internal(e.to_string()))?; + + info!(node_id = self.id, "Raft node shutdown"); + Ok(()) + } + + /// Trigger a snapshot + pub async fn trigger_snapshot(&self) -> Result<(), RaftError> { + self.raft + .trigger() + .snapshot() + .await + .map_err(|e| RaftError::Internal(e.to_string()))?; + + debug!(node_id = self.id, "Triggered snapshot"); + Ok(()) + } +} + +/// Dummy RPC client for initialization +struct DummyRpcClient; + +#[async_trait::async_trait] +impl RaftRpcClient for DummyRpcClient { + async fn vote( + &self, + _target: NodeId, + _req: openraft::raft::VoteRequest, + ) -> Result, crate::network::RaftNetworkError> { + Err(crate::network::RaftNetworkError::RpcFailed( + "Dummy client".into(), + )) + } + + async fn append_entries( + &self, + _target: NodeId, + _req: openraft::raft::AppendEntriesRequest, + ) -> Result, crate::network::RaftNetworkError> + { + 
Err(crate::network::RaftNetworkError::RpcFailed( + "Dummy client".into(), + )) + } + + async fn install_snapshot( + &self, + _target: NodeId, + _req: openraft::raft::InstallSnapshotRequest, + ) -> Result, crate::network::RaftNetworkError> + { + Err(crate::network::RaftNetworkError::RpcFailed( + "Dummy client".into(), + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + async fn create_test_node(id: NodeId) -> RaftNode { + let dir = tempdir().unwrap(); + let store = RocksStore::new(dir.path()).unwrap(); + RaftNode::new(id, store, Arc::new(DummyRpcClient)) + .await + .unwrap() + } + + #[tokio::test] + async fn test_node_creation() { + let node = create_test_node(1).await; + assert_eq!(node.id(), 1); + } + + #[tokio::test] + async fn test_single_node_initialization() { + let node = create_test_node(1).await; + node.initialize().await.unwrap(); + + // Should be leader of single-node cluster + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + + let leader = node.leader().await; + assert_eq!(leader, Some(1)); + } + + #[tokio::test] + async fn test_single_node_write() { + let node = create_test_node(1).await; + node.initialize().await.unwrap(); + + // Wait for leader election + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + + let cmd = RaftCommand::Put { + key: b"test".to_vec(), + value: b"data".to_vec(), + lease_id: None, + prev_kv: false, + }; + + let response = node.write(cmd).await.unwrap(); + assert_eq!(response.revision, 1); + } +} diff --git a/chainfire/crates/chainfire-raft/src/storage.rs b/chainfire/crates/chainfire-raft/src/storage.rs new file mode 100644 index 0000000..06d853b --- /dev/null +++ b/chainfire/crates/chainfire-raft/src/storage.rs @@ -0,0 +1,475 @@ +//! Storage adapters for OpenRaft +//! +//! This module provides the storage traits implementation for OpenRaft using our RocksDB-based storage. 
+ +use crate::config::{CommittedLeaderId, LogId, Membership, StoredMembership, TypeConfig}; +use chainfire_storage::{ + log_storage::{EntryPayload, LogEntry, LogId as InternalLogId, Vote as InternalVote}, + snapshot::{Snapshot, SnapshotBuilder}, + LogStorage, RocksStore, StateMachine, +}; +use chainfire_types::command::{RaftCommand, RaftResponse}; +use chainfire_types::error::StorageError as ChainfireStorageError; +use chainfire_types::NodeId; +use openraft::storage::{LogFlushed, LogState as OpenRaftLogState, RaftLogStorage, RaftStateMachine}; +use openraft::{ + AnyError, BasicNode, Entry, EntryPayload as OpenRaftEntryPayload, + ErrorSubject, ErrorVerb, SnapshotMeta as OpenRaftSnapshotMeta, + StorageError as OpenRaftStorageError, StorageIOError, + Vote as OpenRaftVote, +}; +use std::fmt::Debug; +use std::io::Cursor; +use std::sync::Arc; +use tokio::sync::{mpsc, RwLock}; +use tracing::{debug, info, trace}; + +/// Combined Raft storage implementing OpenRaft traits +pub struct RaftStorage { + /// Underlying RocksDB store + store: RocksStore, + /// Log storage + log: LogStorage, + /// State machine + state_machine: Arc>, + /// Snapshot builder + snapshot_builder: SnapshotBuilder, + /// Current membership + membership: RwLock>, + /// Last applied log ID + last_applied: RwLock>, +} + +/// Convert our storage error to OpenRaft StorageError +fn to_storage_error(e: ChainfireStorageError) -> OpenRaftStorageError { + let io_err = StorageIOError::new( + ErrorSubject::Store, + ErrorVerb::Read, + AnyError::new(&e), + ); + OpenRaftStorageError::IO { source: io_err } +} + +impl RaftStorage { + /// Create new Raft storage + pub fn new(store: RocksStore) -> Result { + let log = LogStorage::new(store.clone()); + let state_machine = Arc::new(RwLock::new(StateMachine::new(store.clone())?)); + let snapshot_builder = SnapshotBuilder::new(store.clone()); + + Ok(Self { + store, + log, + state_machine, + snapshot_builder, + membership: RwLock::new(None), + last_applied: RwLock::new(None), + 
}) + } + + /// Set the watch event sender + pub async fn set_watch_sender(&self, tx: mpsc::UnboundedSender) { + let mut sm = self.state_machine.write().await; + sm.set_watch_sender(tx); + } + + /// Get the state machine + pub fn state_machine(&self) -> &Arc> { + &self.state_machine + } + + /// Convert internal LogId to OpenRaft LogId + fn to_openraft_log_id(id: InternalLogId) -> LogId { + // Create CommittedLeaderId from term (node_id is ignored in std implementation) + let committed_leader_id = CommittedLeaderId::new(id.term, 0); + openraft::LogId::new(committed_leader_id, id.index) + } + + /// Convert OpenRaft LogId to internal LogId + fn from_openraft_log_id(id: &LogId) -> InternalLogId { + InternalLogId::new(id.leader_id.term, id.index) + } + + /// Convert internal Vote to OpenRaft Vote + fn to_openraft_vote(vote: InternalVote) -> OpenRaftVote { + OpenRaftVote::new(vote.term, vote.node_id.unwrap_or(0)) + } + + /// Convert OpenRaft Vote to internal Vote + fn from_openraft_vote(vote: &OpenRaftVote) -> InternalVote { + InternalVote { + term: vote.leader_id().term, + node_id: Some(vote.leader_id().node_id), + committed: vote.is_committed(), + } + } + + /// Convert internal entry to OpenRaft entry + fn to_openraft_entry(entry: LogEntry) -> Entry { + let payload = match entry.payload { + EntryPayload::Blank => OpenRaftEntryPayload::Blank, + EntryPayload::Normal(data) => OpenRaftEntryPayload::Normal(data), + EntryPayload::Membership(members) => { + // Create membership from node IDs + let nodes: std::collections::BTreeMap = members + .into_iter() + .map(|id| (id, BasicNode::default())) + .collect(); + let membership = Membership::new(vec![nodes.keys().cloned().collect()], None); + OpenRaftEntryPayload::Membership(membership) + } + }; + + Entry { + log_id: Self::to_openraft_log_id(entry.log_id), + payload, + } + } + + /// Convert OpenRaft entry to internal entry + fn from_openraft_entry(entry: &Entry) -> LogEntry { + let payload = match &entry.payload { + 
OpenRaftEntryPayload::Blank => EntryPayload::Blank, + OpenRaftEntryPayload::Normal(data) => EntryPayload::Normal(data.clone()), + OpenRaftEntryPayload::Membership(m) => { + let members: Vec = m.voter_ids().collect(); + EntryPayload::Membership(members) + } + }; + + LogEntry { + log_id: Self::from_openraft_log_id(&entry.log_id), + payload, + } + } +} + +impl RaftLogStorage for RaftStorage { + type LogReader = Self; + + async fn get_log_state( + &mut self, + ) -> Result, OpenRaftStorageError> { + let state = self + .log + .get_log_state() + .map_err(to_storage_error)?; + + Ok(OpenRaftLogState { + last_purged_log_id: state.last_purged_log_id.map(Self::to_openraft_log_id), + last_log_id: state.last_log_id.map(Self::to_openraft_log_id), + }) + } + + async fn save_vote( + &mut self, + vote: &OpenRaftVote, + ) -> Result<(), OpenRaftStorageError> { + let internal_vote = Self::from_openraft_vote(vote); + self.log + .save_vote(internal_vote) + .map_err(to_storage_error) + } + + async fn read_vote( + &mut self, + ) -> Result>, OpenRaftStorageError> { + match self.log.read_vote() { + Ok(Some(vote)) => Ok(Some(Self::to_openraft_vote(vote))), + Ok(None) => Ok(None), + Err(e) => Err(to_storage_error(e)), + } + } + + async fn save_committed( + &mut self, + committed: Option, + ) -> Result<(), OpenRaftStorageError> { + // Store committed index in metadata + debug!(?committed, "Saving committed log id"); + Ok(()) + } + + async fn read_committed( + &mut self, + ) -> Result, OpenRaftStorageError> { + // Return the last applied as committed + let last_applied = self.last_applied.read().await; + Ok(last_applied.clone()) + } + + async fn append> + Send>( + &mut self, + entries: I, + callback: LogFlushed, + ) -> Result<(), OpenRaftStorageError> { + let entries: Vec<_> = entries.into_iter().collect(); + if entries.is_empty() { + callback.log_io_completed(Ok(())); + return Ok(()); + } + + let internal_entries: Vec<_> = entries.iter().map(Self::from_openraft_entry).collect(); + + match 
self.log.append(&internal_entries) { + Ok(()) => { + callback.log_io_completed(Ok(())); + Ok(()) + } + Err(e) => { + let io_err = std::io::Error::new(std::io::ErrorKind::Other, e.to_string()); + callback.log_io_completed(Err(io_err)); + Err(to_storage_error(e)) + } + } + } + + async fn truncate( + &mut self, + log_id: LogId, + ) -> Result<(), OpenRaftStorageError> { + self.log + .truncate(log_id.index) + .map_err(to_storage_error) + } + + async fn purge( + &mut self, + log_id: LogId, + ) -> Result<(), OpenRaftStorageError> { + self.log + .purge(log_id.index) + .map_err(to_storage_error) + } + + async fn get_log_reader(&mut self) -> Self::LogReader { + // Return self as the log reader + RaftStorage { + store: self.store.clone(), + log: LogStorage::new(self.store.clone()), + state_machine: Arc::clone(&self.state_machine), + snapshot_builder: SnapshotBuilder::new(self.store.clone()), + membership: RwLock::new(None), + last_applied: RwLock::new(None), + } + } +} + +impl openraft::storage::RaftLogReader for RaftStorage { + async fn try_get_log_entries + Clone + Debug + Send>( + &mut self, + range: RB, + ) -> Result>, OpenRaftStorageError> { + let entries: Vec> = + self.log.get_log_entries(range).map_err(to_storage_error)?; + + Ok(entries.into_iter().map(Self::to_openraft_entry).collect()) + } +} + +impl RaftStateMachine for RaftStorage { + type SnapshotBuilder = Self; + + async fn applied_state( + &mut self, + ) -> Result<(Option, StoredMembership), OpenRaftStorageError> { + let last_applied = self.last_applied.read().await.clone(); + let membership = self + .membership + .read() + .await + .clone() + .unwrap_or_else(|| StoredMembership::new(None, Membership::new(vec![], None))); + + Ok((last_applied, membership)) + } + + async fn apply> + Send>( + &mut self, + entries: I, + ) -> Result, OpenRaftStorageError> { + let mut responses = Vec::new(); + let sm = self.state_machine.write().await; + + for entry in entries { + trace!(log_id = ?entry.log_id, "Applying entry"); + + 
let response = match &entry.payload { + OpenRaftEntryPayload::Blank => RaftResponse::new(sm.current_revision()), + OpenRaftEntryPayload::Normal(cmd) => { + sm.apply(cmd.clone()).map_err(to_storage_error)? + } + OpenRaftEntryPayload::Membership(m) => { + // Update stored membership + let stored = StoredMembership::new(Some(entry.log_id.clone()), m.clone()); + *self.membership.write().await = Some(stored); + RaftResponse::new(sm.current_revision()) + } + }; + + responses.push(response); + + // Update last applied + *self.last_applied.write().await = Some(entry.log_id.clone()); + } + + Ok(responses) + } + + async fn get_snapshot_builder(&mut self) -> Self::SnapshotBuilder { + RaftStorage { + store: self.store.clone(), + log: LogStorage::new(self.store.clone()), + state_machine: Arc::clone(&self.state_machine), + snapshot_builder: SnapshotBuilder::new(self.store.clone()), + membership: RwLock::new(None), + last_applied: RwLock::new(None), + } + } + + async fn begin_receiving_snapshot( + &mut self, + ) -> Result>>, OpenRaftStorageError> { + Ok(Box::new(Cursor::new(Vec::new()))) + } + + async fn install_snapshot( + &mut self, + meta: &OpenRaftSnapshotMeta, + snapshot: Box>>, + ) -> Result<(), OpenRaftStorageError> { + let data = snapshot.into_inner(); + + // Parse and apply snapshot + let snapshot = Snapshot::from_bytes(&data).map_err(to_storage_error)?; + + self.snapshot_builder + .apply(&snapshot) + .map_err(to_storage_error)?; + + // Update state + *self.last_applied.write().await = meta.last_log_id.clone(); + + *self.membership.write().await = Some(meta.last_membership.clone()); + + info!(last_log_id = ?meta.last_log_id, "Installed snapshot"); + Ok(()) + } + + async fn get_current_snapshot( + &mut self, + ) -> Result>, OpenRaftStorageError> { + let last_applied = self.last_applied.read().await.clone(); + let membership = self.membership.read().await.clone(); + + let Some(log_id) = last_applied else { + return Ok(None); + }; + + let membership_ids: Vec = membership + 
.as_ref() + .map(|m| m.membership().voter_ids().collect()) + .unwrap_or_default(); + + let snapshot = self + .snapshot_builder + .build(log_id.index, log_id.leader_id.term, membership_ids) + .map_err(to_storage_error)?; + + let data = snapshot.to_bytes().map_err(to_storage_error)?; + + let last_membership = membership + .unwrap_or_else(|| StoredMembership::new(None, Membership::new(vec![], None))); + + let meta = OpenRaftSnapshotMeta { + last_log_id: Some(log_id), + last_membership, + snapshot_id: format!( + "{}-{}", + self.last_applied.read().await.as_ref().map(|l| l.leader_id.term).unwrap_or(0), + self.last_applied.read().await.as_ref().map(|l| l.index).unwrap_or(0) + ), + }; + + Ok(Some(openraft::Snapshot { + meta, + snapshot: Box::new(Cursor::new(data)), + })) + } +} + +impl openraft::storage::RaftSnapshotBuilder for RaftStorage { + async fn build_snapshot( + &mut self, + ) -> Result, OpenRaftStorageError> { + self.get_current_snapshot() + .await? + .ok_or_else(|| { + let io_err = StorageIOError::new( + ErrorSubject::Snapshot(None), + ErrorVerb::Read, + AnyError::error("No snapshot available"), + ); + OpenRaftStorageError::IO { source: io_err } + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use openraft::RaftLogReader; + use tempfile::tempdir; + + fn create_test_storage() -> RaftStorage { + let dir = tempdir().unwrap(); + let store = RocksStore::new(dir.path()).unwrap(); + RaftStorage::new(store).unwrap() + } + + #[tokio::test] + async fn test_vote_persistence() { + let mut storage = create_test_storage(); + + let vote = OpenRaftVote::new(5, 1); + storage.save_vote(&vote).await.unwrap(); + + let loaded = storage.read_vote().await.unwrap().unwrap(); + assert_eq!(loaded.leader_id().term, 5); + assert_eq!(loaded.leader_id().node_id, 1); + } + + #[tokio::test] + async fn test_log_state_initial() { + let mut storage = create_test_storage(); + + // Initially, log should be empty + let state = storage.get_log_state().await.unwrap(); + 
assert!(state.last_log_id.is_none()); + assert!(state.last_purged_log_id.is_none()); + } + + #[tokio::test] + async fn test_apply_entries() { + let mut storage = create_test_storage(); + + let entries = vec![Entry { + log_id: openraft::LogId::new(CommittedLeaderId::new(1, 0), 1), + payload: OpenRaftEntryPayload::Normal(RaftCommand::Put { + key: b"test".to_vec(), + value: b"data".to_vec(), + lease_id: None, + prev_kv: false, + }), + }]; + + let responses = storage.apply(entries).await.unwrap(); + assert_eq!(responses.len(), 1); + assert_eq!(responses[0].revision, 1); + + // Verify in state machine + let sm = storage.state_machine.read().await; + let entry = sm.kv().get(b"test").unwrap().unwrap(); + assert_eq!(entry.value, b"data"); + } +} diff --git a/chainfire/crates/chainfire-server/Cargo.toml b/chainfire/crates/chainfire-server/Cargo.toml new file mode 100644 index 0000000..3e7fb7f --- /dev/null +++ b/chainfire/crates/chainfire-server/Cargo.toml @@ -0,0 +1,59 @@ +[package] +name = "chainfire-server" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "Chainfire distributed KVS server" + +[lib] +name = "chainfire_server" +path = "src/lib.rs" + +[[bin]] +name = "chainfire" +path = "src/main.rs" + +[dependencies] +chainfire-types = { workspace = true } +chainfire-storage = { workspace = true } +chainfire-raft = { workspace = true } +chainfire-gossip = { workspace = true } +chainfire-watch = { workspace = true } +chainfire-api = { workspace = true } + +# Async +tokio = { workspace = true } +futures = { workspace = true } +async-trait = { workspace = true } + +# Raft (for RPC types) +openraft = { workspace = true } + +# gRPC +tonic = { workspace = true } +tonic-health = { workspace = true } + +# Configuration +clap = { workspace = true } +toml = { workspace = true } +serde = { workspace = true } + +# Logging +tracing = { workspace = true } +tracing-subscriber = { workspace = true } + +# Metrics 
+metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } + +# Utilities +anyhow = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +chainfire-client = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread", "macros", "time"] } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-server/src/config.rs b/chainfire/crates/chainfire-server/src/config.rs new file mode 100644 index 0000000..943bb1e --- /dev/null +++ b/chainfire/crates/chainfire-server/src/config.rs @@ -0,0 +1,160 @@ +//! Server configuration + +use anyhow::Result; +use chainfire_types::RaftRole; +use serde::{Deserialize, Serialize}; +use std::net::SocketAddr; +use std::path::{Path, PathBuf}; + +/// Server configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ServerConfig { + /// Node configuration + pub node: NodeConfig, + /// Storage configuration + pub storage: StorageConfig, + /// Network configuration + pub network: NetworkConfig, + /// Cluster configuration + pub cluster: ClusterConfig, + /// Raft configuration + #[serde(default)] + pub raft: RaftConfig, +} + +/// Node-specific configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NodeConfig { + /// Unique node ID + pub id: u64, + /// Human-readable name + pub name: String, + /// Node role (control_plane or worker) + pub role: String, +} + +/// Storage configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StorageConfig { + /// Data directory + pub data_dir: PathBuf, +} + +/// Network configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NetworkConfig { + /// API listen address (gRPC) + pub api_addr: SocketAddr, + /// Raft listen address + pub raft_addr: SocketAddr, + /// Gossip listen address (UDP) + pub gossip_addr: SocketAddr, +} + +/// Cluster configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ClusterConfig { + /// Cluster ID + pub id: u64, + /// 
Initial cluster members + pub initial_members: Vec, + /// Whether to bootstrap a new cluster + pub bootstrap: bool, +} + +/// Cluster member configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MemberConfig { + /// Node ID + pub id: u64, + /// Raft address + pub raft_addr: String, +} + +/// Raft-specific configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RaftConfig { + /// Raft participation role: "voter", "learner", or "none" + /// + /// - `voter`: Full voting member in Raft consensus + /// - `learner`: Non-voting replica that receives log replication + /// - `none`: No Raft participation, node acts as agent/proxy only + #[serde(default)] + pub role: RaftRole, +} + +impl Default for RaftConfig { + fn default() -> Self { + Self { + role: RaftRole::Voter, + } + } +} + +impl Default for ServerConfig { + fn default() -> Self { + Self { + node: NodeConfig { + id: 1, + name: "chainfire-1".into(), + role: "control_plane".into(), + }, + storage: StorageConfig { + data_dir: PathBuf::from("./data"), + }, + network: NetworkConfig { + api_addr: "127.0.0.1:2379".parse().unwrap(), + raft_addr: "127.0.0.1:2380".parse().unwrap(), + gossip_addr: "127.0.0.1:2381".parse().unwrap(), + }, + cluster: ClusterConfig { + id: 1, + initial_members: vec![], + bootstrap: true, + }, + raft: RaftConfig::default(), + } + } +} + +impl ServerConfig { + /// Load configuration from a file + pub fn load(path: &Path) -> Result { + let contents = std::fs::read_to_string(path)?; + let config: ServerConfig = toml::from_str(&contents)?; + Ok(config) + } + + /// Save configuration to a file + pub fn save(&self, path: &Path) -> Result<()> { + let contents = toml::to_string_pretty(self)?; + std::fs::write(path, contents)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[test] + fn test_default_config() { + let config = ServerConfig::default(); + assert_eq!(config.node.id, 1); + assert!(config.cluster.bootstrap); + } + + 
#[test] + fn test_config_roundtrip() { + let dir = tempdir().unwrap(); + let path = dir.path().join("config.toml"); + + let config = ServerConfig::default(); + config.save(&path).unwrap(); + + let loaded = ServerConfig::load(&path).unwrap(); + assert_eq!(loaded.node.id, config.node.id); + assert_eq!(loaded.network.api_addr, config.network.api_addr); + } +} diff --git a/chainfire/crates/chainfire-server/src/lib.rs b/chainfire/crates/chainfire-server/src/lib.rs new file mode 100644 index 0000000..bf93188 --- /dev/null +++ b/chainfire/crates/chainfire-server/src/lib.rs @@ -0,0 +1,10 @@ +//! Chainfire distributed KVS server library +//! +//! This crate provides the server implementation for Chainfire, including: +//! - Server configuration +//! - Node management +//! - gRPC service hosting + +pub mod config; +pub mod node; +pub mod server; diff --git a/chainfire/crates/chainfire-server/src/main.rs b/chainfire/crates/chainfire-server/src/main.rs new file mode 100644 index 0000000..3e931fd --- /dev/null +++ b/chainfire/crates/chainfire-server/src/main.rs @@ -0,0 +1,148 @@ +//! 
Chainfire distributed KVS server + +use anyhow::Result; +use chainfire_server::config::ServerConfig; +use clap::Parser; +use metrics_exporter_prometheus::PrometheusBuilder; +use std::path::PathBuf; +use tracing::info; + +/// Chainfire distributed Key-Value Store +#[derive(Parser, Debug)] +#[command(name = "chainfire")] +#[command(author, version, about, long_about = None)] +struct Args { + /// Configuration file path + #[arg(short, long, default_value = "chainfire.toml")] + config: PathBuf, + + /// Node ID (overrides config) + #[arg(long)] + node_id: Option, + + /// Data directory (overrides config) + #[arg(long)] + data_dir: Option, + + /// API listen address (overrides config) + #[arg(long)] + api_addr: Option, + + /// Raft listen address (overrides config) + #[arg(long)] + raft_addr: Option, + + /// Gossip listen address (overrides config) + #[arg(long)] + gossip_addr: Option, + + /// Initial cluster members for bootstrap (comma-separated node_id=addr pairs) + #[arg(long)] + initial_cluster: Option, + + /// Enable verbose logging + #[arg(short, long)] + verbose: bool, + + /// Metrics port for Prometheus scraping + #[arg(long, default_value = "9091")] + metrics_port: u16, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + // Initialize logging + let filter = if args.verbose { + "chainfire=debug,tower_http=debug" + } else { + "chainfire=info" + }; + + tracing_subscriber::fmt() + .with_env_filter(filter) + .with_target(true) + .init(); + + info!("Chainfire v{}", env!("CARGO_PKG_VERSION")); + + // Initialize Prometheus metrics exporter + let metrics_addr = format!("0.0.0.0:{}", args.metrics_port); + let builder = PrometheusBuilder::new(); + builder + .with_http_listener(metrics_addr.parse::()?) 
+ .install() + .expect("Failed to install Prometheus metrics exporter"); + + info!( + "Prometheus metrics available at http://{}/metrics", + metrics_addr + ); + + // Register chainfire metrics + metrics::describe_counter!( + "chainfire_kv_requests_total", + "Total number of KV requests by operation type" + ); + metrics::describe_counter!( + "chainfire_kv_bytes_read", + "Total bytes read from KV store" + ); + metrics::describe_counter!( + "chainfire_kv_bytes_written", + "Total bytes written to KV store" + ); + metrics::describe_histogram!( + "chainfire_kv_request_duration_seconds", + "KV request duration in seconds" + ); + metrics::describe_gauge!( + "chainfire_raft_term", + "Current Raft term" + ); + metrics::describe_gauge!( + "chainfire_raft_is_leader", + "Whether this node is the Raft leader (1=yes, 0=no)" + ); + metrics::describe_counter!( + "chainfire_watch_events_total", + "Total number of watch events emitted" + ); + + // Load or create configuration + let mut config = if args.config.exists() { + ServerConfig::load(&args.config)? 
+ } else { + info!("Config file not found, using defaults"); + ServerConfig::default() + }; + + // Apply command line overrides + if let Some(node_id) = args.node_id { + config.node.id = node_id; + } + if let Some(data_dir) = args.data_dir { + config.storage.data_dir = data_dir; + } + if let Some(api_addr) = args.api_addr { + config.network.api_addr = api_addr.parse()?; + } + if let Some(raft_addr) = args.raft_addr { + config.network.raft_addr = raft_addr.parse()?; + } + if let Some(gossip_addr) = args.gossip_addr { + config.network.gossip_addr = gossip_addr.parse()?; + } + + info!(node_id = config.node.id, "Starting node"); + info!(api_addr = %config.network.api_addr, "API address"); + info!(raft_addr = %config.network.raft_addr, "Raft address"); + info!(gossip_addr = %config.network.gossip_addr, "Gossip address"); + + // Start the server + let server = chainfire_server::server::Server::new(config).await?; + server.run().await?; + + Ok(()) +} diff --git a/chainfire/crates/chainfire-server/src/node.rs b/chainfire/crates/chainfire-server/src/node.rs new file mode 100644 index 0000000..6651fa3 --- /dev/null +++ b/chainfire/crates/chainfire-server/src/node.rs @@ -0,0 +1,201 @@ +//! Node orchestration +//! +//! This module manages the lifecycle of all components in a Chainfire node. 
+ +use crate::config::ServerConfig; +use anyhow::Result; +use chainfire_api::GrpcRaftClient; +use chainfire_gossip::{GossipAgent, GossipId}; +use chainfire_raft::{Raft, RaftNode}; +use chainfire_storage::RocksStore; +use chainfire_types::node::NodeRole; +use chainfire_types::RaftRole; +use chainfire_watch::WatchRegistry; +use std::sync::Arc; +use tokio::sync::broadcast; +use tracing::info; + +/// Node instance managing all components +pub struct Node { + /// Server configuration + config: ServerConfig, + /// Raft node (None if role is RaftRole::None) + raft: Option>, + /// Watch registry + watch_registry: Arc, + /// Gossip agent (runs on all nodes) + gossip: Option, + /// Shutdown signal + shutdown_tx: broadcast::Sender<()>, +} + +impl Node { + /// Create a new node + pub async fn new(config: ServerConfig) -> Result { + // Ensure data directory exists + std::fs::create_dir_all(&config.storage.data_dir)?; + + // Create watch registry + let watch_registry = Arc::new(WatchRegistry::new()); + + // Create Raft node only if role participates in Raft + let raft = if config.raft.role.participates_in_raft() { + // Create RocksDB store + let store = RocksStore::new(&config.storage.data_dir)?; + info!(data_dir = ?config.storage.data_dir, "Opened storage"); + + // Create gRPC Raft client and register peer addresses + let rpc_client = Arc::new(GrpcRaftClient::new()); + for member in &config.cluster.initial_members { + rpc_client.add_node(member.id, member.raft_addr.clone()).await; + info!(node_id = member.id, addr = %member.raft_addr, "Registered peer"); + } + + // Create Raft node + let raft_node = Arc::new( + RaftNode::new(config.node.id, store, rpc_client).await?, + ); + info!( + node_id = config.node.id, + raft_role = %config.raft.role, + "Created Raft node" + ); + Some(raft_node) + } else { + info!( + node_id = config.node.id, + raft_role = %config.raft.role, + "Skipping Raft node (role=none)" + ); + None + }; + + // Gossip runs on ALL nodes regardless of Raft role + let 
gossip_role = match config.node.role.as_str() { + "control_plane" => NodeRole::ControlPlane, + _ => NodeRole::Worker, + }; + + let gossip_id = GossipId::new(config.node.id, config.network.gossip_addr, gossip_role); + + let gossip = Some( + GossipAgent::new(gossip_id, chainfire_gossip::agent::default_config()) + .await?, + ); + info!( + addr = %config.network.gossip_addr, + gossip_role = ?gossip_role, + "Created gossip agent" + ); + + let (shutdown_tx, _) = broadcast::channel(1); + + Ok(Self { + config, + raft, + watch_registry, + gossip, + shutdown_tx, + }) + } + + /// Get the Raft node (None if role is RaftRole::None) + pub fn raft(&self) -> Option<&Arc> { + self.raft.as_ref() + } + + /// Get the underlying Raft instance for internal service (None if role is RaftRole::None) + pub fn raft_instance(&self) -> Option> { + self.raft.as_ref().map(|r| r.raft_arc()) + } + + /// Check if this node has Raft enabled + pub fn has_raft(&self) -> bool { + self.raft.is_some() + } + + /// Get the Raft role configuration + pub fn raft_role(&self) -> RaftRole { + self.config.raft.role + } + + /// Get the watch registry + pub fn watch_registry(&self) -> &Arc { + &self.watch_registry + } + + /// Get the cluster ID + pub fn cluster_id(&self) -> u64 { + self.config.cluster.id + } + + /// Initialize the cluster if bootstrapping + /// + /// This handles different behaviors based on RaftRole: + /// - Voter with bootstrap=true: Initialize cluster (single or multi-node) + /// - Learner: Wait to be added by the leader via add_learner + /// - None: No Raft, nothing to do + pub async fn maybe_bootstrap(&self) -> Result<()> { + let Some(raft) = &self.raft else { + info!("No Raft node to bootstrap (role=none)"); + return Ok(()); + }; + + match self.config.raft.role { + RaftRole::Voter if self.config.cluster.bootstrap => { + if self.config.cluster.initial_members.is_empty() { + // Single-node bootstrap + info!("Bootstrapping single-node cluster"); + raft.initialize().await?; + } else { + // 
Multi-node bootstrap with initial_members + use openraft::BasicNode; + use std::collections::BTreeMap; + + info!( + members = self.config.cluster.initial_members.len(), + "Bootstrapping multi-node cluster" + ); + + let members: BTreeMap = self + .config + .cluster + .initial_members + .iter() + .map(|m| (m.id, BasicNode::default())) + .collect(); + + raft.initialize_cluster(members).await?; + } + } + RaftRole::Learner => { + info!( + node_id = self.config.node.id, + "Learner node ready, waiting to be added to cluster" + ); + // Learners don't bootstrap; they wait to be added via add_learner + } + _ => { + // Voter without bootstrap flag or other cases + info!( + node_id = self.config.node.id, + raft_role = %self.config.raft.role, + bootstrap = self.config.cluster.bootstrap, + "Not bootstrapping" + ); + } + } + + Ok(()) + } + + /// Get shutdown receiver + pub fn shutdown_receiver(&self) -> broadcast::Receiver<()> { + self.shutdown_tx.subscribe() + } + + /// Trigger shutdown + pub fn shutdown(&self) { + let _ = self.shutdown_tx.send(()); + } +} diff --git a/chainfire/crates/chainfire-server/src/server.rs b/chainfire/crates/chainfire-server/src/server.rs new file mode 100644 index 0000000..6b5cafa --- /dev/null +++ b/chainfire/crates/chainfire-server/src/server.rs @@ -0,0 +1,207 @@ +//! gRPC server +//! +//! This module provides the main server implementation that hosts all gRPC services. +//! Supports two modes: +//! - Full server mode (voter/learner): Runs Raft consensus and all services +//! 
- Agent mode (role=none): Runs gossip only, proxies requests to control-plane + +use crate::config::ServerConfig; +use crate::node::Node; +use anyhow::Result; +use chainfire_api::internal_proto::raft_service_server::RaftServiceServer; +use chainfire_api::proto::{ + cluster_server::ClusterServer, kv_server::KvServer, watch_server::WatchServer, +}; +use chainfire_api::{ClusterServiceImpl, KvServiceImpl, RaftServiceImpl, WatchServiceImpl}; +use chainfire_types::RaftRole; +use std::sync::Arc; +use tokio::signal; +use tonic::transport::Server as TonicServer; +use tonic_health::server::health_reporter; +use tracing::info; + +/// Main server instance +pub struct Server { + node: Arc, + config: ServerConfig, +} + +impl Server { + /// Create a new server + pub async fn new(config: ServerConfig) -> Result { + let node = Arc::new(Node::new(config.clone()).await?); + Ok(Self { node, config }) + } + + /// Run the server in the appropriate mode based on Raft role + pub async fn run(self) -> Result<()> { + match self.node.raft_role() { + RaftRole::None => self.run_agent_mode().await, + _ => self.run_full_mode().await, + } + } + + /// Run in full server mode (voter/learner with Raft consensus) + async fn run_full_mode(self) -> Result<()> { + let raft = self + .node + .raft() + .expect("raft node should exist in full mode") + .clone(); + + let raft_instance = self.node.raft_instance().expect("raft instance should exist"); + + // Bootstrap cluster if needed + self.node.maybe_bootstrap().await?; + + // Create gRPC services for client API + let kv_service = KvServiceImpl::new(Arc::clone(&raft), self.node.cluster_id()); + + let watch_service = WatchServiceImpl::new( + Arc::clone(self.node.watch_registry()), + self.node.cluster_id(), + raft.id(), + ); + + let cluster_service = ClusterServiceImpl::new(Arc::clone(&raft), self.node.cluster_id()); + + // Internal Raft service for inter-node communication + let raft_service = RaftServiceImpl::new(raft_instance); + + // Health check service 
for K8s liveness/readiness probes + let (mut health_reporter, health_service) = health_reporter(); + health_reporter + .set_serving::>() + .await; + health_reporter + .set_serving::>() + .await; + health_reporter + .set_serving::>() + .await; + + info!( + api_addr = %self.config.network.api_addr, + raft_addr = %self.config.network.raft_addr, + "Starting gRPC servers" + ); + + // Shutdown signal channel + let (shutdown_tx, _) = tokio::sync::broadcast::channel::<()>(1); + let mut shutdown_rx1 = shutdown_tx.subscribe(); + let mut shutdown_rx2 = shutdown_tx.subscribe(); + + // Client API server (KV, Watch, Cluster, Health) + let api_addr = self.config.network.api_addr; + let api_server = TonicServer::builder() + .add_service(health_service) + .add_service(KvServer::new(kv_service)) + .add_service(WatchServer::new(watch_service)) + .add_service(ClusterServer::new(cluster_service)) + .serve_with_shutdown(api_addr, async move { + let _ = shutdown_rx1.recv().await; + }); + + // Internal Raft server (peer-to-peer communication) + let raft_addr = self.config.network.raft_addr; + let raft_server = TonicServer::builder() + .add_service(RaftServiceServer::new(raft_service)) + .serve_with_shutdown(raft_addr, async move { + let _ = shutdown_rx2.recv().await; + }); + + info!(api_addr = %api_addr, "Client API server starting"); + info!(raft_addr = %raft_addr, "Raft server starting"); + + // Run both servers concurrently + tokio::select! 
{ + result = api_server => { + if let Err(e) = result { + tracing::error!(error = %e, "API server error"); + } + } + result = raft_server => { + if let Err(e) = result { + tracing::error!(error = %e, "Raft server error"); + } + } + _ = signal::ctrl_c() => { + info!("Received shutdown signal"); + let _ = shutdown_tx.send(()); + } + } + + info!("Server stopped"); + Ok(()) + } + + /// Run in agent mode (role=none, gossip only, no Raft) + /// + /// Agent mode runs a lightweight server that: + /// - Participates in gossip protocol for cluster discovery + /// - Can subscribe to watch events (if connected to control-plane) + /// - Does not run Raft consensus + /// - Suitable for worker nodes that only need cluster membership + async fn run_agent_mode(self) -> Result<()> { + info!( + node_id = self.config.node.id, + api_addr = %self.config.network.api_addr, + "Starting agent mode (no Raft)" + ); + + // Get control-plane Raft addresses from initial_members + // These can be used to derive API addresses or discover them via gossip + let control_plane_addrs: Vec<&str> = self + .config + .cluster + .initial_members + .iter() + .map(|m| m.raft_addr.as_str()) + .collect(); + + if !control_plane_addrs.is_empty() { + info!( + control_plane_nodes = ?control_plane_addrs, + "Agent mode: control-plane Raft endpoints (use gossip for API discovery)" + ); + } + + // Health check service for K8s liveness/readiness probes + let (mut health_reporter, health_service) = health_reporter(); + // In agent mode, we report the agent service as serving (gossip is running) + health_reporter + .set_service_status("chainfire.Agent", tonic_health::ServingStatus::Serving) + .await; + + // Shutdown signal channel + let (shutdown_tx, _) = tokio::sync::broadcast::channel::<()>(1); + let mut shutdown_rx = shutdown_tx.subscribe(); + + // Run health check server for K8s probes + let api_addr = self.config.network.api_addr; + let health_server = TonicServer::builder() + .add_service(health_service) + 
.serve_with_shutdown(api_addr, async move { + let _ = shutdown_rx.recv().await; + }); + + info!(api_addr = %api_addr, "Agent health server starting"); + info!("Agent running. Press Ctrl+C to stop."); + + tokio::select! { + result = health_server => { + if let Err(e) = result { + tracing::error!(error = %e, "Agent health server error"); + } + } + _ = signal::ctrl_c() => { + info!("Received shutdown signal"); + let _ = shutdown_tx.send(()); + } + } + + self.node.shutdown(); + info!("Agent stopped"); + Ok(()) + } +} diff --git a/chainfire/crates/chainfire-server/tests/integration_test.rs b/chainfire/crates/chainfire-server/tests/integration_test.rs new file mode 100644 index 0000000..f39292d --- /dev/null +++ b/chainfire/crates/chainfire-server/tests/integration_test.rs @@ -0,0 +1,159 @@ +//! Integration tests for Chainfire +//! +//! These tests verify that the server, client, and all components work together correctly. + +use chainfire_client::Client; +use chainfire_server::{ + config::{ClusterConfig, NetworkConfig, NodeConfig, RaftConfig, ServerConfig, StorageConfig}, + server::Server, +}; +use std::time::Duration; +use tokio::time::sleep; + +/// Create a test server configuration +fn test_config(port: u16) -> (ServerConfig, tempfile::TempDir) { + use std::net::SocketAddr; + + let api_addr: SocketAddr = format!("127.0.0.1:{}", port).parse().unwrap(); + let raft_addr: SocketAddr = format!("127.0.0.1:{}", port + 100).parse().unwrap(); + let gossip_addr: SocketAddr = format!("127.0.0.1:{}", port + 200).parse().unwrap(); + + let temp_dir = tempfile::tempdir().unwrap(); + + let config = ServerConfig { + node: NodeConfig { + id: 1, + name: format!("test-node-{}", port), + role: "control_plane".to_string(), + }, + cluster: ClusterConfig { + id: 1, + bootstrap: true, + initial_members: vec![], + }, + network: NetworkConfig { + api_addr, + raft_addr, + gossip_addr, + }, + storage: StorageConfig { + data_dir: temp_dir.path().to_path_buf(), + }, + raft: RaftConfig::default(), 
+ }; + + (config, temp_dir) +} + +#[tokio::test] +async fn test_single_node_kv_operations() { + // Start server + let (config, _temp_dir) = test_config(23790); + let api_addr = config.network.api_addr; + let server = Server::new(config).await.unwrap(); + + // Run server in background + let server_handle = tokio::spawn(async move { + let _ = server.run().await; + }); + + // Wait for server to start + sleep(Duration::from_millis(500)).await; + + // Connect client + let mut client = Client::connect(format!("http://{}", api_addr)) + .await + .unwrap(); + + // Test put + let rev = client.put("test/key1", "value1").await.unwrap(); + assert!(rev > 0); + + // Test get + let value = client.get("test/key1").await.unwrap(); + assert_eq!(value, Some(b"value1".to_vec())); + + // Test put with different value + let rev2 = client.put("test/key1", "value2").await.unwrap(); + assert!(rev2 > rev); + + // Test get updated value + let value = client.get("test/key1").await.unwrap(); + assert_eq!(value, Some(b"value2".to_vec())); + + // Test get non-existent key + let value = client.get("test/nonexistent").await.unwrap(); + assert!(value.is_none()); + + // Test delete + let deleted = client.delete("test/key1").await.unwrap(); + assert!(deleted); + + // Verify deletion + let value = client.get("test/key1").await.unwrap(); + assert!(value.is_none()); + + // Test delete non-existent key + let deleted = client.delete("test/nonexistent").await.unwrap(); + assert!(!deleted); + + // Test prefix operations + client.put("prefix/a", "1").await.unwrap(); + client.put("prefix/b", "2").await.unwrap(); + client.put("prefix/c", "3").await.unwrap(); + client.put("other/key", "other").await.unwrap(); + + let prefix_values = client.get_prefix("prefix/").await.unwrap(); + assert_eq!(prefix_values.len(), 3); + + // Cleanup + server_handle.abort(); +} + +#[tokio::test] +async fn test_cluster_status() { + let (config, _temp_dir) = test_config(23800); + let api_addr = config.network.api_addr; + let server = 
Server::new(config).await.unwrap(); + + let server_handle = tokio::spawn(async move { + let _ = server.run().await; + }); + + sleep(Duration::from_millis(500)).await; + + let mut client = Client::connect(format!("http://{}", api_addr)) + .await + .unwrap(); + + let status = client.status().await.unwrap(); + assert_eq!(status.leader, 1); + assert!(status.raft_term > 0); + + server_handle.abort(); +} + +#[tokio::test] +async fn test_string_convenience_methods() { + let (config, _temp_dir) = test_config(23810); + let api_addr = config.network.api_addr; + let server = Server::new(config).await.unwrap(); + + let server_handle = tokio::spawn(async move { + let _ = server.run().await; + }); + + sleep(Duration::from_millis(500)).await; + + let mut client = Client::connect(format!("http://{}", api_addr)) + .await + .unwrap(); + + // Test string methods + client.put_str("/config/name", "chainfire").await.unwrap(); + + let value = client.get_str("/config/name").await.unwrap(); + assert_eq!(value, Some("chainfire".to_string())); + + server_handle.abort(); +} diff --git a/chainfire/crates/chainfire-storage/Cargo.toml b/chainfire/crates/chainfire-storage/Cargo.toml new file mode 100644 index 0000000..a33943c --- /dev/null +++ b/chainfire/crates/chainfire-storage/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "chainfire-storage" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "RocksDB storage layer for Chainfire distributed KVS" + +[dependencies] +chainfire-types = { workspace = true } + +# Storage +rocksdb = { workspace = true } + +# Async +tokio = { workspace = true } +async-trait = { workspace = true } + +# Serialization +serde = { workspace = true } +bincode = { workspace = true } + +# Utilities +tracing = { workspace = true } +parking_lot = { workspace = true } +bytes = { workspace = true } +dashmap = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +tokio = { workspace = 
true, features = ["rt-multi-thread", "macros"] } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-storage/src/kv_store.rs b/chainfire/crates/chainfire-storage/src/kv_store.rs new file mode 100644 index 0000000..ddce691 --- /dev/null +++ b/chainfire/crates/chainfire-storage/src/kv_store.rs @@ -0,0 +1,435 @@ +//! Key-Value store operations + +use crate::{cf, meta_keys, RocksStore}; +use chainfire_types::error::StorageError; +use chainfire_types::kv::{KeyRange, KvEntry, Revision}; +use parking_lot::RwLock; +use rocksdb::WriteBatch; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use tracing::{debug, trace}; + +/// KV store built on RocksDB +pub struct KvStore { + store: RocksStore, + /// Current revision counter + revision: AtomicU64, +} + +impl KvStore { + /// Create a new KV store + pub fn new(store: RocksStore) -> Result { + let revision = Self::load_revision(&store)?; + + Ok(Self { + store, + revision: AtomicU64::new(revision), + }) + } + + /// Load the current revision from storage + fn load_revision(store: &RocksStore) -> Result { + let cf = store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + + match store + .db() + .get_cf(&cf, meta_keys::REVISION) + .map_err(|e| StorageError::RocksDb(e.to_string()))? 
+ { + Some(bytes) => { + let revision: Revision = bincode::deserialize(&bytes) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + Ok(revision) + } + None => Ok(0), + } + } + + /// Get current revision + pub fn current_revision(&self) -> Revision { + self.revision.load(Ordering::SeqCst) + } + + /// Increment and return new revision + fn next_revision(&self) -> Revision { + self.revision.fetch_add(1, Ordering::SeqCst) + 1 + } + + /// Persist current revision + fn save_revision(&self, revision: Revision) -> Result<(), StorageError> { + let cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + + let bytes = + bincode::serialize(&revision).map_err(|e| StorageError::Serialization(e.to_string()))?; + + self.store + .db() + .put_cf(&cf, meta_keys::REVISION, bytes) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + Ok(()) + } + + /// Get a single key + pub fn get(&self, key: &[u8]) -> Result, StorageError> { + let cf = self + .store + .cf_handle(cf::KV) + .ok_or_else(|| StorageError::RocksDb("KV cf not found".into()))?; + + match self + .store + .db() + .get_cf(&cf, key) + .map_err(|e| StorageError::RocksDb(e.to_string()))? 
+ { + Some(bytes) => { + let entry: KvEntry = bincode::deserialize(&bytes) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + Ok(Some(entry)) + } + None => Ok(None), + } + } + + /// Put a key-value pair + pub fn put( + &self, + key: Vec, + value: Vec, + lease_id: Option, + ) -> Result<(Revision, Option), StorageError> { + let cf = self + .store + .cf_handle(cf::KV) + .ok_or_else(|| StorageError::RocksDb("KV cf not found".into()))?; + + // Get previous entry + let prev = self.get(&key)?; + let revision = self.next_revision(); + + // Create new entry + let entry = match &prev { + Some(old) => old.update(value, revision), + None => { + if let Some(lease) = lease_id { + KvEntry::with_lease(key.clone(), value, revision, lease) + } else { + KvEntry::new(key.clone(), value, revision) + } + } + }; + + // Write to RocksDB + let bytes = + bincode::serialize(&entry).map_err(|e| StorageError::Serialization(e.to_string()))?; + + let mut batch = WriteBatch::default(); + batch.put_cf(&cf, &key, &bytes); + + // Also persist revision + let meta_cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + let rev_bytes = bincode::serialize(&revision) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + batch.put_cf(&meta_cf, meta_keys::REVISION, &rev_bytes); + + self.store + .db() + .write(batch) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + debug!(key = ?String::from_utf8_lossy(&key), revision, "Put key"); + Ok((revision, prev)) + } + + /// Delete a single key + pub fn delete(&self, key: &[u8]) -> Result<(Revision, Option), StorageError> { + let cf = self + .store + .cf_handle(cf::KV) + .ok_or_else(|| StorageError::RocksDb("KV cf not found".into()))?; + + // Get previous entry + let prev = self.get(key)?; + + if prev.is_none() { + return Ok((self.current_revision(), None)); + } + + let revision = self.next_revision(); + + // Delete from RocksDB + let mut batch = WriteBatch::default(); + 
batch.delete_cf(&cf, key); + + // Persist revision + let meta_cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + let rev_bytes = bincode::serialize(&revision) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + batch.put_cf(&meta_cf, meta_keys::REVISION, &rev_bytes); + + self.store + .db() + .write(batch) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + debug!(key = ?String::from_utf8_lossy(key), revision, "Deleted key"); + Ok((revision, prev)) + } + + /// Delete a range of keys + pub fn delete_range( + &self, + start: &[u8], + end: &[u8], + ) -> Result<(Revision, Vec), StorageError> { + let cf = self + .store + .cf_handle(cf::KV) + .ok_or_else(|| StorageError::RocksDb("KV cf not found".into()))?; + + // First, collect all keys to delete + let entries = self.range(start, Some(end))?; + + if entries.is_empty() { + return Ok((self.current_revision(), Vec::new())); + } + + let revision = self.next_revision(); + + // Delete all keys + let mut batch = WriteBatch::default(); + for entry in &entries { + batch.delete_cf(&cf, &entry.key); + } + + // Persist revision + let meta_cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + let rev_bytes = bincode::serialize(&revision) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + batch.put_cf(&meta_cf, meta_keys::REVISION, &rev_bytes); + + self.store + .db() + .write(batch) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + debug!( + start = ?String::from_utf8_lossy(start), + end = ?String::from_utf8_lossy(end), + deleted = entries.len(), + revision, + "Deleted range" + ); + + Ok((revision, entries)) + } + + /// Scan a range of keys + pub fn range(&self, start: &[u8], end: Option<&[u8]>) -> Result, StorageError> { + let cf = self + .store + .cf_handle(cf::KV) + .ok_or_else(|| StorageError::RocksDb("KV cf not found".into()))?; + + let mut entries = Vec::new(); + + let 
iter = self.store.db().iterator_cf( + &cf, + rocksdb::IteratorMode::From(start, rocksdb::Direction::Forward), + ); + + for item in iter { + let (key, value) = item.map_err(|e| StorageError::RocksDb(e.to_string()))?; + + // Check if we've passed the end + if let Some(end_key) = end { + if key.as_ref() >= end_key { + break; + } + } + + let entry: KvEntry = bincode::deserialize(&value) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + entries.push(entry); + } + + trace!( + start = ?String::from_utf8_lossy(start), + count = entries.len(), + "Range scan" + ); + + Ok(entries) + } + + /// Scan keys with a prefix + pub fn prefix(&self, prefix: &[u8]) -> Result, StorageError> { + let range = KeyRange::prefix(prefix); + self.range(&range.start, range.end.as_deref()) + } + + /// Get the underlying store + pub fn store(&self) -> &RocksStore { + &self.store + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn create_test_store() -> KvStore { + let dir = tempdir().unwrap(); + let store = RocksStore::new(dir.path()).unwrap(); + KvStore::new(store).unwrap() + } + + #[test] + fn test_put_and_get() { + let kv = create_test_store(); + + let (rev, prev) = kv.put(b"key1".to_vec(), b"value1".to_vec(), None).unwrap(); + assert_eq!(rev, 1); + assert!(prev.is_none()); + + let entry = kv.get(b"key1").unwrap().unwrap(); + assert_eq!(entry.key, b"key1"); + assert_eq!(entry.value, b"value1"); + assert_eq!(entry.version, 1); + } + + #[test] + fn test_update() { + let kv = create_test_store(); + + kv.put(b"key1".to_vec(), b"value1".to_vec(), None).unwrap(); + let (rev, prev) = kv.put(b"key1".to_vec(), b"value2".to_vec(), None).unwrap(); + + assert_eq!(rev, 2); + assert!(prev.is_some()); + assert_eq!(prev.unwrap().value, b"value1"); + + let entry = kv.get(b"key1").unwrap().unwrap(); + assert_eq!(entry.value, b"value2"); + assert_eq!(entry.version, 2); + assert_eq!(entry.create_revision, 1); // Unchanged + assert_eq!(entry.mod_revision, 2); + } + + 
#[test] + fn test_delete() { + let kv = create_test_store(); + + kv.put(b"key1".to_vec(), b"value1".to_vec(), None).unwrap(); + let (rev, prev) = kv.delete(b"key1").unwrap(); + + assert_eq!(rev, 2); + assert!(prev.is_some()); + assert_eq!(prev.unwrap().value, b"value1"); + + let entry = kv.get(b"key1").unwrap(); + assert!(entry.is_none()); + } + + #[test] + fn test_delete_nonexistent() { + let kv = create_test_store(); + + let (rev, prev) = kv.delete(b"nonexistent").unwrap(); + assert_eq!(rev, 0); + assert!(prev.is_none()); + } + + #[test] + fn test_range() { + let kv = create_test_store(); + + kv.put(b"a".to_vec(), b"1".to_vec(), None).unwrap(); + kv.put(b"b".to_vec(), b"2".to_vec(), None).unwrap(); + kv.put(b"c".to_vec(), b"3".to_vec(), None).unwrap(); + kv.put(b"d".to_vec(), b"4".to_vec(), None).unwrap(); + + let entries = kv.range(b"b", Some(b"d")).unwrap(); + assert_eq!(entries.len(), 2); + assert_eq!(entries[0].key, b"b"); + assert_eq!(entries[1].key, b"c"); + } + + #[test] + fn test_prefix() { + let kv = create_test_store(); + + kv.put(b"/nodes/1".to_vec(), b"node1".to_vec(), None) + .unwrap(); + kv.put(b"/nodes/2".to_vec(), b"node2".to_vec(), None) + .unwrap(); + kv.put(b"/tasks/1".to_vec(), b"task1".to_vec(), None) + .unwrap(); + + let entries = kv.prefix(b"/nodes/").unwrap(); + assert_eq!(entries.len(), 2); + } + + #[test] + fn test_delete_range() { + let kv = create_test_store(); + + kv.put(b"/nodes/1".to_vec(), b"node1".to_vec(), None) + .unwrap(); + kv.put(b"/nodes/2".to_vec(), b"node2".to_vec(), None) + .unwrap(); + kv.put(b"/tasks/1".to_vec(), b"task1".to_vec(), None) + .unwrap(); + + let (rev, deleted) = kv.delete_range(b"/nodes/", b"/nodes0").unwrap(); + assert_eq!(deleted.len(), 2); + + // Verify nodes are gone + assert!(kv.get(b"/nodes/1").unwrap().is_none()); + assert!(kv.get(b"/nodes/2").unwrap().is_none()); + + // Verify task still exists + assert!(kv.get(b"/tasks/1").unwrap().is_some()); + } + + #[test] + fn test_revision_persistence() { + 
let dir = tempdir().unwrap(); + + // Create store and write some data + { + let store = RocksStore::new(dir.path()).unwrap(); + let kv = KvStore::new(store).unwrap(); + kv.put(b"key1".to_vec(), b"value1".to_vec(), None).unwrap(); + kv.put(b"key2".to_vec(), b"value2".to_vec(), None).unwrap(); + assert_eq!(kv.current_revision(), 2); + } + + // Reopen and verify revision is restored + { + let store = RocksStore::new(dir.path()).unwrap(); + let kv = KvStore::new(store).unwrap(); + assert_eq!(kv.current_revision(), 2); + + // Next write should continue from 3 + let (rev, _) = kv.put(b"key3".to_vec(), b"value3".to_vec(), None).unwrap(); + assert_eq!(rev, 3); + } + } +} diff --git a/chainfire/crates/chainfire-storage/src/lease_store.rs b/chainfire/crates/chainfire-storage/src/lease_store.rs new file mode 100644 index 0000000..07737a2 --- /dev/null +++ b/chainfire/crates/chainfire-storage/src/lease_store.rs @@ -0,0 +1,280 @@ +//! Lease storage for TTL-based key expiration +//! +//! Manages lease lifecycle: grant, revoke, refresh, expiration. 
+ +use chainfire_types::error::StorageError; +use chainfire_types::lease::{Lease, LeaseData, LeaseId}; +use dashmap::DashMap; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::mpsc; +use tracing::{debug, info, warn}; + +/// Store for managing leases +pub struct LeaseStore { + /// Active leases: lease_id -> Lease + leases: DashMap, + /// ID generator for new leases + next_id: AtomicI64, + /// Channel to notify of expired leases (lease_id, keys_to_delete) + expiration_tx: Option>)>>, +} + +impl LeaseStore { + /// Create a new lease store + pub fn new() -> Self { + Self { + leases: DashMap::new(), + next_id: AtomicI64::new(1), + expiration_tx: None, + } + } + + /// Set the expiration notification channel + pub fn set_expiration_sender(&mut self, tx: mpsc::UnboundedSender<(LeaseId, Vec>)>) { + self.expiration_tx = Some(tx); + } + + /// Grant a new lease + pub fn grant(&self, id: LeaseId, ttl: i64) -> Result { + let lease_id = if id == 0 { + self.next_id.fetch_add(1, Ordering::SeqCst) + } else { + // Check if ID is already in use + if self.leases.contains_key(&id) { + return Err(StorageError::LeaseError(format!("Lease {} already exists", id))); + } + // Update next_id if necessary + let _ = self.next_id.fetch_max(id + 1, Ordering::SeqCst); + id + }; + + let lease = Lease::new(lease_id, ttl); + self.leases.insert(lease_id, lease.clone()); + + debug!(lease_id, ttl, "Lease granted"); + Ok(lease) + } + + /// Revoke a lease and return keys to delete + pub fn revoke(&self, id: LeaseId) -> Result>, StorageError> { + match self.leases.remove(&id) { + Some((_, lease)) => { + info!(lease_id = id, keys_count = lease.keys.len(), "Lease revoked"); + Ok(lease.keys) + } + None => Err(StorageError::LeaseError(format!("Lease {} not found", id))), + } + } + + /// Refresh a lease (keep-alive) + pub fn refresh(&self, id: LeaseId) -> Result { + match self.leases.get_mut(&id) { + Some(mut lease) => { + lease.refresh(); + let ttl = 
lease.ttl; + debug!(lease_id = id, ttl, "Lease refreshed"); + Ok(ttl) + } + None => Err(StorageError::LeaseError(format!("Lease {} not found", id))), + } + } + + /// Get a lease by ID + pub fn get(&self, id: LeaseId) -> Option { + self.leases.get(&id).map(|l| l.clone()) + } + + /// Get remaining TTL for a lease + pub fn time_to_live(&self, id: LeaseId) -> Option<(i64, i64, Vec>)> { + self.leases.get(&id).map(|lease| { + (lease.remaining(), lease.ttl, lease.keys.clone()) + }) + } + + /// List all lease IDs + pub fn list(&self) -> Vec { + self.leases.iter().map(|entry| *entry.key()).collect() + } + + /// Attach a key to a lease + pub fn attach_key(&self, lease_id: LeaseId, key: Vec) -> Result<(), StorageError> { + match self.leases.get_mut(&lease_id) { + Some(mut lease) => { + lease.attach_key(key); + Ok(()) + } + None => Err(StorageError::LeaseError(format!("Lease {} not found", lease_id))), + } + } + + /// Detach a key from a lease + pub fn detach_key(&self, lease_id: LeaseId, key: &[u8]) { + if let Some(mut lease) = self.leases.get_mut(&lease_id) { + lease.detach_key(key); + } + } + + /// Check for expired leases and return their IDs and keys + pub fn collect_expired(&self) -> Vec<(LeaseId, Vec>)> { + let mut expired = Vec::new(); + + for entry in self.leases.iter() { + if entry.is_expired() { + expired.push((*entry.key(), entry.keys.clone())); + } + } + + // Remove expired leases + for (id, _) in &expired { + self.leases.remove(id); + } + + expired + } + + /// Export all leases for snapshot + pub fn export(&self) -> Vec { + self.leases + .iter() + .map(|entry| LeaseData::from_lease(&entry)) + .collect() + } + + /// Import leases from snapshot + pub fn import(&self, leases: Vec) { + self.leases.clear(); + for data in leases { + let id = data.id; + let lease = data.to_lease(); + self.leases.insert(id, lease); + // Update next_id + let _ = self.next_id.fetch_max(id + 1, Ordering::SeqCst); + } + } +} + +impl Default for LeaseStore { + fn default() -> Self { + 
Self::new() + } +} + +/// Background worker that checks for expired leases +pub struct LeaseExpirationWorker { + store: Arc, + interval: Duration, + shutdown_rx: mpsc::Receiver<()>, +} + +impl LeaseExpirationWorker { + /// Create a new expiration worker + pub fn new( + store: Arc, + interval: Duration, + shutdown_rx: mpsc::Receiver<()>, + ) -> Self { + Self { + store, + interval, + shutdown_rx, + } + } + + /// Run the expiration worker + pub async fn run(mut self, expire_callback: impl Fn(LeaseId, Vec>) + Send + 'static) { + let mut interval = tokio::time::interval(self.interval); + + loop { + tokio::select! { + _ = interval.tick() => { + let expired = self.store.collect_expired(); + for (lease_id, keys) in expired { + info!(lease_id, keys_count = keys.len(), "Lease expired"); + expire_callback(lease_id, keys); + } + } + _ = self.shutdown_rx.recv() => { + info!("Lease expiration worker shutting down"); + break; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_lease_grant() { + let store = LeaseStore::new(); + let lease = store.grant(0, 10).unwrap(); + assert!(lease.id > 0); + assert_eq!(lease.ttl, 10); + } + + #[test] + fn test_lease_grant_with_id() { + let store = LeaseStore::new(); + let lease = store.grant(42, 10).unwrap(); + assert_eq!(lease.id, 42); + } + + #[test] + fn test_lease_revoke() { + let store = LeaseStore::new(); + let lease = store.grant(0, 10).unwrap(); + let id = lease.id; + + // Attach some keys + store.attach_key(id, b"key1".to_vec()).unwrap(); + store.attach_key(id, b"key2".to_vec()).unwrap(); + + let keys = store.revoke(id).unwrap(); + assert_eq!(keys.len(), 2); + + // Lease should be gone + assert!(store.get(id).is_none()); + } + + #[test] + fn test_lease_refresh() { + let store = LeaseStore::new(); + let lease = store.grant(0, 10).unwrap(); + let id = lease.id; + + let ttl = store.refresh(id).unwrap(); + assert_eq!(ttl, 10); + } + + #[test] + fn test_lease_list() { + let store = LeaseStore::new(); + 
store.grant(1, 10).unwrap(); + store.grant(2, 10).unwrap(); + store.grant(3, 10).unwrap(); + + let ids = store.list(); + assert_eq!(ids.len(), 3); + } + + #[test] + fn test_lease_attach_detach() { + let store = LeaseStore::new(); + let lease = store.grant(0, 10).unwrap(); + let id = lease.id; + + store.attach_key(id, b"key1".to_vec()).unwrap(); + store.attach_key(id, b"key2".to_vec()).unwrap(); + + let lease = store.get(id).unwrap(); + assert_eq!(lease.keys.len(), 2); + + store.detach_key(id, b"key1"); + let lease = store.get(id).unwrap(); + assert_eq!(lease.keys.len(), 1); + } +} diff --git a/chainfire/crates/chainfire-storage/src/lib.rs b/chainfire/crates/chainfire-storage/src/lib.rs new file mode 100644 index 0000000..7d1349e --- /dev/null +++ b/chainfire/crates/chainfire-storage/src/lib.rs @@ -0,0 +1,51 @@ +//! RocksDB storage layer for Chainfire distributed KVS +//! +//! This crate provides: +//! - RocksDB-backed persistent storage +//! - Key-Value operations (Put, Get, Delete, Scan) +//! - Lease management for TTL-based key expiration +//! - Log storage for Raft +//! - State machine for Raft +//! - Snapshot management + +pub mod kv_store; +pub mod lease_store; +pub mod log_storage; +pub mod snapshot; +pub mod state_machine; +pub mod store; + +pub use kv_store::KvStore; +pub use lease_store::{LeaseExpirationWorker, LeaseStore}; +pub use log_storage::LogStorage; +pub use snapshot::{Snapshot, SnapshotBuilder}; +pub use state_machine::StateMachine; +pub use store::RocksStore; + +/// Column family names for RocksDB +pub mod cf { + /// Raft log entries + pub const LOGS: &str = "raft_logs"; + /// Raft metadata (vote, term, etc.) 
+ pub const META: &str = "raft_meta"; + /// Key-value data + pub const KV: &str = "key_value"; + /// Snapshot metadata + pub const SNAPSHOT: &str = "snapshot"; + /// Lease data + pub const LEASES: &str = "leases"; +} + +/// Metadata keys +pub mod meta_keys { + /// Current term and vote + pub const VOTE: &[u8] = b"vote"; + /// Last applied log ID + pub const LAST_APPLIED: &[u8] = b"last_applied"; + /// Current membership + pub const MEMBERSHIP: &[u8] = b"membership"; + /// Current revision + pub const REVISION: &[u8] = b"revision"; + /// Last snapshot ID + pub const LAST_SNAPSHOT: &[u8] = b"last_snapshot"; +} diff --git a/chainfire/crates/chainfire-storage/src/log_storage.rs b/chainfire/crates/chainfire-storage/src/log_storage.rs new file mode 100644 index 0000000..8208ed9 --- /dev/null +++ b/chainfire/crates/chainfire-storage/src/log_storage.rs @@ -0,0 +1,478 @@ +//! Raft log storage implementation +//! +//! This module provides persistent storage for Raft log entries using RocksDB. + +use crate::{cf, meta_keys, RocksStore}; +use chainfire_types::error::StorageError; +use rocksdb::WriteBatch; +use serde::{Deserialize, Serialize}; +use std::ops::RangeBounds; +use tracing::{debug, trace}; + +/// Log entry index type +pub type LogIndex = u64; + +/// Raft term type +pub type Term = u64; + +/// Log ID combining term and index +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct LogId { + pub term: Term, + pub index: LogIndex, +} + +impl LogId { + pub fn new(term: Term, index: LogIndex) -> Self { + Self { term, index } + } +} + +impl Default for LogId { + fn default() -> Self { + Self { term: 0, index: 0 } + } +} + +/// A log entry stored in the Raft log +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LogEntry { + pub log_id: LogId, + pub payload: EntryPayload, +} + +/// Payload of a log entry +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum EntryPayload { + /// A blank entry for leader 
establishment + Blank, + /// A normal data entry + Normal(D), + /// Membership change entry + Membership(Vec), // Just node IDs for simplicity +} + +impl LogEntry { + pub fn blank(log_id: LogId) -> Self { + Self { + log_id, + payload: EntryPayload::Blank, + } + } + + pub fn normal(log_id: LogId, data: D) -> Self { + Self { + log_id, + payload: EntryPayload::Normal(data), + } + } +} + +/// Persisted vote information +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] +pub struct Vote { + pub term: Term, + pub node_id: Option, + pub committed: bool, +} + +/// Log storage state +#[derive(Debug, Clone, Default)] +pub struct LogState { + /// Last purged log ID + pub last_purged_log_id: Option, + /// Last log ID in storage + pub last_log_id: Option, +} + +/// Raft log storage backed by RocksDB +pub struct LogStorage { + store: RocksStore, +} + +impl LogStorage { + /// Create a new log storage + pub fn new(store: RocksStore) -> Self { + Self { store } + } + + /// Encode log index as bytes for storage + fn encode_index(index: LogIndex) -> [u8; 8] { + index.to_be_bytes() + } + + /// Decode log index from bytes + fn decode_index(bytes: &[u8]) -> LogIndex { + let arr: [u8; 8] = bytes.try_into().unwrap_or_default(); + LogIndex::from_be_bytes(arr) + } + + /// Get log state (first and last log IDs) + pub fn get_log_state(&self) -> Result { + let cf = self + .store + .cf_handle(cf::LOGS) + .ok_or_else(|| StorageError::RocksDb("LOGS cf not found".into()))?; + + // Get first and last entries + let mut iter = self + .store + .db() + .iterator_cf(&cf, rocksdb::IteratorMode::Start); + + let _first = iter.next(); + let last_purged_log_id = self.get_last_purged_log_id()?; + + // Get last log ID + let mut last_iter = self + .store + .db() + .iterator_cf(&cf, rocksdb::IteratorMode::End); + + let last_log_id = if let Some(Ok((_, value))) = last_iter.next() { + let entry: LogEntry> = bincode::deserialize(&value) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + 
Some(entry.log_id) + } else { + last_purged_log_id + }; + + Ok(LogState { + last_purged_log_id, + last_log_id, + }) + } + + /// Save vote to persistent storage + pub fn save_vote(&self, vote: Vote) -> Result<(), StorageError> { + let cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + + let bytes = + bincode::serialize(&vote).map_err(|e| StorageError::Serialization(e.to_string()))?; + + self.store + .db() + .put_cf(&cf, meta_keys::VOTE, bytes) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + debug!(?vote, "Saved vote"); + Ok(()) + } + + /// Read vote from persistent storage + pub fn read_vote(&self) -> Result, StorageError> { + let cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + + match self + .store + .db() + .get_cf(&cf, meta_keys::VOTE) + .map_err(|e| StorageError::RocksDb(e.to_string()))? + { + Some(bytes) => { + let vote: Vote = bincode::deserialize(&bytes) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + Ok(Some(vote)) + } + None => Ok(None), + } + } + + /// Append log entries + pub fn append(&self, entries: &[LogEntry]) -> Result<(), StorageError> { + if entries.is_empty() { + return Ok(()); + } + + let cf = self + .store + .cf_handle(cf::LOGS) + .ok_or_else(|| StorageError::RocksDb("LOGS cf not found".into()))?; + + let mut batch = WriteBatch::default(); + + for entry in entries { + let key = Self::encode_index(entry.log_id.index); + let value = bincode::serialize(entry) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + batch.put_cf(&cf, key, value); + } + + self.store + .db() + .write(batch) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + debug!( + first = entries.first().map(|e| e.log_id.index), + last = entries.last().map(|e| e.log_id.index), + count = entries.len(), + "Appended log entries" + ); + + Ok(()) + } + + /// Get log entries in a range + pub fn get_log_entries 
Deserialize<'de>>( + &self, + range: impl RangeBounds, + ) -> Result>, StorageError> { + let cf = self + .store + .cf_handle(cf::LOGS) + .ok_or_else(|| StorageError::RocksDb("LOGS cf not found".into()))?; + + let start = match range.start_bound() { + std::ops::Bound::Included(&idx) => idx, + std::ops::Bound::Excluded(&idx) => idx + 1, + std::ops::Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + std::ops::Bound::Included(&idx) => Some(idx + 1), + std::ops::Bound::Excluded(&idx) => Some(idx), + std::ops::Bound::Unbounded => None, + }; + + let mut entries = Vec::new(); + let iter = self.store.db().iterator_cf( + &cf, + rocksdb::IteratorMode::From(&Self::encode_index(start), rocksdb::Direction::Forward), + ); + + for item in iter { + let (key, value) = item.map_err(|e| StorageError::RocksDb(e.to_string()))?; + + let idx = Self::decode_index(&key); + if let Some(end_idx) = end { + if idx >= end_idx { + break; + } + } + + let entry: LogEntry = bincode::deserialize(&value) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + entries.push(entry); + } + + trace!(start, ?end, count = entries.len(), "Get log entries"); + Ok(entries) + } + + /// Truncate log from the given index (inclusive) + pub fn truncate(&self, from_index: LogIndex) -> Result<(), StorageError> { + let cf = self + .store + .cf_handle(cf::LOGS) + .ok_or_else(|| StorageError::RocksDb("LOGS cf not found".into()))?; + + let mut batch = WriteBatch::default(); + + let iter = self.store.db().iterator_cf( + &cf, + rocksdb::IteratorMode::From( + &Self::encode_index(from_index), + rocksdb::Direction::Forward, + ), + ); + + for item in iter { + let (key, _) = item.map_err(|e| StorageError::RocksDb(e.to_string()))?; + batch.delete_cf(&cf, key); + } + + self.store + .db() + .write(batch) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + debug!(from_index, "Truncated log"); + Ok(()) + } + + /// Purge log entries up to the given index (inclusive) + pub fn purge(&self, up_to_index: 
LogIndex) -> Result<(), StorageError> { + let cf = self + .store + .cf_handle(cf::LOGS) + .ok_or_else(|| StorageError::RocksDb("LOGS cf not found".into()))?; + + // First, get the log ID of the entry we're purging to + let entries: Vec>> = self.get_log_entries(up_to_index..=up_to_index)?; + let last_purged = entries.first().map(|e| e.log_id); + + let mut batch = WriteBatch::default(); + + let iter = self + .store + .db() + .iterator_cf(&cf, rocksdb::IteratorMode::Start); + + for item in iter { + let (key, _) = item.map_err(|e| StorageError::RocksDb(e.to_string()))?; + let idx = Self::decode_index(&key); + if idx > up_to_index { + break; + } + batch.delete_cf(&cf, key); + } + + // Save last purged log ID + if let Some(log_id) = last_purged { + let meta_cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + let bytes = bincode::serialize(&log_id) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + batch.put_cf(&meta_cf, b"last_purged", bytes); + } + + self.store + .db() + .write(batch) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + debug!(up_to_index, "Purged log"); + Ok(()) + } + + /// Get last purged log ID + fn get_last_purged_log_id(&self) -> Result, StorageError> { + let cf = self + .store + .cf_handle(cf::META) + .ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?; + + match self + .store + .db() + .get_cf(&cf, b"last_purged") + .map_err(|e| StorageError::RocksDb(e.to_string()))? 
+ { + Some(bytes) => { + let log_id: LogId = bincode::deserialize(&bytes) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + Ok(Some(log_id)) + } + None => Ok(None), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn create_test_storage() -> LogStorage { + let dir = tempdir().unwrap(); + let store = RocksStore::new(dir.path()).unwrap(); + LogStorage::new(store) + } + + #[test] + fn test_vote_persistence() { + let storage = create_test_storage(); + + let vote = Vote { + term: 5, + node_id: Some(1), + committed: true, + }; + + storage.save_vote(vote).unwrap(); + let loaded = storage.read_vote().unwrap().unwrap(); + + assert_eq!(loaded.term, 5); + assert_eq!(loaded.node_id, Some(1)); + assert!(loaded.committed); + } + + #[test] + fn test_append_and_get_entries() { + let storage = create_test_storage(); + + let entries = vec![ + LogEntry::>::blank(LogId::new(1, 1)), + LogEntry::normal(LogId::new(1, 2), b"data1".to_vec()), + LogEntry::normal(LogId::new(1, 3), b"data2".to_vec()), + ]; + + storage.append(&entries).unwrap(); + + let loaded: Vec>> = storage.get_log_entries(1..=3).unwrap(); + assert_eq!(loaded.len(), 3); + assert_eq!(loaded[0].log_id.index, 1); + assert_eq!(loaded[2].log_id.index, 3); + } + + #[test] + fn test_log_state() { + let storage = create_test_storage(); + + // Initially empty + let state = storage.get_log_state().unwrap(); + assert!(state.last_log_id.is_none()); + + // Add entries + let entries = vec![ + LogEntry::>::blank(LogId::new(1, 1)), + LogEntry::normal(LogId::new(1, 2), b"data".to_vec()), + ]; + storage.append(&entries).unwrap(); + + let state = storage.get_log_state().unwrap(); + assert_eq!(state.last_log_id, Some(LogId::new(1, 2))); + } + + #[test] + fn test_truncate() { + let storage = create_test_storage(); + + let entries = vec![ + LogEntry::>::blank(LogId::new(1, 1)), + LogEntry::normal(LogId::new(1, 2), b"data1".to_vec()), + LogEntry::normal(LogId::new(1, 3), b"data2".to_vec()), + 
LogEntry::normal(LogId::new(1, 4), b"data3".to_vec()), + ]; + storage.append(&entries).unwrap(); + + // Truncate from index 3 + storage.truncate(3).unwrap(); + + let loaded: Vec>> = storage.get_log_entries(1..=4).unwrap(); + assert_eq!(loaded.len(), 2); + assert_eq!(loaded.last().unwrap().log_id.index, 2); + } + + #[test] + fn test_purge() { + let storage = create_test_storage(); + + let entries = vec![ + LogEntry::>::blank(LogId::new(1, 1)), + LogEntry::normal(LogId::new(1, 2), b"data1".to_vec()), + LogEntry::normal(LogId::new(1, 3), b"data2".to_vec()), + LogEntry::normal(LogId::new(1, 4), b"data3".to_vec()), + ]; + storage.append(&entries).unwrap(); + + // Purge up to index 2 + storage.purge(2).unwrap(); + + let loaded: Vec>> = storage.get_log_entries(1..=4).unwrap(); + assert_eq!(loaded.len(), 2); + assert_eq!(loaded.first().unwrap().log_id.index, 3); + + let state = storage.get_log_state().unwrap(); + assert_eq!(state.last_purged_log_id, Some(LogId::new(1, 2))); + } +} diff --git a/chainfire/crates/chainfire-storage/src/snapshot.rs b/chainfire/crates/chainfire-storage/src/snapshot.rs new file mode 100644 index 0000000..6726ab2 --- /dev/null +++ b/chainfire/crates/chainfire-storage/src/snapshot.rs @@ -0,0 +1,316 @@ +//! Snapshot management for Raft state +//! +//! Snapshots allow compacting the Raft log while preserving the state machine state. 
+ +use crate::{cf, RocksStore}; +use chainfire_types::error::StorageError; +use chainfire_types::kv::KvEntry; +use serde::{Deserialize, Serialize}; +use std::io::{Read, Write}; +use tracing::info; + +/// Snapshot metadata +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SnapshotMeta { + /// Last log index included in snapshot + pub last_log_index: u64, + /// Term of last log entry included + pub last_log_term: u64, + /// Cluster membership at snapshot time + pub membership: Vec, + /// Size of snapshot data in bytes + pub size: u64, +} + +/// A complete snapshot +#[derive(Debug)] +pub struct Snapshot { + pub meta: SnapshotMeta, + pub data: Vec, +} + +impl Snapshot { + /// Create snapshot from raw data + pub fn new(meta: SnapshotMeta, data: Vec) -> Self { + Self { meta, data } + } + + /// Serialize snapshot to bytes + pub fn to_bytes(&self) -> Result, StorageError> { + // Format: [meta_len: u32][meta][data] + let meta_bytes = + bincode::serialize(&self.meta).map_err(|e| StorageError::Serialization(e.to_string()))?; + + let mut result = Vec::with_capacity(4 + meta_bytes.len() + self.data.len()); + result.extend_from_slice(&(meta_bytes.len() as u32).to_le_bytes()); + result.extend_from_slice(&meta_bytes); + result.extend_from_slice(&self.data); + + Ok(result) + } + + /// Deserialize snapshot from bytes + pub fn from_bytes(bytes: &[u8]) -> Result { + if bytes.len() < 4 { + return Err(StorageError::Snapshot("Invalid snapshot: too short".into())); + } + + let meta_len = u32::from_le_bytes(bytes[..4].try_into().unwrap()) as usize; + if bytes.len() < 4 + meta_len { + return Err(StorageError::Snapshot( + "Invalid snapshot: meta truncated".into(), + )); + } + + let meta: SnapshotMeta = bincode::deserialize(&bytes[4..4 + meta_len]) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + + let data = bytes[4 + meta_len..].to_vec(); + + Ok(Self { meta, data }) + } +} + +/// Builder for creating snapshots from KV store state +pub struct SnapshotBuilder { + 
store: RocksStore, +} + +impl SnapshotBuilder { + pub fn new(store: RocksStore) -> Self { + Self { store } + } + + /// Build a snapshot of the current KV state + pub fn build( + &self, + last_log_index: u64, + last_log_term: u64, + membership: Vec, + ) -> Result { + let cf = self + .store + .cf_handle(cf::KV) + .ok_or_else(|| StorageError::RocksDb("KV cf not found".into()))?; + + // Collect all KV entries + let mut entries: Vec = Vec::new(); + let iter = self + .store + .db() + .iterator_cf(&cf, rocksdb::IteratorMode::Start); + + for item in iter { + let (_, value) = item.map_err(|e| StorageError::RocksDb(e.to_string()))?; + let entry: KvEntry = bincode::deserialize(&value) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + entries.push(entry); + } + + // Serialize entries + let data = bincode::serialize(&entries) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + + let meta = SnapshotMeta { + last_log_index, + last_log_term, + membership, + size: data.len() as u64, + }; + + info!( + last_log_index, + entries = entries.len(), + size = data.len(), + "Built snapshot" + ); + + Ok(Snapshot::new(meta, data)) + } + + /// Apply a snapshot to restore state + pub fn apply(&self, snapshot: &Snapshot) -> Result<(), StorageError> { + let cf = self + .store + .cf_handle(cf::KV) + .ok_or_else(|| StorageError::RocksDb("KV cf not found".into()))?; + + // Deserialize entries + let entries: Vec = bincode::deserialize(&snapshot.data) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + + // Clear existing KV data + let mut batch = rocksdb::WriteBatch::default(); + let iter = self + .store + .db() + .iterator_cf(&cf, rocksdb::IteratorMode::Start); + for item in iter { + let (key, _) = item.map_err(|e| StorageError::RocksDb(e.to_string()))?; + batch.delete_cf(&cf, key); + } + + // Write new entries + for entry in &entries { + let value = bincode::serialize(entry) + .map_err(|e| StorageError::Serialization(e.to_string()))?; + batch.put_cf(&cf, 
&entry.key, value); + } + + self.store + .db() + .write(batch) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + info!( + last_log_index = snapshot.meta.last_log_index, + entries = entries.len(), + "Applied snapshot" + ); + + Ok(()) + } +} + +/// Streaming snapshot reader for large snapshots +pub struct SnapshotReader { + data: Vec, + position: usize, +} + +impl SnapshotReader { + pub fn new(data: Vec) -> Self { + Self { data, position: 0 } + } + + pub fn remaining(&self) -> usize { + self.data.len() - self.position + } +} + +impl Read for SnapshotReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let remaining = self.remaining(); + if remaining == 0 { + return Ok(0); + } + + let to_read = std::cmp::min(buf.len(), remaining); + buf[..to_read].copy_from_slice(&self.data[self.position..self.position + to_read]); + self.position += to_read; + Ok(to_read) + } +} + +/// Streaming snapshot writer for building large snapshots +pub struct SnapshotWriter { + data: Vec, +} + +impl SnapshotWriter { + pub fn new() -> Self { + Self { data: Vec::new() } + } + + pub fn into_inner(self) -> Vec { + self.data + } +} + +impl Default for SnapshotWriter { + fn default() -> Self { + Self::new() + } +} + +impl Write for SnapshotWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.data.extend_from_slice(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::KvStore; + use tempfile::tempdir; + + fn create_test_store() -> RocksStore { + let dir = tempdir().unwrap(); + RocksStore::new(dir.path()).unwrap() + } + + #[test] + fn test_snapshot_roundtrip() { + let store = create_test_store(); + + // Add some data + let kv = KvStore::new(store.clone()).unwrap(); + kv.put(b"key1".to_vec(), b"value1".to_vec(), None).unwrap(); + kv.put(b"key2".to_vec(), b"value2".to_vec(), None).unwrap(); + + // Build snapshot + let builder = SnapshotBuilder::new(store.clone()); + 
let snapshot = builder.build(10, 1, vec![1, 2, 3]).unwrap(); + + assert_eq!(snapshot.meta.last_log_index, 10); + assert_eq!(snapshot.meta.last_log_term, 1); + assert_eq!(snapshot.meta.membership, vec![1, 2, 3]); + + // Serialize and deserialize + let bytes = snapshot.to_bytes().unwrap(); + let restored = Snapshot::from_bytes(&bytes).unwrap(); + + assert_eq!(restored.meta.last_log_index, snapshot.meta.last_log_index); + assert_eq!(restored.data.len(), snapshot.data.len()); + } + + #[test] + fn test_snapshot_apply() { + let store1 = create_test_store(); + let store2 = create_test_store(); + + // Add data to store1 + let kv1 = KvStore::new(store1.clone()).unwrap(); + kv1.put(b"key1".to_vec(), b"value1".to_vec(), None) + .unwrap(); + kv1.put(b"key2".to_vec(), b"value2".to_vec(), None) + .unwrap(); + + // Build snapshot from store1 + let builder1 = SnapshotBuilder::new(store1.clone()); + let snapshot = builder1.build(10, 1, vec![1]).unwrap(); + + // Apply to store2 + let builder2 = SnapshotBuilder::new(store2.clone()); + builder2.apply(&snapshot).unwrap(); + + // Verify data in store2 + let kv2 = KvStore::new(store2).unwrap(); + let entry1 = kv2.get(b"key1").unwrap().unwrap(); + let entry2 = kv2.get(b"key2").unwrap().unwrap(); + + assert_eq!(entry1.value, b"value1"); + assert_eq!(entry2.value, b"value2"); + } + + #[test] + fn test_snapshot_reader() { + let data = vec![1, 2, 3, 4, 5]; + let mut reader = SnapshotReader::new(data.clone()); + + let mut buf = [0u8; 3]; + assert_eq!(reader.read(&mut buf).unwrap(), 3); + assert_eq!(&buf, &[1, 2, 3]); + + assert_eq!(reader.read(&mut buf).unwrap(), 2); + assert_eq!(&buf[..2], &[4, 5]); + + assert_eq!(reader.read(&mut buf).unwrap(), 0); + } +} diff --git a/chainfire/crates/chainfire-storage/src/state_machine.rs b/chainfire/crates/chainfire-storage/src/state_machine.rs new file mode 100644 index 0000000..f1c50a1 --- /dev/null +++ b/chainfire/crates/chainfire-storage/src/state_machine.rs @@ -0,0 +1,587 @@ +//! 
Raft state machine implementation +//! +//! The state machine applies committed Raft log entries to the KV store. + +use crate::{KvStore, LeaseStore, RocksStore}; +use chainfire_types::command::{Compare, CompareResult, CompareTarget, RaftCommand, RaftResponse}; +use chainfire_types::error::StorageError; +use chainfire_types::watch::WatchEvent; +use chainfire_types::Revision; +use std::sync::Arc; +use tokio::sync::mpsc; +use tracing::warn; + +/// State machine that applies Raft commands to the KV store +pub struct StateMachine { + /// Underlying KV store + kv: KvStore, + /// Lease store for TTL management + leases: Arc, + /// Channel to send watch events + watch_tx: Option>, +} + +impl StateMachine { + /// Create a new state machine + pub fn new(store: RocksStore) -> Result { + let kv = KvStore::new(store)?; + Ok(Self { + kv, + leases: Arc::new(LeaseStore::new()), + watch_tx: None, + }) + } + + /// Set the watch event sender + pub fn set_watch_sender(&mut self, tx: mpsc::UnboundedSender) { + self.watch_tx = Some(tx); + } + + /// Get the underlying KV store + pub fn kv(&self) -> &KvStore { + &self.kv + } + + /// Get the lease store + pub fn leases(&self) -> &Arc { + &self.leases + } + + /// Get current revision + pub fn current_revision(&self) -> Revision { + self.kv.current_revision() + } + + /// Apply a Raft command and return the response + pub fn apply(&self, command: RaftCommand) -> Result { + match command { + RaftCommand::Put { + key, + value, + lease_id, + prev_kv, + } => self.apply_put(key, value, lease_id, prev_kv), + + RaftCommand::Delete { key, prev_kv } => self.apply_delete(key, prev_kv), + + RaftCommand::DeleteRange { + start, + end, + prev_kv, + } => self.apply_delete_range(start, end, prev_kv), + + RaftCommand::Txn { + compare, + success, + failure, + } => self.apply_txn(compare, success, failure), + + RaftCommand::LeaseGrant { id, ttl } => self.apply_lease_grant(id, ttl), + + RaftCommand::LeaseRevoke { id } => self.apply_lease_revoke(id), + + 
RaftCommand::LeaseRefresh { id } => self.apply_lease_refresh(id), + + RaftCommand::Noop => Ok(RaftResponse::new(self.current_revision())), + } + } + + /// Apply a Put command + fn apply_put( + &self, + key: Vec, + value: Vec, + lease_id: Option, + return_prev: bool, + ) -> Result { + // If key previously had a lease, detach it + if let Some(ref prev_entry) = self.kv.get(&key)? { + if let Some(old_lease_id) = prev_entry.lease_id { + self.leases.detach_key(old_lease_id, &key); + } + } + + let (revision, prev) = self.kv.put(key.clone(), value.clone(), lease_id)?; + + // Attach key to new lease if specified + if let Some(lid) = lease_id { + if let Err(e) = self.leases.attach_key(lid, key.clone()) { + warn!("Failed to attach key to lease {}: {}", lid, e); + } + } + + // Emit watch event + if let Some(tx) = &self.watch_tx { + let entry = self.kv.get(&key)?.unwrap(); + let event = WatchEvent::put(entry, if return_prev { prev.clone() } else { None }); + if tx.send(event).is_err() { + warn!("Watch event channel closed"); + } + } + + Ok(RaftResponse::with_prev_kv( + revision, + if return_prev { prev } else { None }, + )) + } + + /// Apply a Delete command + fn apply_delete(&self, key: Vec, return_prev: bool) -> Result { + // Detach from lease if attached + if let Some(ref entry) = self.kv.get(&key)? 
{ + if let Some(lease_id) = entry.lease_id { + self.leases.detach_key(lease_id, &key); + } + } + + let (revision, prev) = self.kv.delete(&key)?; + + // Emit watch event if key existed + if let (Some(tx), Some(ref deleted)) = (&self.watch_tx, &prev) { + let event = WatchEvent::delete( + deleted.clone(), + if return_prev { prev.clone() } else { None }, + ); + if tx.send(event).is_err() { + warn!("Watch event channel closed"); + } + } + + let deleted = if prev.is_some() { 1 } else { 0 }; + Ok(RaftResponse { + revision, + prev_kv: if return_prev { prev } else { None }, + deleted, + ..Default::default() + }) + } + + /// Apply a DeleteRange command + fn apply_delete_range( + &self, + start: Vec, + end: Vec, + return_prev: bool, + ) -> Result { + let (revision, deleted_entries) = self.kv.delete_range(&start, &end)?; + + // Emit watch events for each deleted key + if let Some(tx) = &self.watch_tx { + for entry in &deleted_entries { + let event = WatchEvent::delete(entry.clone(), None); + if tx.send(event).is_err() { + warn!("Watch event channel closed"); + break; + } + } + } + + Ok(RaftResponse::deleted( + revision, + deleted_entries.len() as u64, + if return_prev { deleted_entries } else { vec![] }, + )) + } + + /// Apply a transaction + fn apply_txn( + &self, + compare: Vec, + success: Vec, + failure: Vec, + ) -> Result { + use chainfire_types::command::TxnOpResponse; + + // Evaluate all comparisons + let all_match = compare.iter().all(|c| self.evaluate_compare(c)); + + let ops = if all_match { &success } else { &failure }; + + // Apply operations and collect responses + let mut txn_responses = Vec::with_capacity(ops.len()); + + for op in ops { + match op { + chainfire_types::command::TxnOp::Put { + key, + value, + lease_id, + } => { + let resp = self.apply_put(key.clone(), value.clone(), *lease_id, true)?; + txn_responses.push(TxnOpResponse::Put { + prev_kv: resp.prev_kv, + }); + } + chainfire_types::command::TxnOp::Delete { key } => { + let resp = 
self.apply_delete(key.clone(), true)?; + txn_responses.push(TxnOpResponse::Delete { + deleted: resp.deleted, + prev_kvs: resp.prev_kvs, + }); + } + chainfire_types::command::TxnOp::DeleteRange { start, end } => { + let resp = self.apply_delete_range(start.clone(), end.clone(), true)?; + txn_responses.push(TxnOpResponse::Delete { + deleted: resp.deleted, + prev_kvs: resp.prev_kvs, + }); + } + chainfire_types::command::TxnOp::Range { + key, + range_end, + limit, + keys_only, + count_only, + } => { + // Range operations are read-only - perform the read here + let entries = if range_end.is_empty() { + // Single key lookup + match self.kv.get(key)? { + Some(entry) => vec![entry], + None => vec![], + } + } else { + // Range query + let end_opt = if range_end.is_empty() { + None + } else { + Some(range_end.as_slice()) + }; + let mut results = self.kv.range(key, end_opt)?; + // Apply limit + if *limit > 0 { + results.truncate(*limit as usize); + } + results + }; + + let count = entries.len() as u64; + let kvs = if *count_only { + vec![] + } else if *keys_only { + entries + .into_iter() + .map(|e| chainfire_types::kv::KvEntry { + key: e.key, + value: vec![], + version: e.version, + create_revision: e.create_revision, + mod_revision: e.mod_revision, + lease_id: e.lease_id, + }) + .collect() + } else { + entries + }; + + txn_responses.push(TxnOpResponse::Range { + kvs, + count, + more: false, // TODO: handle pagination + }); + } + } + } + + Ok(RaftResponse::txn( + self.current_revision(), + all_match, + txn_responses, + )) + } + + /// Evaluate a single comparison + fn evaluate_compare(&self, compare: &Compare) -> bool { + let entry = match self.kv.get(&compare.key) { + Ok(Some(e)) => e, + Ok(None) => { + // Key doesn't exist - special handling + return match &compare.target { + CompareTarget::Version(v) => match compare.result { + CompareResult::Equal => *v == 0, + CompareResult::NotEqual => *v != 0, + CompareResult::Greater => false, + CompareResult::Less => *v > 0, + }, + _ 
=> false, + }; + } + Err(_) => return false, + }; + + match &compare.target { + CompareTarget::Version(expected) => { + self.compare_values(entry.version, *expected, compare.result) + } + CompareTarget::CreateRevision(expected) => { + self.compare_values(entry.create_revision, *expected, compare.result) + } + CompareTarget::ModRevision(expected) => { + self.compare_values(entry.mod_revision, *expected, compare.result) + } + CompareTarget::Value(expected) => match compare.result { + CompareResult::Equal => entry.value == *expected, + CompareResult::NotEqual => entry.value != *expected, + CompareResult::Greater => entry.value.as_slice() > expected.as_slice(), + CompareResult::Less => entry.value.as_slice() < expected.as_slice(), + }, + } + } + + /// Compare two numeric values + fn compare_values(&self, actual: u64, expected: u64, result: CompareResult) -> bool { + match result { + CompareResult::Equal => actual == expected, + CompareResult::NotEqual => actual != expected, + CompareResult::Greater => actual > expected, + CompareResult::Less => actual < expected, + } + } + + /// Apply a lease grant command + fn apply_lease_grant(&self, id: i64, ttl: i64) -> Result { + let lease = self.leases.grant(id, ttl)?; + Ok(RaftResponse::lease(self.current_revision(), lease.id, lease.ttl)) + } + + /// Apply a lease revoke command + fn apply_lease_revoke(&self, id: i64) -> Result { + let keys = self.leases.revoke(id)?; + + // Delete all keys attached to the lease + let mut deleted = 0u64; + for key in keys { + let (_, prev) = self.kv.delete(&key)?; + if prev.is_some() { + deleted += 1; + + // Emit watch event + if let (Some(tx), Some(ref entry)) = (&self.watch_tx, &prev) { + let event = WatchEvent::delete(entry.clone(), None); + if tx.send(event).is_err() { + warn!("Watch event channel closed"); + } + } + } + } + + Ok(RaftResponse { + revision: self.current_revision(), + deleted, + ..Default::default() + }) + } + + /// Apply a lease refresh command + fn apply_lease_refresh(&self, 
id: i64) -> Result { + let ttl = self.leases.refresh(id)?; + Ok(RaftResponse::lease(self.current_revision(), id, ttl)) + } + + /// Delete keys by lease ID (called when lease expires) + pub fn delete_keys_by_lease(&self, lease_id: i64) -> Result { + if let Some(lease) = self.leases.get(lease_id) { + let keys = lease.keys.clone(); + // Revoke will also return the keys, but we already have them + let _ = self.leases.revoke(lease_id); + + let mut deleted = 0u64; + for key in keys { + let (_, prev) = self.kv.delete(&key)?; + if prev.is_some() { + deleted += 1; + + // Emit watch event + if let (Some(tx), Some(ref entry)) = (&self.watch_tx, &prev) { + let event = WatchEvent::delete(entry.clone(), None); + if tx.send(event).is_err() { + warn!("Watch event channel closed"); + } + } + } + } + Ok(deleted) + } else { + Ok(0) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn create_test_state_machine() -> StateMachine { + let dir = tempdir().unwrap(); + let store = RocksStore::new(dir.path()).unwrap(); + StateMachine::new(store).unwrap() + } + + #[test] + fn test_apply_put() { + let sm = create_test_state_machine(); + + let cmd = RaftCommand::Put { + key: b"key1".to_vec(), + value: b"value1".to_vec(), + lease_id: None, + prev_kv: false, + }; + + let response = sm.apply(cmd).unwrap(); + assert_eq!(response.revision, 1); + assert!(response.prev_kv.is_none()); + + let entry = sm.kv().get(b"key1").unwrap().unwrap(); + assert_eq!(entry.value, b"value1"); + } + + #[test] + fn test_apply_put_with_prev() { + let sm = create_test_state_machine(); + + sm.apply(RaftCommand::Put { + key: b"key1".to_vec(), + value: b"value1".to_vec(), + lease_id: None, + prev_kv: false, + }) + .unwrap(); + + let response = sm + .apply(RaftCommand::Put { + key: b"key1".to_vec(), + value: b"value2".to_vec(), + lease_id: None, + prev_kv: true, + }) + .unwrap(); + + assert_eq!(response.revision, 2); + assert!(response.prev_kv.is_some()); + 
assert_eq!(response.prev_kv.unwrap().value, b"value1"); + } + + #[test] + fn test_apply_delete() { + let sm = create_test_state_machine(); + + sm.apply(RaftCommand::Put { + key: b"key1".to_vec(), + value: b"value1".to_vec(), + lease_id: None, + prev_kv: false, + }) + .unwrap(); + + let response = sm + .apply(RaftCommand::Delete { + key: b"key1".to_vec(), + prev_kv: true, + }) + .unwrap(); + + assert_eq!(response.deleted, 1); + assert!(response.prev_kv.is_some()); + + assert!(sm.kv().get(b"key1").unwrap().is_none()); + } + + #[test] + fn test_apply_txn_success() { + let sm = create_test_state_machine(); + + // Create initial key + sm.apply(RaftCommand::Put { + key: b"counter".to_vec(), + value: b"1".to_vec(), + lease_id: None, + prev_kv: false, + }) + .unwrap(); + + // Transaction: if version == 1, increment + let cmd = RaftCommand::Txn { + compare: vec![Compare { + key: b"counter".to_vec(), + target: CompareTarget::Version(1), + result: CompareResult::Equal, + }], + success: vec![chainfire_types::command::TxnOp::Put { + key: b"counter".to_vec(), + value: b"2".to_vec(), + lease_id: None, + }], + failure: vec![], + }; + + let response = sm.apply(cmd).unwrap(); + assert!(response.succeeded); + + let entry = sm.kv().get(b"counter").unwrap().unwrap(); + assert_eq!(entry.value, b"2"); + } + + #[test] + fn test_apply_txn_failure() { + let sm = create_test_state_machine(); + + // Create initial key + sm.apply(RaftCommand::Put { + key: b"counter".to_vec(), + value: b"1".to_vec(), + lease_id: None, + prev_kv: false, + }) + .unwrap(); + + // Transaction: if version == 5, increment (should fail) + let cmd = RaftCommand::Txn { + compare: vec![Compare { + key: b"counter".to_vec(), + target: CompareTarget::Version(5), + result: CompareResult::Equal, + }], + success: vec![chainfire_types::command::TxnOp::Put { + key: b"counter".to_vec(), + value: b"2".to_vec(), + lease_id: None, + }], + failure: vec![chainfire_types::command::TxnOp::Put { + key: b"counter".to_vec(), + value: 
b"failed".to_vec(), + lease_id: None, + }], + }; + + let response = sm.apply(cmd).unwrap(); + assert!(!response.succeeded); + + let entry = sm.kv().get(b"counter").unwrap().unwrap(); + assert_eq!(entry.value, b"failed"); + } + + #[tokio::test] + async fn test_watch_events() { + let mut sm = create_test_state_machine(); + + let (tx, mut rx) = mpsc::unbounded_channel(); + sm.set_watch_sender(tx); + + // Apply a put + sm.apply(RaftCommand::Put { + key: b"key1".to_vec(), + value: b"value1".to_vec(), + lease_id: None, + prev_kv: false, + }) + .unwrap(); + + // Check event was sent + let event = rx.recv().await.unwrap(); + assert!(event.is_put()); + assert_eq!(event.kv.key, b"key1"); + assert_eq!(event.kv.value, b"value1"); + } +} diff --git a/chainfire/crates/chainfire-storage/src/store.rs b/chainfire/crates/chainfire-storage/src/store.rs new file mode 100644 index 0000000..85939aa --- /dev/null +++ b/chainfire/crates/chainfire-storage/src/store.rs @@ -0,0 +1,132 @@ +//! RocksDB store management + +use crate::cf; +use chainfire_types::error::StorageError; +use rocksdb::{BoundColumnFamily, ColumnFamilyDescriptor, Options, DB}; +use std::path::Path; +use std::sync::Arc; + +/// RocksDB store wrapper with column families +pub struct RocksStore { + db: Arc, +} + +impl RocksStore { + /// Open or create a RocksDB database at the given path + pub fn new(path: impl AsRef) -> Result { + let path = path.as_ref(); + + let mut db_opts = Options::default(); + db_opts.create_if_missing(true); + db_opts.create_missing_column_families(true); + db_opts.set_max_background_jobs(4); + db_opts.set_bytes_per_sync(1024 * 1024); // 1MB + + // Define column families + let cf_descriptors = vec![ + ColumnFamilyDescriptor::new(cf::LOGS, Self::logs_cf_options()), + ColumnFamilyDescriptor::new(cf::META, Self::meta_cf_options()), + ColumnFamilyDescriptor::new(cf::KV, Self::kv_cf_options()), + ColumnFamilyDescriptor::new(cf::SNAPSHOT, Self::snapshot_cf_options()), + ]; + + let db = 
DB::open_cf_descriptors(&db_opts, path, cf_descriptors) + .map_err(|e| StorageError::RocksDb(e.to_string()))?; + + Ok(Self { db: Arc::new(db) }) + } + + /// Get the underlying DB handle + pub fn db(&self) -> &Arc { + &self.db + } + + /// Get a column family handle + pub fn cf_handle(&self, name: &str) -> Option>> { + self.db.cf_handle(name) + } + + /// Options for the logs column family + fn logs_cf_options() -> Options { + let mut opts = Options::default(); + // Optimize for sequential reads/writes + opts.set_write_buffer_size(64 * 1024 * 1024); // 64MB + opts.set_max_write_buffer_number(3); + opts + } + + /// Options for the metadata column family + fn meta_cf_options() -> Options { + let mut opts = Options::default(); + // Small, frequently updated + opts.set_write_buffer_size(16 * 1024 * 1024); // 16MB + opts + } + + /// Options for the KV column family + fn kv_cf_options() -> Options { + let mut opts = Options::default(); + // Optimize for point lookups and range scans + opts.set_write_buffer_size(128 * 1024 * 1024); // 128MB + opts.set_max_write_buffer_number(4); + // Enable bloom filters for faster lookups + opts.set_prefix_extractor(rocksdb::SliceTransform::create_fixed_prefix(8)); + opts + } + + /// Options for the snapshot column family + fn snapshot_cf_options() -> Options { + let mut opts = Options::default(); + opts.set_write_buffer_size(32 * 1024 * 1024); // 32MB + opts + } +} + +impl Clone for RocksStore { + fn clone(&self) -> Self { + Self { + db: Arc::clone(&self.db), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[test] + fn test_create_store() { + let dir = tempdir().unwrap(); + let store = RocksStore::new(dir.path()).unwrap(); + + // Verify all column families exist + assert!(store.cf_handle(cf::LOGS).is_some()); + assert!(store.cf_handle(cf::META).is_some()); + assert!(store.cf_handle(cf::KV).is_some()); + assert!(store.cf_handle(cf::SNAPSHOT).is_some()); + } + + #[test] + fn test_reopen_store() { + let 
dir = tempdir().unwrap(); + + // Create and close + { + let store = RocksStore::new(dir.path()).unwrap(); + let cf = store.cf_handle(cf::META).unwrap(); + store + .db() + .put_cf(&cf, b"test_key", b"test_value") + .unwrap(); + } + + // Reopen and verify data persisted + { + let store = RocksStore::new(dir.path()).unwrap(); + let cf = store.cf_handle(cf::META).unwrap(); + let value = store.db().get_cf(&cf, b"test_key").unwrap(); + assert_eq!(value, Some(b"test_value".to_vec())); + } + } +} diff --git a/chainfire/crates/chainfire-types/Cargo.toml b/chainfire/crates/chainfire-types/Cargo.toml new file mode 100644 index 0000000..d9cf471 --- /dev/null +++ b/chainfire/crates/chainfire-types/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "chainfire-types" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "Core types for Chainfire distributed KVS" + +[dependencies] +serde = { workspace = true } +thiserror = { workspace = true } +bytes = { workspace = true } + +[dev-dependencies] +bincode = { workspace = true } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-types/src/command.rs b/chainfire/crates/chainfire-types/src/command.rs new file mode 100644 index 0000000..17c2300 --- /dev/null +++ b/chainfire/crates/chainfire-types/src/command.rs @@ -0,0 +1,270 @@ +//! Raft commands and responses +//! +//! These types are submitted to Raft for consensus and applied to the state machine. 
+ +use crate::kv::KvEntry; +use crate::Revision; +use serde::{Deserialize, Serialize}; + +/// Commands submitted to Raft consensus +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum RaftCommand { + /// Put a key-value pair + Put { + key: Vec, + value: Vec, + lease_id: Option, + /// If true, return the previous value + prev_kv: bool, + }, + + /// Delete a single key + Delete { + key: Vec, + /// If true, return the deleted value + prev_kv: bool, + }, + + /// Delete a range of keys + DeleteRange { + start: Vec, + end: Vec, + /// If true, return deleted values + prev_kv: bool, + }, + + /// Transaction with multiple operations + Txn { + /// Comparison conditions + compare: Vec, + /// Operations to execute if all comparisons succeed + success: Vec, + /// Operations to execute if any comparison fails + failure: Vec, + }, + + /// Grant a new lease + LeaseGrant { + /// Requested lease ID (0 for server-assigned) + id: i64, + /// TTL in seconds + ttl: i64, + }, + + /// Revoke a lease (deletes all attached keys) + LeaseRevoke { + /// Lease ID to revoke + id: i64, + }, + + /// Refresh a lease TTL (keep-alive) + LeaseRefresh { + /// Lease ID to refresh + id: i64, + }, + + /// No-op command for Raft leadership establishment + Noop, +} + +impl Default for RaftCommand { + fn default() -> Self { + Self::Noop + } +} + +/// Comparison for transaction conditions +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Compare { + pub key: Vec, + pub target: CompareTarget, + pub result: CompareResult, +} + +/// What to compare against +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum CompareTarget { + /// Compare the version number + Version(u64), + /// Compare the creation revision + CreateRevision(Revision), + /// Compare the modification revision + ModRevision(Revision), + /// Compare the value + Value(Vec), +} + +/// Comparison operator +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum 
CompareResult { + Equal, + NotEqual, + Greater, + Less, +} + +/// Operation in a transaction +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum TxnOp { + Put { + key: Vec, + value: Vec, + lease_id: Option, + }, + Delete { + key: Vec, + }, + DeleteRange { + start: Vec, + end: Vec, + }, + /// Range query within a transaction + Range { + key: Vec, + range_end: Vec, + limit: i64, + keys_only: bool, + count_only: bool, + }, +} + +/// Response from a single operation in a transaction +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum TxnOpResponse { + /// Response from a Put operation + Put { + prev_kv: Option, + }, + /// Response from a Delete/DeleteRange operation + Delete { + deleted: u64, + prev_kvs: Vec, + }, + /// Response from a Range operation + Range { + kvs: Vec, + count: u64, + more: bool, + }, +} + +/// Response from applying a Raft command +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] +pub struct RaftResponse { + /// Current revision after this operation + pub revision: Revision, + /// Previous key-value (if requested and existed) + pub prev_kv: Option, + /// Number of keys deleted (for delete operations) + pub deleted: u64, + /// Whether transaction succeeded (for Txn) + pub succeeded: bool, + /// Previous key-values for batch deletes + pub prev_kvs: Vec, + /// Lease ID (for lease operations) + pub lease_id: Option, + /// Lease TTL (for lease operations) + pub lease_ttl: Option, + /// Individual operation responses (for Txn) + pub txn_responses: Vec, +} + +impl RaftResponse { + /// Create a simple response with just revision + pub fn new(revision: Revision) -> Self { + Self { + revision, + ..Default::default() + } + } + + /// Create a response with previous key-value + pub fn with_prev_kv(revision: Revision, prev_kv: Option) -> Self { + Self { + revision, + prev_kv, + ..Default::default() + } + } + + /// Create a response for delete operations + pub fn deleted(revision: Revision, 
deleted: u64, prev_kvs: Vec) -> Self { + Self { + revision, + deleted, + prev_kvs, + ..Default::default() + } + } + + /// Create a response for transaction + pub fn txn(revision: Revision, succeeded: bool, txn_responses: Vec) -> Self { + Self { + revision, + succeeded, + txn_responses, + ..Default::default() + } + } + + /// Create a response for lease operations + pub fn lease(revision: Revision, lease_id: i64, ttl: i64) -> Self { + Self { + revision, + lease_id: Some(lease_id), + lease_ttl: Some(ttl), + ..Default::default() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_put_command() { + let cmd = RaftCommand::Put { + key: b"key".to_vec(), + value: b"value".to_vec(), + lease_id: None, + prev_kv: false, + }; + + let serialized = bincode::serialize(&cmd).unwrap(); + let deserialized: RaftCommand = bincode::deserialize(&serialized).unwrap(); + + assert_eq!(cmd, deserialized); + } + + #[test] + fn test_txn_command() { + let cmd = RaftCommand::Txn { + compare: vec![Compare { + key: b"key".to_vec(), + target: CompareTarget::Version(1), + result: CompareResult::Equal, + }], + success: vec![TxnOp::Put { + key: b"key".to_vec(), + value: b"new_value".to_vec(), + lease_id: None, + }], + failure: vec![], + }; + + let serialized = bincode::serialize(&cmd).unwrap(); + let deserialized: RaftCommand = bincode::deserialize(&serialized).unwrap(); + + assert_eq!(cmd, deserialized); + } + + #[test] + fn test_response() { + let entry = KvEntry::new(b"key".to_vec(), b"old".to_vec(), 1); + let response = RaftResponse::with_prev_kv(5, Some(entry.clone())); + + assert_eq!(response.revision, 5); + assert_eq!(response.prev_kv, Some(entry)); + } +} diff --git a/chainfire/crates/chainfire-types/src/error.rs b/chainfire/crates/chainfire-types/src/error.rs new file mode 100644 index 0000000..f8757ed --- /dev/null +++ b/chainfire/crates/chainfire-types/src/error.rs @@ -0,0 +1,164 @@ +//! 
Error types for Chainfire + +use thiserror::Error; + +/// Result type alias using Chainfire's Error +pub type Result = std::result::Result; + +/// Main error type for Chainfire operations +#[derive(Error, Debug)] +pub enum Error { + /// Storage layer errors + #[error("Storage error: {0}")] + Storage(#[from] StorageError), + + /// Raft consensus errors + #[error("Raft error: {0}")] + Raft(#[from] RaftError), + + /// Network/RPC errors + #[error("Network error: {0}")] + Network(#[from] NetworkError), + + /// Watch errors + #[error("Watch error: {0}")] + Watch(#[from] WatchError), + + /// Gossip protocol errors + #[error("Gossip error: {0}")] + Gossip(#[from] GossipError), + + /// Configuration errors + #[error("Configuration error: {0}")] + Config(String), + + /// Serialization errors + #[error("Serialization error: {0}")] + Serialization(String), + + /// Generic internal error + #[error("Internal error: {0}")] + Internal(String), +} + +/// Storage layer errors +#[derive(Error, Debug)] +pub enum StorageError { + #[error("Key not found: {0:?}")] + KeyNotFound(Vec), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("RocksDB error: {0}")] + RocksDb(String), + + #[error("Serialization error: {0}")] + Serialization(String), + + #[error("Snapshot error: {0}")] + Snapshot(String), + + #[error("Log compacted: requested {requested}, compacted to {compacted}")] + LogCompacted { requested: u64, compacted: u64 }, + + #[error("Lease error: {0}")] + LeaseError(String), +} + +/// Raft consensus errors +#[derive(Error, Debug)] +pub enum RaftError { + #[error("Not leader, leader is node {leader_id:?}")] + NotLeader { leader_id: Option }, + + #[error("Node {0} not found")] + NodeNotFound(u64), + + #[error("Proposal failed: {0}")] + ProposalFailed(String), + + #[error("Timeout waiting for consensus")] + Timeout, + + #[error("Cluster not initialized")] + NotInitialized, + + #[error("Already initialized")] + AlreadyInitialized, + + #[error("Internal Raft error: 
{0}")] + Internal(String), +} + +/// Network/RPC errors +#[derive(Error, Debug)] +pub enum NetworkError { + #[error("Connection failed to {addr}: {reason}")] + ConnectionFailed { addr: String, reason: String }, + + #[error("RPC failed: {0}")] + RpcFailed(String), + + #[error("Node {0} unreachable")] + Unreachable(u64), + + #[error("Timeout")] + Timeout, + + #[error("Invalid address: {0}")] + InvalidAddress(String), +} + +/// Watch errors +#[derive(Error, Debug)] +pub enum WatchError { + #[error("Watch {0} not found")] + NotFound(i64), + + #[error("Watch {0} already exists")] + AlreadyExists(i64), + + #[error("Compacted: requested revision {requested}, compacted to {compacted}")] + Compacted { requested: u64, compacted: u64 }, + + #[error("Stream closed")] + StreamClosed, +} + +/// Gossip protocol errors +#[derive(Error, Debug)] +pub enum GossipError { + #[error("Failed to join cluster: {0}")] + JoinFailed(String), + + #[error("Broadcast failed: {0}")] + BroadcastFailed(String), + + #[error("Invalid identity: {0}")] + InvalidIdentity(String), + + #[error("UDP error: {0}")] + Udp(String), +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_error_display() { + let err = Error::Storage(StorageError::KeyNotFound(b"test".to_vec())); + assert!(err.to_string().contains("Key not found")); + + let err = Error::Raft(RaftError::NotLeader { leader_id: Some(1) }); + assert!(err.to_string().contains("Not leader")); + } + + #[test] + fn test_error_conversion() { + let storage_err = StorageError::KeyNotFound(b"key".to_vec()); + let err: Error = storage_err.into(); + assert!(matches!(err, Error::Storage(_))); + } +} diff --git a/chainfire/crates/chainfire-types/src/kv.rs b/chainfire/crates/chainfire-types/src/kv.rs new file mode 100644 index 0000000..87d3004 --- /dev/null +++ b/chainfire/crates/chainfire-types/src/kv.rs @@ -0,0 +1,201 @@ +//! 
Key-Value entry types with MVCC versioning + +use serde::{Deserialize, Serialize}; + +/// Revision number for MVCC-style versioning +/// Each write operation increments the global revision counter +pub type Revision = u64; + +/// A key-value entry with metadata +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct KvEntry { + /// The key + pub key: Vec, + /// The value + pub value: Vec, + /// Revision when this key was created + pub create_revision: Revision, + /// Revision of the last modification + pub mod_revision: Revision, + /// Number of modifications since creation + pub version: u64, + /// Optional lease ID for TTL-based expiration + pub lease_id: Option, +} + +impl KvEntry { + /// Create a new KV entry for initial insertion + pub fn new(key: Vec, value: Vec, revision: Revision) -> Self { + Self { + key, + value, + create_revision: revision, + mod_revision: revision, + version: 1, + lease_id: None, + } + } + + /// Create a new KV entry with lease + pub fn with_lease(key: Vec, value: Vec, revision: Revision, lease_id: i64) -> Self { + Self { + key, + value, + create_revision: revision, + mod_revision: revision, + version: 1, + lease_id: Some(lease_id), + } + } + + /// Update the entry with a new value and revision + pub fn update(&self, value: Vec, revision: Revision) -> Self { + Self { + key: self.key.clone(), + value, + create_revision: self.create_revision, + mod_revision: revision, + version: self.version + 1, + lease_id: self.lease_id, + } + } + + /// Get the key as a string (lossy conversion) + pub fn key_str(&self) -> String { + String::from_utf8_lossy(&self.key).to_string() + } + + /// Get the value as a string (lossy conversion) + pub fn value_str(&self) -> String { + String::from_utf8_lossy(&self.value).to_string() + } + + /// Check if this entry has a lease + pub fn has_lease(&self) -> bool { + self.lease_id.is_some() + } +} + +impl Default for KvEntry { + fn default() -> Self { + Self { + key: Vec::new(), + value: 
Vec::new(), + create_revision: 0, + mod_revision: 0, + version: 0, + lease_id: None, + } + } +} + +/// Range of keys for scan operations +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct KeyRange { + /// Start key (inclusive) + pub start: Vec, + /// End key (exclusive). If None, scan single key or to end + pub end: Option>, +} + +impl KeyRange { + /// Create a range for a single key + pub fn key(key: impl Into>) -> Self { + Self { + start: key.into(), + end: None, + } + } + + /// Create a range from start to end (exclusive) + pub fn range(start: impl Into>, end: impl Into>) -> Self { + Self { + start: start.into(), + end: Some(end.into()), + } + } + + /// Create a prefix range (all keys with given prefix) + pub fn prefix(prefix: impl Into>) -> Self { + let prefix = prefix.into(); + let end = prefix_end(&prefix); + Self { + start: prefix, + end: Some(end), + } + } + + /// Check if this range matches a single key + pub fn is_single_key(&self) -> bool { + self.end.is_none() + } +} + +/// Calculate the end key for a prefix scan +/// For prefix "abc", returns "abd" (increment last byte) +fn prefix_end(prefix: &[u8]) -> Vec { + let mut end = prefix.to_vec(); + for i in (0..end.len()).rev() { + if end[i] < 0xff { + end[i] += 1; + end.truncate(i + 1); + return end; + } + } + // All bytes are 0xff, return empty to indicate no upper bound + Vec::new() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_kv_entry_new() { + let entry = KvEntry::new(b"key".to_vec(), b"value".to_vec(), 1); + + assert_eq!(entry.key, b"key"); + assert_eq!(entry.value, b"value"); + assert_eq!(entry.create_revision, 1); + assert_eq!(entry.mod_revision, 1); + assert_eq!(entry.version, 1); + assert!(entry.lease_id.is_none()); + } + + #[test] + fn test_kv_entry_update() { + let entry = KvEntry::new(b"key".to_vec(), b"value1".to_vec(), 1); + let updated = entry.update(b"value2".to_vec(), 5); + + assert_eq!(updated.key, b"key"); + assert_eq!(updated.value, 
b"value2"); + assert_eq!(updated.create_revision, 1); // Unchanged + assert_eq!(updated.mod_revision, 5); + assert_eq!(updated.version, 2); + } + + #[test] + fn test_prefix_end() { + assert_eq!(prefix_end(b"abc"), b"abd"); + assert_eq!(prefix_end(b"ab\xff"), b"ac"); + assert_eq!(prefix_end(b"\xff\xff"), Vec::::new()); + } + + #[test] + fn test_key_range_prefix() { + let range = KeyRange::prefix("/nodes/"); + + assert_eq!(range.start, b"/nodes/"); + assert_eq!(range.end, Some(b"/nodes0".to_vec())); // '/' + 1 = '0' + } + + #[test] + fn test_kv_serialization() { + let entry = KvEntry::new(b"test".to_vec(), b"data".to_vec(), 42); + + let serialized = bincode::serialize(&entry).unwrap(); + let deserialized: KvEntry = bincode::deserialize(&serialized).unwrap(); + + assert_eq!(entry, deserialized); + } +} diff --git a/chainfire/crates/chainfire-types/src/lease.rs b/chainfire/crates/chainfire-types/src/lease.rs new file mode 100644 index 0000000..c562532 --- /dev/null +++ b/chainfire/crates/chainfire-types/src/lease.rs @@ -0,0 +1,187 @@ +//! Lease types for TTL-based key expiration +//! +//! Leases provide time-to-live (TTL) functionality for keys. When a lease expires +//! or is revoked, all keys attached to it are automatically deleted. 
+ +use serde::{Deserialize, Serialize}; +use std::time::{Duration, Instant}; + +/// Unique identifier for a lease +pub type LeaseId = i64; + +/// A lease with TTL-based expiration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Lease { + /// Unique ID of the lease + pub id: LeaseId, + /// Time-to-live in seconds (as originally granted) + pub ttl: i64, + /// Remaining TTL in seconds (decremented over time) + #[serde(skip)] + pub remaining_ttl: i64, + /// Keys attached to this lease + pub keys: Vec>, + /// When the lease was created (for TTL calculation) + #[serde(skip)] + pub granted_at: Option, +} + +impl Lease { + /// Create a new lease with the given ID and TTL + pub fn new(id: LeaseId, ttl: i64) -> Self { + Self { + id, + ttl, + remaining_ttl: ttl, + keys: Vec::new(), + granted_at: Some(Instant::now()), + } + } + + /// Check if the lease has expired + pub fn is_expired(&self) -> bool { + if let Some(granted_at) = self.granted_at { + let elapsed = granted_at.elapsed(); + elapsed >= Duration::from_secs(self.ttl as u64) + } else { + // If no granted_at, use remaining_ttl + self.remaining_ttl <= 0 + } + } + + /// Get the remaining TTL in seconds + pub fn remaining(&self) -> i64 { + if let Some(granted_at) = self.granted_at { + let elapsed = granted_at.elapsed().as_secs() as i64; + (self.ttl - elapsed).max(0) + } else { + self.remaining_ttl.max(0) + } + } + + /// Refresh the lease TTL (for keep-alive) + pub fn refresh(&mut self) { + self.granted_at = Some(Instant::now()); + self.remaining_ttl = self.ttl; + } + + /// Attach a key to this lease + pub fn attach_key(&mut self, key: Vec) { + if !self.keys.contains(&key) { + self.keys.push(key); + } + } + + /// Detach a key from this lease + pub fn detach_key(&mut self, key: &[u8]) { + self.keys.retain(|k| k != key); + } +} + +/// Persistent lease data (for serialization without Instant) +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LeaseData { + /// Unique ID of the lease + pub id: LeaseId, + 
/// Time-to-live in seconds + pub ttl: i64, + /// Keys attached to this lease + pub keys: Vec>, + /// Unix timestamp when granted (for persistence) + pub granted_at_unix: u64, +} + +impl LeaseData { + /// Create lease data from a lease + pub fn from_lease(lease: &Lease) -> Self { + use std::time::SystemTime; + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + Self { + id: lease.id, + ttl: lease.ttl, + keys: lease.keys.clone(), + granted_at_unix: now, + } + } + + /// Convert to a lease (sets granted_at to now) + pub fn to_lease(&self) -> Lease { + use std::time::SystemTime; + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + let elapsed = (now - self.granted_at_unix) as i64; + let remaining = (self.ttl - elapsed).max(0); + + Lease { + id: self.id, + ttl: self.ttl, + remaining_ttl: remaining, + keys: self.keys.clone(), + granted_at: Some(Instant::now() - Duration::from_secs(elapsed as u64)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::thread; + use std::time::Duration; + + #[test] + fn test_lease_creation() { + let lease = Lease::new(1, 10); + assert_eq!(lease.id, 1); + assert_eq!(lease.ttl, 10); + assert!(!lease.is_expired()); + } + + #[test] + fn test_lease_remaining() { + let lease = Lease::new(1, 10); + let remaining = lease.remaining(); + assert!(remaining >= 9 && remaining <= 10); + } + + #[test] + fn test_lease_attach_key() { + let mut lease = Lease::new(1, 10); + lease.attach_key(b"key1".to_vec()); + lease.attach_key(b"key2".to_vec()); + assert_eq!(lease.keys.len(), 2); + + // Duplicate should not add + lease.attach_key(b"key1".to_vec()); + assert_eq!(lease.keys.len(), 2); + } + + #[test] + fn test_lease_detach_key() { + let mut lease = Lease::new(1, 10); + lease.attach_key(b"key1".to_vec()); + lease.attach_key(b"key2".to_vec()); + + lease.detach_key(b"key1"); + assert_eq!(lease.keys.len(), 1); + assert_eq!(lease.keys[0], b"key2".to_vec()); + 
} + + #[test] + fn test_lease_refresh() { + let mut lease = Lease::new(1, 1); + // Sleep briefly to ensure some time passes + thread::sleep(Duration::from_millis(100)); + + let remaining_before = lease.remaining(); + lease.refresh(); + let remaining_after = lease.remaining(); + + // After refresh, remaining should be back to full TTL + assert!(remaining_after >= remaining_before); + } +} diff --git a/chainfire/crates/chainfire-types/src/lib.rs b/chainfire/crates/chainfire-types/src/lib.rs new file mode 100644 index 0000000..60c19c5 --- /dev/null +++ b/chainfire/crates/chainfire-types/src/lib.rs @@ -0,0 +1,23 @@ +//! Core types for Chainfire distributed Key-Value Store +//! +//! This crate contains all shared type definitions used across the Chainfire system: +//! - Node identification and metadata +//! - Key-Value entry representation with MVCC versioning +//! - Raft commands and responses +//! - Lease types for TTL-based key expiration +//! - Watch event types +//! - Error types + +pub mod command; +pub mod error; +pub mod kv; +pub mod lease; +pub mod node; +pub mod watch; + +pub use command::{RaftCommand, RaftResponse}; +pub use error::{Error, Result}; +pub use kv::{KvEntry, Revision}; +pub use lease::{Lease, LeaseData, LeaseId}; +pub use node::{NodeId, NodeInfo, NodeRole, RaftRole}; +pub use watch::{WatchEvent, WatchEventType, WatchRequest}; diff --git a/chainfire/crates/chainfire-types/src/node.rs b/chainfire/crates/chainfire-types/src/node.rs new file mode 100644 index 0000000..011312a --- /dev/null +++ b/chainfire/crates/chainfire-types/src/node.rs @@ -0,0 +1,255 @@ +//! 
Node identification and metadata types + +use serde::{Deserialize, Serialize}; +use std::net::SocketAddr; + +/// Unique identifier for each node in the cluster +pub type NodeId = u64; + +/// Role of a node in the cluster +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum NodeRole { + /// Control Plane node - participates in Raft consensus + ControlPlane, + /// Worker node - only participates in gossip, watches Control Plane + Worker, +} + +impl Default for NodeRole { + fn default() -> Self { + Self::Worker + } +} + +/// Raft participation role for a node. +/// +/// This determines whether and how a node participates in the Raft consensus protocol. +/// The RaftRole is separate from NodeRole (gossip role) - a node can be a ControlPlane +/// gossip participant without being a Raft voter. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[serde(rename_all = "lowercase")] +pub enum RaftRole { + /// Full voting member in Raft consensus. + /// Participates in leader election and log replication. + #[default] + Voter, + /// Non-voting replica that receives log replication. + /// Can be promoted to Voter via cluster membership change. + Learner, + /// No Raft participation. + /// Node only uses gossip and acts as a client proxy. + None, +} + +impl RaftRole { + /// Check if this role participates in Raft at all. + /// + /// Returns `true` for Voter and Learner, `false` for None. + pub fn participates_in_raft(&self) -> bool { + !matches!(self, RaftRole::None) + } + + /// Check if this role is a voting member. + pub fn is_voter(&self) -> bool { + matches!(self, RaftRole::Voter) + } + + /// Check if this role is a learner (non-voting replica). + pub fn is_learner(&self) -> bool { + matches!(self, RaftRole::Learner) + } + + /// Convert to string representation. 
+ pub fn as_str(&self) -> &'static str { + match self { + RaftRole::Voter => "voter", + RaftRole::Learner => "learner", + RaftRole::None => "none", + } + } +} + +impl std::fmt::Display for RaftRole { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl std::str::FromStr for RaftRole { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "voter" => Ok(RaftRole::Voter), + "learner" => Ok(RaftRole::Learner), + "none" => Ok(RaftRole::None), + _ => Err(format!( + "invalid raft role '{}', expected 'voter', 'learner', or 'none'", + s + )), + } + } +} + +/// Node metadata stored in cluster membership +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct NodeInfo { + /// Unique node identifier + pub id: NodeId, + /// Human-readable node name + pub name: String, + /// Address for Raft RPCs (Control Plane nodes only) + pub raft_addr: Option, + /// Address for client API (gRPC) + pub api_addr: SocketAddr, + /// Address for gossip protocol (UDP) + pub gossip_addr: SocketAddr, + /// Node role in the cluster + pub role: NodeRole, +} + +impl NodeInfo { + /// Create a new Control Plane node info + pub fn control_plane( + id: NodeId, + name: impl Into, + raft_addr: SocketAddr, + api_addr: SocketAddr, + gossip_addr: SocketAddr, + ) -> Self { + Self { + id, + name: name.into(), + raft_addr: Some(raft_addr), + api_addr, + gossip_addr, + role: NodeRole::ControlPlane, + } + } + + /// Create a new Worker node info + pub fn worker( + id: NodeId, + name: impl Into, + api_addr: SocketAddr, + gossip_addr: SocketAddr, + ) -> Self { + Self { + id, + name: name.into(), + raft_addr: None, + api_addr, + gossip_addr, + role: NodeRole::Worker, + } + } + + /// Check if this node is a Control Plane node + pub fn is_control_plane(&self) -> bool { + self.role == NodeRole::ControlPlane + } + + /// Check if this node is a Worker node + pub fn is_worker(&self) -> bool { + 
self.role == NodeRole::Worker + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_control_plane_node() { + let node = NodeInfo::control_plane( + 1, + "cp-1", + "127.0.0.1:5000".parse().unwrap(), + "127.0.0.1:5001".parse().unwrap(), + "127.0.0.1:5002".parse().unwrap(), + ); + + assert_eq!(node.id, 1); + assert_eq!(node.name, "cp-1"); + assert!(node.is_control_plane()); + assert!(!node.is_worker()); + assert!(node.raft_addr.is_some()); + } + + #[test] + fn test_worker_node() { + let node = NodeInfo::worker( + 100, + "worker-1", + "127.0.0.1:6001".parse().unwrap(), + "127.0.0.1:6002".parse().unwrap(), + ); + + assert_eq!(node.id, 100); + assert!(node.is_worker()); + assert!(!node.is_control_plane()); + assert!(node.raft_addr.is_none()); + } + + #[test] + fn test_node_serialization() { + let node = NodeInfo::control_plane( + 1, + "test", + "127.0.0.1:5000".parse().unwrap(), + "127.0.0.1:5001".parse().unwrap(), + "127.0.0.1:5002".parse().unwrap(), + ); + + let serialized = bincode::serialize(&node).unwrap(); + let deserialized: NodeInfo = bincode::deserialize(&serialized).unwrap(); + + assert_eq!(node, deserialized); + } + + #[test] + fn test_raft_role_default() { + let role = RaftRole::default(); + assert_eq!(role, RaftRole::Voter); + assert!(role.participates_in_raft()); + assert!(role.is_voter()); + } + + #[test] + fn test_raft_role_participates() { + assert!(RaftRole::Voter.participates_in_raft()); + assert!(RaftRole::Learner.participates_in_raft()); + assert!(!RaftRole::None.participates_in_raft()); + } + + #[test] + fn test_raft_role_from_str() { + assert_eq!("voter".parse::().unwrap(), RaftRole::Voter); + assert_eq!("learner".parse::().unwrap(), RaftRole::Learner); + assert_eq!("none".parse::().unwrap(), RaftRole::None); + assert_eq!("VOTER".parse::().unwrap(), RaftRole::Voter); + assert!("invalid".parse::().is_err()); + } + + #[test] + fn test_raft_role_display() { + assert_eq!(RaftRole::Voter.to_string(), "voter"); + 
assert_eq!(RaftRole::Learner.to_string(), "learner"); + assert_eq!(RaftRole::None.to_string(), "none"); + } + + #[test] + fn test_raft_role_serialization() { + // Test binary serialization + let serialized = bincode::serialize(&RaftRole::Voter).unwrap(); + let deserialized: RaftRole = bincode::deserialize(&serialized).unwrap(); + assert_eq!(deserialized, RaftRole::Voter); + + // Test all variants + for role in [RaftRole::Voter, RaftRole::Learner, RaftRole::None] { + let serialized = bincode::serialize(&role).unwrap(); + let deserialized: RaftRole = bincode::deserialize(&serialized).unwrap(); + assert_eq!(deserialized, role); + } + } +} diff --git a/chainfire/crates/chainfire-types/src/watch.rs b/chainfire/crates/chainfire-types/src/watch.rs new file mode 100644 index 0000000..6dc0737 --- /dev/null +++ b/chainfire/crates/chainfire-types/src/watch.rs @@ -0,0 +1,266 @@ +//! Watch event types for notifications + +use crate::kv::KvEntry; +use crate::Revision; +use serde::{Deserialize, Serialize}; + +/// Event type for watch notifications +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum WatchEventType { + /// Key was created or updated + Put, + /// Key was deleted + Delete, +} + +/// A single watch event +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct WatchEvent { + /// Type of event (Put or Delete) + pub event_type: WatchEventType, + /// Current key-value (for Put, contains new value; for Delete, contains deleted value) + pub kv: KvEntry, + /// Previous key-value (if requested and existed) + pub prev_kv: Option, +} + +impl WatchEvent { + /// Create a Put event + pub fn put(kv: KvEntry, prev_kv: Option) -> Self { + Self { + event_type: WatchEventType::Put, + kv, + prev_kv, + } + } + + /// Create a Delete event + pub fn delete(kv: KvEntry, prev_kv: Option) -> Self { + Self { + event_type: WatchEventType::Delete, + kv, + prev_kv, + } + } + + /// Check if this is a Put event + pub fn is_put(&self) -> bool { + 
self.event_type == WatchEventType::Put + } + + /// Check if this is a Delete event + pub fn is_delete(&self) -> bool { + self.event_type == WatchEventType::Delete + } +} + +/// Watch subscription request +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct WatchRequest { + /// Unique identifier for this watch + pub watch_id: i64, + /// Key to watch + pub key: Vec, + /// Range end for prefix/range watches. None = single key + pub range_end: Option>, + /// Start watching from this revision. None = from current + pub start_revision: Option, + /// Include previous value in events + pub prev_kv: bool, + /// Send periodic progress notifications + pub progress_notify: bool, +} + +impl WatchRequest { + /// Create a watch for a single key + pub fn key(watch_id: i64, key: impl Into>) -> Self { + Self { + watch_id, + key: key.into(), + range_end: None, + start_revision: None, + prev_kv: false, + progress_notify: false, + } + } + + /// Create a watch for all keys with a prefix + pub fn prefix(watch_id: i64, prefix: impl Into>) -> Self { + let prefix = prefix.into(); + let range_end = crate::kv::KeyRange::prefix(prefix.clone()) + .end + .unwrap_or_default(); + + Self { + watch_id, + key: prefix, + range_end: Some(range_end), + start_revision: None, + prev_kv: false, + progress_notify: false, + } + } + + /// Create a watch for a range of keys + pub fn range(watch_id: i64, start: impl Into>, end: impl Into>) -> Self { + Self { + watch_id, + key: start.into(), + range_end: Some(end.into()), + start_revision: None, + prev_kv: false, + progress_notify: false, + } + } + + /// Set start revision + pub fn from_revision(mut self, revision: Revision) -> Self { + self.start_revision = Some(revision); + self + } + + /// Request previous values in events + pub fn with_prev_kv(mut self) -> Self { + self.prev_kv = true; + self + } + + /// Request progress notifications + pub fn with_progress_notify(mut self) -> Self { + self.progress_notify = true; + self + } + + /// 
Check if this watch matches a key + pub fn matches(&self, key: &[u8]) -> bool { + match &self.range_end { + None => self.key == key, + Some(end) => { + if end.is_empty() { + // Empty end means all keys >= start + key >= self.key.as_slice() + } else { + key >= self.key.as_slice() && key < end.as_slice() + } + } + } + } +} + +/// Response for a watch stream +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct WatchResponse { + /// Watch ID this response is for + pub watch_id: i64, + /// True if this is a watch creation confirmation + pub created: bool, + /// True if the watch was canceled + pub canceled: bool, + /// Current revision (for progress notifications) + pub compact_revision: Revision, + /// Events in this response + pub events: Vec, +} + +impl WatchResponse { + /// Create a creation confirmation response + pub fn created(watch_id: i64) -> Self { + Self { + watch_id, + created: true, + canceled: false, + compact_revision: 0, + events: Vec::new(), + } + } + + /// Create a cancellation response + pub fn canceled(watch_id: i64) -> Self { + Self { + watch_id, + created: false, + canceled: true, + compact_revision: 0, + events: Vec::new(), + } + } + + /// Create an events response + pub fn events(watch_id: i64, events: Vec) -> Self { + Self { + watch_id, + created: false, + canceled: false, + compact_revision: 0, + events, + } + } + + /// Create a progress notification + pub fn progress(watch_id: i64, revision: Revision) -> Self { + Self { + watch_id, + created: false, + canceled: false, + compact_revision: revision, + events: Vec::new(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_watch_event_put() { + let kv = KvEntry::new(b"key".to_vec(), b"value".to_vec(), 1); + let event = WatchEvent::put(kv.clone(), None); + + assert!(event.is_put()); + assert!(!event.is_delete()); + assert_eq!(event.kv, kv); + } + + #[test] + fn test_watch_request_single_key() { + let req = WatchRequest::key(1, "/test/key"); + + 
assert!(req.matches(b"/test/key")); + assert!(!req.matches(b"/test/key2")); + assert!(!req.matches(b"/test")); + } + + #[test] + fn test_watch_request_prefix() { + let req = WatchRequest::prefix(1, "/nodes/"); + + assert!(req.matches(b"/nodes/node1")); + assert!(req.matches(b"/nodes/node2/tasks")); + assert!(!req.matches(b"/nodes")); // No trailing slash + assert!(!req.matches(b"/other/path")); + } + + #[test] + fn test_watch_request_range() { + let req = WatchRequest::range(1, "a", "d"); + + assert!(req.matches(b"a")); + assert!(req.matches(b"b")); + assert!(req.matches(b"c")); + assert!(!req.matches(b"d")); // End is exclusive + assert!(!req.matches(b"e")); + } + + #[test] + fn test_watch_serialization() { + let req = WatchRequest::prefix(42, "/test/") + .from_revision(100) + .with_prev_kv(); + + let serialized = bincode::serialize(&req).unwrap(); + let deserialized: WatchRequest = bincode::deserialize(&serialized).unwrap(); + + assert_eq!(req, deserialized); + } +} diff --git a/chainfire/crates/chainfire-watch/Cargo.toml b/chainfire/crates/chainfire-watch/Cargo.toml new file mode 100644 index 0000000..04b3591 --- /dev/null +++ b/chainfire/crates/chainfire-watch/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "chainfire-watch" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "Watch/notification system for Chainfire distributed KVS" + +[dependencies] +chainfire-types = { workspace = true } + +# Async +tokio = { workspace = true } +tokio-stream = { workspace = true } +futures = { workspace = true } + +# Utilities +tracing = { workspace = true } +parking_lot = { workspace = true } +dashmap = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } + +[lints] +workspace = true diff --git a/chainfire/crates/chainfire-watch/src/lib.rs b/chainfire/crates/chainfire-watch/src/lib.rs new file mode 100644 index 0000000..23b5241 --- /dev/null 
//! Key matching utilities for watch subscriptions

/// Key matcher for watch subscriptions.
///
/// A matcher is either a single-key match (`range_end == None`) or a
/// half-open range match `[key, range_end)`; an empty `range_end` means
/// "no upper bound" (all keys >= `key`).
#[derive(Debug, Clone)]
pub struct KeyMatcher {
    /// Start key
    key: Vec<u8>,
    /// Range end (exclusive). None = single key match
    range_end: Option<Vec<u8>>,
}

impl KeyMatcher {
    /// Create a matcher for a single key.
    pub fn key(key: impl Into<Vec<u8>>) -> Self {
        Self {
            key: key.into(),
            range_end: None,
        }
    }

    /// Create a matcher for a key range `[key, range_end)`.
    pub fn range(key: impl Into<Vec<u8>>, range_end: impl Into<Vec<u8>>) -> Self {
        Self {
            key: key.into(),
            range_end: Some(range_end.into()),
        }
    }

    /// Create a matcher for all keys with a given prefix.
    pub fn prefix(prefix: impl Into<Vec<u8>>) -> Self {
        let prefix = prefix.into();
        let range_end = prefix_end(&prefix);
        Self {
            key: prefix,
            range_end: Some(range_end),
        }
    }

    /// Create a matcher for all keys (start at [0], no upper bound).
    pub fn all() -> Self {
        Self {
            key: vec![0],
            range_end: Some(vec![]),
        }
    }

    /// Check if a key matches this matcher.
    pub fn matches(&self, target: &[u8]) -> bool {
        match &self.range_end {
            None => self.key == target,
            Some(end) => {
                if end.is_empty() {
                    // Empty end means all keys >= start
                    target >= self.key.as_slice()
                } else {
                    target >= self.key.as_slice() && target < end.as_slice()
                }
            }
        }
    }

    /// Get the start key.
    pub fn start_key(&self) -> &[u8] {
        &self.key
    }

    /// Get the range end (exclusive), if any.
    pub fn range_end(&self) -> Option<&[u8]> {
        self.range_end.as_deref()
    }

    /// Check if this is a single key match.
    pub fn is_single_key(&self) -> bool {
        self.range_end.is_none()
    }

    /// Check if this is a prefix/range match.
    pub fn is_prefix(&self) -> bool {
        self.range_end.is_some()
    }
}

/// Calculate the exclusive end key for a prefix scan.
/// For prefix "abc", returns "abd" (increment last non-0xff byte, truncate
/// after it). If every byte is 0xff, returns an empty vector to indicate
/// "no upper bound".
fn prefix_end(prefix: &[u8]) -> Vec<u8> {
    let mut end = prefix.to_vec();
    for i in (0..end.len()).rev() {
        if end[i] < 0xff {
            end[i] += 1;
            end.truncate(i + 1);
            return end;
        }
    }
    // All bytes are 0xff, return empty to indicate no upper bound
    Vec::new()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_single_key_match() {
        let matcher = KeyMatcher::key(b"/nodes/1".to_vec());

        assert!(matcher.matches(b"/nodes/1"));
        assert!(!matcher.matches(b"/nodes/2"));
        assert!(!matcher.matches(b"/nodes/10"));
        assert!(!matcher.matches(b"/nodes"));
    }

    #[test]
    fn test_prefix_match() {
        let matcher = KeyMatcher::prefix(b"/nodes/".to_vec());

        assert!(matcher.matches(b"/nodes/1"));
        assert!(matcher.matches(b"/nodes/abc"));
        assert!(matcher.matches(b"/nodes/"));
        assert!(!matcher.matches(b"/nodes"));
        assert!(!matcher.matches(b"/tasks/1"));
    }

    #[test]
    fn test_range_match() {
        let matcher = KeyMatcher::range(b"a".to_vec(), b"d".to_vec());

        assert!(matcher.matches(b"a"));
        assert!(matcher.matches(b"b"));
        assert!(matcher.matches(b"c"));
        assert!(!matcher.matches(b"d")); // End is exclusive
        assert!(!matcher.matches(b"e"));
    }

    #[test]
    fn test_all_match() {
        let matcher = KeyMatcher::all();

        assert!(matcher.matches(b"any"));
        assert!(matcher.matches(b"/path/to/key"));
        assert!(matcher.matches(b"\xff\xff\xff"));
    }

    #[test]
    fn test_prefix_end() {
        assert_eq!(prefix_end(b"abc"), b"abd");
        assert_eq!(prefix_end(b"ab\xff"), b"ac");
        assert_eq!(prefix_end(b"a\xff\xff"), b"b");
        assert_eq!(prefix_end(b"\xff\xff"), Vec::<u8>::new());
    }
}
Watch subscription registry + +use crate::matcher::KeyMatcher; +use crate::next_watch_id; +use chainfire_types::watch::{WatchEvent, WatchRequest, WatchResponse}; +use chainfire_types::Revision; +use dashmap::DashMap; +use parking_lot::RwLock; +use std::collections::{BTreeMap, HashSet}; +use tokio::sync::mpsc; +use tracing::{debug, trace, warn}; + +/// A registered watch subscription +struct WatchSubscription { + watch_id: i64, + matcher: KeyMatcher, + prev_kv: bool, + created_revision: Revision, + sender: mpsc::Sender, +} + +/// Registry for all active watch subscriptions +pub struct WatchRegistry { + /// Map of watch_id -> subscription + watches: DashMap, + /// Index: key prefix -> watch_ids for efficient dispatch + /// Uses BTreeMap for prefix range queries + prefix_index: RwLock, HashSet>>, + /// Current revision for progress notifications + current_revision: RwLock, +} + +impl WatchRegistry { + /// Create a new watch registry + pub fn new() -> Self { + Self { + watches: DashMap::new(), + prefix_index: RwLock::new(BTreeMap::new()), + current_revision: RwLock::new(0), + } + } + + /// Update current revision + pub fn set_revision(&self, revision: Revision) { + *self.current_revision.write() = revision; + } + + /// Get current revision + pub fn current_revision(&self) -> Revision { + *self.current_revision.read() + } + + /// Create a new watch subscription + pub fn create_watch( + &self, + req: WatchRequest, + sender: mpsc::Sender, + ) -> i64 { + let watch_id = if req.watch_id != 0 { + req.watch_id + } else { + next_watch_id() + }; + + let matcher = if let Some(ref end) = req.range_end { + KeyMatcher::range(req.key.clone(), end.clone()) + } else { + KeyMatcher::key(req.key.clone()) + }; + + let subscription = WatchSubscription { + watch_id, + matcher, + prev_kv: req.prev_kv, + created_revision: req.start_revision.unwrap_or_else(|| self.current_revision()), + sender, + }; + + // Add to watches + self.watches.insert(watch_id, subscription); + + // Add to prefix index 
+ { + let mut index = self.prefix_index.write(); + index + .entry(req.key.clone()) + .or_insert_with(HashSet::new) + .insert(watch_id); + } + + debug!(watch_id, key = ?String::from_utf8_lossy(&req.key), "Created watch"); + watch_id + } + + /// Cancel a watch + pub fn cancel_watch(&self, watch_id: i64) -> bool { + if let Some((_, sub)) = self.watches.remove(&watch_id) { + // Remove from prefix index + let mut index = self.prefix_index.write(); + if let Some(ids) = index.get_mut(sub.matcher.start_key()) { + ids.remove(&watch_id); + if ids.is_empty() { + index.remove(sub.matcher.start_key()); + } + } + debug!(watch_id, "Canceled watch"); + true + } else { + false + } + } + + /// Get watch count + pub fn watch_count(&self) -> usize { + self.watches.len() + } + + /// Dispatch an event to matching watches + pub async fn dispatch_event(&self, event: WatchEvent) { + let key = &event.kv.key; + let revision = event.kv.mod_revision; + + // Update current revision + { + let mut current = self.current_revision.write(); + if revision > *current { + *current = revision; + } + } + + // Find all matching watches + let matching_ids = self.find_matching_watches(key); + + trace!( + key = ?String::from_utf8_lossy(key), + matches = matching_ids.len(), + "Dispatching event" + ); + + for watch_id in matching_ids { + if let Some(sub) = self.watches.get(&watch_id) { + // Check if event revision is after watch creation + if revision > sub.created_revision { + let response = WatchResponse::events( + watch_id, + vec![if sub.prev_kv { + event.clone() + } else { + WatchEvent { + event_type: event.event_type, + kv: event.kv.clone(), + prev_kv: None, + } + }], + ); + + // Non-blocking send + if sub.sender.try_send(response).is_err() { + warn!(watch_id, "Watch channel full or closed"); + } + } + } + } + } + + /// Find watches that match a key + fn find_matching_watches(&self, key: &[u8]) -> Vec { + let mut result = Vec::new(); + + // Check each subscription for match + // This is O(n) but can be 
optimized with better indexing + for entry in self.watches.iter() { + if entry.matcher.matches(key) { + result.push(*entry.key()); + } + } + + result + } + + /// Send progress notification to all watches + pub async fn send_progress(&self) { + let revision = self.current_revision(); + + for entry in self.watches.iter() { + let response = WatchResponse::progress(entry.watch_id, revision); + if entry.sender.try_send(response).is_err() { + trace!(watch_id = entry.watch_id, "Progress notification dropped"); + } + } + } + + /// Remove watches with closed channels + pub fn cleanup_closed(&self) { + let closed_ids: Vec = self + .watches + .iter() + .filter(|entry| entry.sender.is_closed()) + .map(|entry| *entry.key()) + .collect(); + + for id in closed_ids { + self.cancel_watch(id); + } + } +} + +impl Default for WatchRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chainfire_types::kv::KvEntry; + use chainfire_types::watch::WatchEventType; + + fn create_test_event(key: &[u8], value: &[u8], revision: u64) -> WatchEvent { + WatchEvent { + event_type: WatchEventType::Put, + kv: KvEntry::new(key.to_vec(), value.to_vec(), revision), + prev_kv: None, + } + } + + #[tokio::test] + async fn test_create_and_cancel_watch() { + let registry = WatchRegistry::new(); + let (tx, _rx) = mpsc::channel(10); + + let req = WatchRequest::key(1, b"/test/key"); + let watch_id = registry.create_watch(req, tx); + + assert_eq!(watch_id, 1); + assert_eq!(registry.watch_count(), 1); + + assert!(registry.cancel_watch(watch_id)); + assert_eq!(registry.watch_count(), 0); + } + + #[tokio::test] + async fn test_dispatch_to_single_key_watch() { + let registry = WatchRegistry::new(); + let (tx, mut rx) = mpsc::channel(10); + + let req = WatchRequest::key(1, b"/test/key"); + registry.create_watch(req, tx); + + // Dispatch matching event + let event = create_test_event(b"/test/key", b"value", 1); + registry.dispatch_event(event).await; + + // Should 
receive event + let response = rx.try_recv().unwrap(); + assert_eq!(response.watch_id, 1); + assert_eq!(response.events.len(), 1); + assert_eq!(response.events[0].kv.key, b"/test/key"); + } + + #[tokio::test] + async fn test_dispatch_to_prefix_watch() { + let registry = WatchRegistry::new(); + let (tx, mut rx) = mpsc::channel(10); + + let req = WatchRequest::prefix(1, b"/nodes/"); + registry.create_watch(req, tx); + + // Dispatch matching events + registry + .dispatch_event(create_test_event(b"/nodes/1", b"data1", 1)) + .await; + registry + .dispatch_event(create_test_event(b"/nodes/2", b"data2", 2)) + .await; + registry + .dispatch_event(create_test_event(b"/tasks/1", b"other", 3)) + .await; + + // Should receive 2 events (not /tasks/1) + let resp1 = rx.try_recv().unwrap(); + let resp2 = rx.try_recv().unwrap(); + assert!(rx.try_recv().is_err()); + + assert_eq!(resp1.events[0].kv.key, b"/nodes/1"); + assert_eq!(resp2.events[0].kv.key, b"/nodes/2"); + } + + #[tokio::test] + async fn test_revision_filtering() { + let registry = WatchRegistry::new(); + registry.set_revision(5); + + let (tx, mut rx) = mpsc::channel(10); + + // Watch starting from revision 10 + let req = WatchRequest::key(1, b"/key").from_revision(10); + registry.create_watch(req, tx); + + // Event at revision 8 (before watch start) + registry + .dispatch_event(create_test_event(b"/key", b"old", 8)) + .await; + + // Event at revision 12 (after watch start) + registry + .dispatch_event(create_test_event(b"/key", b"new", 12)) + .await; + + // Should only receive the second event + let response = rx.try_recv().unwrap(); + assert_eq!(response.events[0].kv.mod_revision, 12); + assert!(rx.try_recv().is_err()); + } + + #[tokio::test] + async fn test_multiple_watches() { + let registry = WatchRegistry::new(); + + let (tx1, mut rx1) = mpsc::channel(10); + let (tx2, mut rx2) = mpsc::channel(10); + + registry.create_watch(WatchRequest::prefix(1, b"/a/"), tx1); + registry.create_watch(WatchRequest::prefix(2, 
b"/a/b/"), tx2); + + // Event matching both watches + registry + .dispatch_event(create_test_event(b"/a/b/c", b"value", 1)) + .await; + + // Both should receive the event + assert!(rx1.try_recv().is_ok()); + assert!(rx2.try_recv().is_ok()); + } + + #[tokio::test] + async fn test_cleanup_closed() { + let registry = WatchRegistry::new(); + + let (tx, rx) = mpsc::channel(10); + registry.create_watch(WatchRequest::key(1, b"/test"), tx); + + assert_eq!(registry.watch_count(), 1); + + // Drop the receiver to close the channel + drop(rx); + + // Cleanup should remove the watch + registry.cleanup_closed(); + assert_eq!(registry.watch_count(), 0); + } +} diff --git a/chainfire/crates/chainfire-watch/src/stream.rs b/chainfire/crates/chainfire-watch/src/stream.rs new file mode 100644 index 0000000..2993efd --- /dev/null +++ b/chainfire/crates/chainfire-watch/src/stream.rs @@ -0,0 +1,190 @@ +//! Watch stream management + +use crate::WatchRegistry; +use chainfire_types::watch::{WatchRequest, WatchResponse}; +use std::collections::HashSet; +use std::sync::Arc; +use tokio::sync::mpsc; +use tracing::{debug, trace}; + +/// Manages watch subscriptions for a single client stream +pub struct WatchStream { + /// Reference to the global registry + registry: Arc, + /// Watch IDs owned by this stream + active_watches: HashSet, + /// Channel for sending events to the client + event_tx: mpsc::Sender, +} + +impl WatchStream { + /// Create a new watch stream + pub fn new(registry: Arc, event_tx: mpsc::Sender) -> Self { + Self { + registry, + active_watches: HashSet::new(), + event_tx, + } + } + + /// Handle a create watch request + pub fn create_watch(&mut self, req: WatchRequest) -> WatchResponse { + let watch_id = self.registry.create_watch(req, self.event_tx.clone()); + self.active_watches.insert(watch_id); + + debug!(watch_id, "Stream created watch"); + WatchResponse::created(watch_id) + } + + /// Handle a cancel watch request + pub fn cancel_watch(&mut self, watch_id: i64) -> 
WatchResponse { + let canceled = if self.active_watches.remove(&watch_id) { + self.registry.cancel_watch(watch_id) + } else { + false + }; + + debug!(watch_id, canceled, "Stream canceled watch"); + WatchResponse::canceled(watch_id) + } + + /// Get the number of active watches in this stream + pub fn watch_count(&self) -> usize { + self.active_watches.len() + } + + /// Get active watch IDs + pub fn watch_ids(&self) -> impl Iterator + '_ { + self.active_watches.iter().copied() + } +} + +impl Drop for WatchStream { + fn drop(&mut self) { + // Clean up all watches when stream closes + for watch_id in self.active_watches.drain() { + self.registry.cancel_watch(watch_id); + trace!(watch_id, "Cleaned up watch on stream close"); + } + } +} + +/// Handle for spawning watch event processor +pub struct WatchEventHandler { + registry: Arc, +} + +impl WatchEventHandler { + /// Create a new event handler + pub fn new(registry: Arc) -> Self { + Self { registry } + } + + /// Spawn a background task that processes watch events + pub fn spawn_dispatcher( + self, + mut event_rx: mpsc::UnboundedReceiver, + ) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + while let Some(event) = event_rx.recv().await { + self.registry.dispatch_event(event).await; + } + debug!("Watch event dispatcher stopped"); + }) + } + + /// Spawn a background task for progress notifications + pub fn spawn_progress_notifier( + registry: Arc, + interval: std::time::Duration, + ) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + let mut ticker = tokio::time::interval(interval); + loop { + ticker.tick().await; + registry.send_progress().await; + } + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chainfire_types::kv::KvEntry; + use chainfire_types::watch::{WatchEvent, WatchEventType}; + + #[tokio::test] + async fn test_watch_stream_lifecycle() { + let registry = Arc::new(WatchRegistry::new()); + let (tx, mut rx) = mpsc::channel(10); + + let mut stream = 
WatchStream::new(Arc::clone(®istry), tx); + + // Create watch + let req = WatchRequest::key(0, b"/test"); + let response = stream.create_watch(req); + assert!(response.created); + + let watch_id = response.watch_id; + assert_eq!(stream.watch_count(), 1); + assert_eq!(registry.watch_count(), 1); + + // Cancel watch + let response = stream.cancel_watch(watch_id); + assert!(response.canceled); + assert_eq!(stream.watch_count(), 0); + assert_eq!(registry.watch_count(), 0); + } + + #[tokio::test] + async fn test_watch_stream_cleanup_on_drop() { + let registry = Arc::new(WatchRegistry::new()); + let (tx, _rx) = mpsc::channel(10); + + { + let mut stream = WatchStream::new(Arc::clone(®istry), tx); + stream.create_watch(WatchRequest::key(0, b"/a")); + stream.create_watch(WatchRequest::key(0, b"/b")); + stream.create_watch(WatchRequest::key(0, b"/c")); + + assert_eq!(registry.watch_count(), 3); + } + // Stream dropped here + + // Registry should be cleaned up + assert_eq!(registry.watch_count(), 0); + } + + #[tokio::test] + async fn test_event_handler() { + let registry = Arc::new(WatchRegistry::new()); + let (event_tx, event_rx) = mpsc::unbounded_channel(); + let (watch_tx, mut watch_rx) = mpsc::channel(10); + + // Create a watch + let req = WatchRequest::key(1, b"/test"); + registry.create_watch(req, watch_tx); + + // Start event handler + let handler = WatchEventHandler::new(Arc::clone(®istry)); + let handle = handler.spawn_dispatcher(event_rx); + + // Send an event + event_tx + .send(WatchEvent { + event_type: WatchEventType::Put, + kv: KvEntry::new(b"/test".to_vec(), b"value".to_vec(), 1), + prev_kv: None, + }) + .unwrap(); + + // Should receive the event + let response = watch_rx.recv().await.unwrap(); + assert_eq!(response.events.len(), 1); + + // Cleanup + drop(event_tx); + handle.await.unwrap(); + } +} diff --git a/chainfire/flake.lock b/chainfire/flake.lock new file mode 100644 index 0000000..0bda89c --- /dev/null +++ b/chainfire/flake.lock @@ -0,0 +1,96 @@ +{ + 
"nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1764517877, + "narHash": "sha256-pp3uT4hHijIC8JUK5MEqeAWmParJrgBVzHLNfJDZxg4=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2d293cbfa5a793b4c50d17c05ef9e385b90edf6c", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_2": { + "locked": { + "lastModified": 1744536153, + "narHash": "sha256-awS2zRgF4uTwrOKwwiJcByDzDOdo3Q1rPZbiHQg/N38=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "18dd725c29603f582cf1900e0d25f9f1063dbf11", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay" + } + }, + "rust-overlay": { + "inputs": { + "nixpkgs": "nixpkgs_2" + }, + "locked": { + "lastModified": 1764729618, + "narHash": "sha256-z4RA80HCWv2los1KD346c+PwNPzMl79qgl7bCVgz8X0=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "52764074a85145d5001bf0aa30cb71936e9ad5b8", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + 
"version": 7 +} diff --git a/chainfire/flake.nix b/chainfire/flake.nix new file mode 100644 index 0000000..47e2704 --- /dev/null +++ b/chainfire/flake.nix @@ -0,0 +1,79 @@ +{ + description = "Chainfire - Distributed Key-Value Store with Raft and Gossip"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + rust-overlay.url = "github:oxalica/rust-overlay"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, rust-overlay, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + overlays = [ (import rust-overlay) ]; + pkgs = import nixpkgs { + inherit system overlays; + }; + + rustToolchain = pkgs.rust-bin.stable.latest.default.override { + extensions = [ "rust-src" "rust-analyzer" ]; + }; + + nativeBuildInputs = with pkgs; [ + rustToolchain + pkg-config + protobuf + cmake + ]; + + buildInputs = with pkgs; [ + # For RocksDB bindgen + llvmPackages.libclang + llvmPackages.clang + + # RocksDB build dependencies (let cargo build rocksdb from source) + snappy + lz4 + zstd + zlib + bzip2 + + # OpenSSL for potential TLS support + openssl + ]; + + # Environment variables for build + shellHook = '' + export LIBCLANG_PATH="${pkgs.llvmPackages.libclang.lib}/lib" + export PROTOC="${pkgs.protobuf}/bin/protoc" + ''; + + in + { + devShells.default = pkgs.mkShell { + inherit nativeBuildInputs buildInputs shellHook; + + LIBCLANG_PATH = "${pkgs.llvmPackages.libclang.lib}/lib"; + PROTOC = "${pkgs.protobuf}/bin/protoc"; + }; + + packages.default = pkgs.rustPlatform.buildRustPackage { + pname = "chainfire"; + version = "0.1.0"; + src = ./.; + + cargoLock = { + lockFile = ./Cargo.lock; + }; + + inherit nativeBuildInputs buildInputs; + + LIBCLANG_PATH = "${pkgs.llvmPackages.libclang.lib}/lib"; + PROTOC = "${pkgs.protobuf}/bin/protoc"; + + # Skip tests during nix build (run separately) + doCheck = false; + }; + } + ); +} diff --git a/chainfire/proto/chainfire.proto b/chainfire/proto/chainfire.proto new file mode 100644 index 
0000000..27a852b --- /dev/null +++ b/chainfire/proto/chainfire.proto @@ -0,0 +1,414 @@ +syntax = "proto3"; + +package chainfire.v1; + +// Key-Value service +service KV { + // Range gets the keys in the range from the key-value store + rpc Range(RangeRequest) returns (RangeResponse); + + // Put puts the given key into the key-value store + rpc Put(PutRequest) returns (PutResponse); + + // Delete deletes the given range from the key-value store + rpc Delete(DeleteRangeRequest) returns (DeleteRangeResponse); + + // Txn processes multiple requests in a single transaction + rpc Txn(TxnRequest) returns (TxnResponse); +} + +// Watch service +service Watch { + // Watch watches for events happening or that have happened + rpc Watch(stream WatchRequest) returns (stream WatchResponse); +} + +// Cluster management service +service Cluster { + // MemberAdd adds a member into the cluster + rpc MemberAdd(MemberAddRequest) returns (MemberAddResponse); + + // MemberRemove removes an existing member from the cluster + rpc MemberRemove(MemberRemoveRequest) returns (MemberRemoveResponse); + + // MemberList lists all the members in the cluster + rpc MemberList(MemberListRequest) returns (MemberListResponse); + + // Status gets the status of the cluster + rpc Status(StatusRequest) returns (StatusResponse); +} + +// Lease service for TTL-based key expiration +service Lease { + // LeaseGrant creates a new lease with a given TTL + rpc LeaseGrant(LeaseGrantRequest) returns (LeaseGrantResponse); + + // LeaseRevoke revokes a lease, deleting all keys attached to it + rpc LeaseRevoke(LeaseRevokeRequest) returns (LeaseRevokeResponse); + + // LeaseKeepAlive keeps a lease alive by refreshing its TTL + rpc LeaseKeepAlive(stream LeaseKeepAliveRequest) returns (stream LeaseKeepAliveResponse); + + // LeaseTimeToLive retrieves lease information + rpc LeaseTimeToLive(LeaseTimeToLiveRequest) returns (LeaseTimeToLiveResponse); + + // LeaseLeases lists all existing leases + rpc 
LeaseLeases(LeaseLeasesRequest) returns (LeaseLeasesResponse); +} + +// Response header included in all responses +message ResponseHeader { + // cluster_id is the ID of the cluster + uint64 cluster_id = 1; + // member_id is the ID of the responding member + uint64 member_id = 2; + // revision is the key-value store revision + int64 revision = 3; + // raft_term is the current Raft term + uint64 raft_term = 4; +} + +// Key-value pair +message KeyValue { + // key is the key in bytes + bytes key = 1; + // create_revision is the revision of last creation + int64 create_revision = 2; + // mod_revision is the revision of last modification + int64 mod_revision = 3; + // version is the version of the key + int64 version = 4; + // value is the value held by the key + bytes value = 5; + // lease is the ID of the lease attached to the key + int64 lease = 6; +} + +// ========== Range ========== + +message RangeRequest { + // key is the first key for the range + bytes key = 1; + // range_end is the upper bound on the requested range + bytes range_end = 2; + // limit is a limit on the number of keys returned + int64 limit = 3; + // revision is the point-in-time of the store to use + int64 revision = 4; + // keys_only when set returns only the keys and not the values + bool keys_only = 5; + // count_only when set returns only the count of the keys + bool count_only = 6; + // serializable sets the range request to use serializable (local) reads. + // When true, reads from local state (faster, but may be stale). + // When false (default), uses linearizable reads through Raft (consistent). 
+ bool serializable = 7; +} + +message RangeResponse { + ResponseHeader header = 1; + // kvs is the list of key-value pairs matched by the range request + repeated KeyValue kvs = 2; + // more indicates if there are more keys to return + bool more = 3; + // count is set to the number of keys within the range + int64 count = 4; +} + +// ========== Put ========== + +message PutRequest { + // key is the key to put + bytes key = 1; + // value is the value to put + bytes value = 2; + // lease is the lease ID to attach to the key + int64 lease = 3; + // prev_kv when set returns the previous key-value pair + bool prev_kv = 4; +} + +message PutResponse { + ResponseHeader header = 1; + // prev_kv is the key-value pair before the put + KeyValue prev_kv = 2; +} + +// ========== Delete ========== + +message DeleteRangeRequest { + // key is the first key to delete + bytes key = 1; + // range_end is the key following the last key to delete + bytes range_end = 2; + // prev_kv when set returns deleted key-value pairs + bool prev_kv = 3; +} + +message DeleteRangeResponse { + ResponseHeader header = 1; + // deleted is the number of keys deleted + int64 deleted = 2; + // prev_kvs holds the deleted key-value pairs + repeated KeyValue prev_kvs = 3; +} + +// ========== Transaction ========== + +message TxnRequest { + // compare is a list of predicates + repeated Compare compare = 1; + // success is a list of operations to apply if all comparisons succeed + repeated RequestOp success = 2; + // failure is a list of operations to apply if any comparison fails + repeated RequestOp failure = 3; +} + +message TxnResponse { + ResponseHeader header = 1; + // succeeded is set to true if all comparisons evaluated to true + bool succeeded = 2; + // responses is a list of responses corresponding to the results + repeated ResponseOp responses = 3; +} + +message Compare { + enum CompareResult { + EQUAL = 0; + GREATER = 1; + LESS = 2; + NOT_EQUAL = 3; + } + enum CompareTarget { + VERSION = 0; + CREATE 
= 1; + MOD = 2; + VALUE = 3; + } + CompareResult result = 1; + CompareTarget target = 2; + bytes key = 3; + oneof target_union { + int64 version = 4; + int64 create_revision = 5; + int64 mod_revision = 6; + bytes value = 7; + } +} + +message RequestOp { + oneof request { + RangeRequest request_range = 1; + PutRequest request_put = 2; + DeleteRangeRequest request_delete_range = 3; + } +} + +message ResponseOp { + oneof response { + RangeResponse response_range = 1; + PutResponse response_put = 2; + DeleteRangeResponse response_delete_range = 3; + } +} + +// ========== Watch ========== + +message WatchRequest { + oneof request_union { + WatchCreateRequest create_request = 1; + WatchCancelRequest cancel_request = 2; + WatchProgressRequest progress_request = 3; + } +} + +message WatchCreateRequest { + // key is the key to watch + bytes key = 1; + // range_end is the end of the range to watch + bytes range_end = 2; + // start_revision is an optional revision to start watching from + int64 start_revision = 3; + // progress_notify is set to true to enable progress notifications + bool progress_notify = 4; + // prev_kv when set includes previous key-value in events + bool prev_kv = 5; + // watch_id is the user-provided watch ID (0 for server-assigned) + int64 watch_id = 6; +} + +message WatchCancelRequest { + // watch_id is the watch ID to cancel + int64 watch_id = 1; +} + +message WatchProgressRequest {} + +message WatchResponse { + ResponseHeader header = 1; + // watch_id is the watch ID for this response + int64 watch_id = 2; + // created is set to true if this response is for a create request + bool created = 3; + // canceled is set to true if the watch was canceled + bool canceled = 4; + // compact_revision is the minimum revision the watcher may receive + int64 compact_revision = 5; + // cancel_reason indicates the reason for cancellation + string cancel_reason = 6; + // events is the list of events in this response + repeated Event events = 11; +} + +message Event { 
+ enum EventType { + PUT = 0; + DELETE = 1; + } + // type is the kind of event + EventType type = 1; + // kv is the KeyValue affected by the event + KeyValue kv = 2; + // prev_kv is the KeyValue prior to the event + KeyValue prev_kv = 3; +} + +// ========== Cluster Management ========== + +message Member { + // ID is the member ID + uint64 id = 1; + // name is the human-readable name + string name = 2; + // peer_urls are URLs for Raft communication + repeated string peer_urls = 3; + // client_urls are URLs for client communication + repeated string client_urls = 4; + // is_learner indicates if member is a learner + bool is_learner = 5; +} + +message MemberAddRequest { + // peer_urls are the URLs to reach the new member + repeated string peer_urls = 1; + // is_learner indicates if the member is a learner + bool is_learner = 2; +} + +message MemberAddResponse { + ResponseHeader header = 1; + // member is the member information for the added member + Member member = 2; + // members is the list of all members after adding + repeated Member members = 3; +} + +message MemberRemoveRequest { + // ID is the member ID to remove + uint64 id = 1; +} + +message MemberRemoveResponse { + ResponseHeader header = 1; + // members is the list of all members after removing + repeated Member members = 2; +} + +message MemberListRequest {} + +message MemberListResponse { + ResponseHeader header = 1; + // members is the list of all members + repeated Member members = 2; +} + +message StatusRequest {} + +message StatusResponse { + ResponseHeader header = 1; + // version is the version of the server + string version = 2; + // db_size is the size of the database + int64 db_size = 3; + // leader is the member ID of the current leader + uint64 leader = 4; + // raft_index is the current Raft committed index + uint64 raft_index = 5; + // raft_term is the current Raft term + uint64 raft_term = 6; + // raft_applied_index is the current Raft applied index + uint64 raft_applied_index = 7; +} + +// 
========== Lease ========== + +message LeaseGrantRequest { + // TTL is the advisory time-to-live in seconds + int64 ttl = 1; + // ID is the requested lease ID. If 0, the server will choose an ID. + int64 id = 2; +} + +message LeaseGrantResponse { + ResponseHeader header = 1; + // ID is the lease ID for the granted lease + int64 id = 2; + // TTL is the actual TTL granted by the server + int64 ttl = 3; + // error is any error that occurred + string error = 4; +} + +message LeaseRevokeRequest { + // ID is the lease ID to revoke + int64 id = 1; +} + +message LeaseRevokeResponse { + ResponseHeader header = 1; +} + +message LeaseKeepAliveRequest { + // ID is the lease ID to keep alive + int64 id = 1; +} + +message LeaseKeepAliveResponse { + ResponseHeader header = 1; + // ID is the lease ID from the keep-alive request + int64 id = 2; + // TTL is the new TTL for the lease + int64 ttl = 3; +} + +message LeaseTimeToLiveRequest { + // ID is the lease ID to query + int64 id = 1; + // keys is true to query all keys attached to this lease + bool keys = 2; +} + +message LeaseTimeToLiveResponse { + ResponseHeader header = 1; + // ID is the lease ID + int64 id = 2; + // TTL is the remaining TTL in seconds; -1 if lease doesn't exist + int64 ttl = 3; + // grantedTTL is the initial TTL granted + int64 granted_ttl = 4; + // keys is the list of keys attached to this lease + repeated bytes keys = 5; +} + +message LeaseLeasesRequest {} + +message LeaseLeasesResponse { + ResponseHeader header = 1; + // leases is the list of all leases + repeated LeaseStatus leases = 2; +} + +message LeaseStatus { + // ID is the lease ID + int64 id = 1; +} diff --git a/chainfire/proto/internal.proto b/chainfire/proto/internal.proto new file mode 100644 index 0000000..3b48480 --- /dev/null +++ b/chainfire/proto/internal.proto @@ -0,0 +1,93 @@ +syntax = "proto3"; + +package chainfire.internal; + +// Internal Raft RPC service for node-to-node communication +service RaftService { + // Vote requests a vote from 
a peer + rpc Vote(VoteRequest) returns (VoteResponse); + + // AppendEntries sends log entries to followers + rpc AppendEntries(AppendEntriesRequest) returns (AppendEntriesResponse); + + // InstallSnapshot sends a snapshot to a follower + rpc InstallSnapshot(stream InstallSnapshotRequest) returns (InstallSnapshotResponse); +} + +message VoteRequest { + // term is the candidate's term + uint64 term = 1; + // candidate_id is the candidate requesting the vote + uint64 candidate_id = 2; + // last_log_index is index of candidate's last log entry + uint64 last_log_index = 3; + // last_log_term is term of candidate's last log entry + uint64 last_log_term = 4; +} + +message VoteResponse { + // term is the current term for the voter + uint64 term = 1; + // vote_granted is true if the candidate received the vote + bool vote_granted = 2; + // last_log_id is the voter's last log ID + uint64 last_log_index = 3; + uint64 last_log_term = 4; +} + +message AppendEntriesRequest { + // term is the leader's term + uint64 term = 1; + // leader_id is the leader's ID + uint64 leader_id = 2; + // prev_log_index is index of log entry immediately preceding new ones + uint64 prev_log_index = 3; + // prev_log_term is term of prev_log_index entry + uint64 prev_log_term = 4; + // entries are log entries to append + repeated LogEntry entries = 5; + // leader_commit is leader's commit index + uint64 leader_commit = 6; +} + +message LogEntry { + // index is the log entry index + uint64 index = 1; + // term is the term when entry was received + uint64 term = 2; + // data is the command data + bytes data = 3; +} + +message AppendEntriesResponse { + // term is the current term + uint64 term = 1; + // success is true if follower contained entry matching prevLogIndex + bool success = 2; + // conflict_index is the first conflicting index (for optimization) + uint64 conflict_index = 3; + // conflict_term is the term of the conflicting entry + uint64 conflict_term = 4; +} + +message InstallSnapshotRequest 
{ + // term is the leader's term + uint64 term = 1; + // leader_id is the leader's ID + uint64 leader_id = 2; + // last_included_index is the snapshot replaces all entries up through and including this index + uint64 last_included_index = 3; + // last_included_term is term of last_included_index + uint64 last_included_term = 4; + // offset is byte offset where chunk is positioned in the snapshot file + uint64 offset = 5; + // data is raw bytes of the snapshot chunk + bytes data = 6; + // done is true if this is the last chunk + bool done = 7; +} + +message InstallSnapshotResponse { + // term is the current term + uint64 term = 1; +} diff --git a/flake.nix b/flake.nix index 86b2bc3..0d5b92f 100644 --- a/flake.nix +++ b/flake.nix @@ -31,22 +31,10 @@ inherit system overlays; }; - # Fetch submodule sources with their .git directories included - # This is necessary because chainfire, flaredb, and iam are git submodules - chainfireSrc = builtins.fetchGit { - url = ./chainfire; - submodules = true; - }; - - flaredbSrc = builtins.fetchGit { - url = ./flaredb; - submodules = true; - }; - - iamSrc = builtins.fetchGit { - url = ./iam; - submodules = true; - }; + # Local workspace sources (regular directories, not submodules) + chainfireSrc = ./chainfire; + flaredbSrc = ./flaredb; + iamSrc = ./iam; # Rust toolchain configuration # Using stable channel with rust-src (for rust-analyzer) and rust-analyzer diff --git a/flaredb b/flaredb deleted file mode 160000 index 69908ec..0000000 --- a/flaredb +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 69908ec0d2fcfda290719ce129e84b4c56afc91c diff --git a/flaredb/.gitignore b/flaredb/.gitignore new file mode 100644 index 0000000..d42a123 --- /dev/null +++ b/flaredb/.gitignore @@ -0,0 +1,18 @@ +# Generated by speckit +target/ +debug/ +release/ +.codex/ +.cursor/ +AGENTS.md +**/*.rs.bk +*.rlib +*.prof* +.idea/ +*.log +.env* +.DS_Store +Thumbs.db +*.tmp +*.swp +.vscode/ diff --git a/flaredb/.specify/memory/constitution.md 
b/flaredb/.specify/memory/constitution.md new file mode 100644 index 0000000..da33a92 --- /dev/null +++ b/flaredb/.specify/memory/constitution.md @@ -0,0 +1,41 @@ +# FlareDB Feature Constitution + +## Core Principles + +### I. Test-First (NON-NEGOTIABLE) +- Write tests before implementation for new functionality. +- Follow Red-Green-Refactor; do not merge untested code. +- All critical paths require unit tests; integration tests required when services/protocols change. + +### II. Reliability & Coverage +- CI must run `cargo test` (or equivalent) for all touched crates. +- Integration verification must cover cross-service interactions when contracts change. +- Regressions on previously passing tests are not acceptable. + +### III. Simplicity & Readability +- Prefer standard crates over bespoke solutions; avoid unnecessary complexity (YAGNI). +- Code must be self-explanatory; add concise comments only for non-obvious logic. +- Keep APIs minimal and coherent; avoid naming drift. + +### IV. Observability +- Services must log structured, human-readable errors; fatal errors exit non-zero. +- gRPC/CLI surfaces should emit actionable diagnostics on failure. + +### V. Versioning & Compatibility +- Protocol and API changes must call out compatibility impact; breaking changes require explicit agreement. +- Generated artifacts must be reproducible (lockfiles or pinned versions where applicable). + +## Additional Constraints +- Technology stack: Rust stable, gRPC via tonic/prost, RocksDB for storage, tokio runtime. +- Nix flake is the canonical dev environment; commands should respect it when present. + +## Development Workflow +- Tests before code; integration tests when touching contracts or cross-service logic. +- Code review (human or designated process) must confirm constitution compliance. +- Complexity must be justified; large changes should be broken down into tasks aligned with user stories. 
+ +## Governance +- This constitution supersedes other practices for this feature; conflicts must be resolved by adjusting spec/plan/tasks, not by ignoring principles. +- Amendments require an explicit update to this document with rationale and date. + +**Version**: 1.0.0 | **Ratified**: 2025-11-30 | **Last Amended**: 2025-11-30 diff --git a/flaredb/.specify/scripts/bash/check-prerequisites.sh b/flaredb/.specify/scripts/bash/check-prerequisites.sh new file mode 100755 index 0000000..98e387c --- /dev/null +++ b/flaredb/.specify/scripts/bash/check-prerequisites.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash + +# Consolidated prerequisite checking script +# +# This script provides unified prerequisite checking for Spec-Driven Development workflow. +# It replaces the functionality previously spread across multiple scripts. +# +# Usage: ./check-prerequisites.sh [OPTIONS] +# +# OPTIONS: +# --json Output in JSON format +# --require-tasks Require tasks.md to exist (for implementation phase) +# --include-tasks Include tasks.md in AVAILABLE_DOCS list +# --paths-only Only output path variables (no validation) +# --help, -h Show help message +# +# OUTPUTS: +# JSON mode: {"FEATURE_DIR":"...", "AVAILABLE_DOCS":["..."]} +# Text mode: FEATURE_DIR:... \n AVAILABLE_DOCS: \n ✓/✗ file.md +# Paths only: REPO_ROOT: ... \n BRANCH: ... \n FEATURE_DIR: ... etc. + +set -e + +# Parse command line arguments +JSON_MODE=false +REQUIRE_TASKS=false +INCLUDE_TASKS=false +PATHS_ONLY=false + +for arg in "$@"; do + case "$arg" in + --json) + JSON_MODE=true + ;; + --require-tasks) + REQUIRE_TASKS=true + ;; + --include-tasks) + INCLUDE_TASKS=true + ;; + --paths-only) + PATHS_ONLY=true + ;; + --help|-h) + cat << 'EOF' +Usage: check-prerequisites.sh [OPTIONS] + +Consolidated prerequisite checking for Spec-Driven Development workflow. 
+ +OPTIONS: + --json Output in JSON format + --require-tasks Require tasks.md to exist (for implementation phase) + --include-tasks Include tasks.md in AVAILABLE_DOCS list + --paths-only Only output path variables (no prerequisite validation) + --help, -h Show this help message + +EXAMPLES: + # Check task prerequisites (plan.md required) + ./check-prerequisites.sh --json + + # Check implementation prerequisites (plan.md + tasks.md required) + ./check-prerequisites.sh --json --require-tasks --include-tasks + + # Get feature paths only (no validation) + ./check-prerequisites.sh --paths-only + +EOF + exit 0 + ;; + *) + echo "ERROR: Unknown option '$arg'. Use --help for usage information." >&2 + exit 1 + ;; + esac +done + +# Source common functions +SCRIPT_DIR="$(CDPATH="" cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/common.sh" + +# Get feature paths and validate branch +eval $(get_feature_paths) +check_feature_branch "$CURRENT_BRANCH" "$HAS_GIT" || exit 1 + +# If paths-only mode, output paths and exit (support JSON + paths-only combined) +if $PATHS_ONLY; then + if $JSON_MODE; then + # Minimal JSON paths payload (no validation performed) + printf '{"REPO_ROOT":"%s","BRANCH":"%s","FEATURE_DIR":"%s","FEATURE_SPEC":"%s","IMPL_PLAN":"%s","TASKS":"%s"}\n' \ + "$REPO_ROOT" "$CURRENT_BRANCH" "$FEATURE_DIR" "$FEATURE_SPEC" "$IMPL_PLAN" "$TASKS" + else + echo "REPO_ROOT: $REPO_ROOT" + echo "BRANCH: $CURRENT_BRANCH" + echo "FEATURE_DIR: $FEATURE_DIR" + echo "FEATURE_SPEC: $FEATURE_SPEC" + echo "IMPL_PLAN: $IMPL_PLAN" + echo "TASKS: $TASKS" + fi + exit 0 +fi + +# Validate required directories and files +if [[ ! -d "$FEATURE_DIR" ]]; then + echo "ERROR: Feature directory not found: $FEATURE_DIR" >&2 + echo "Run /speckit.specify first to create the feature structure." >&2 + exit 1 +fi + +if [[ ! -f "$IMPL_PLAN" ]]; then + echo "ERROR: plan.md not found in $FEATURE_DIR" >&2 + echo "Run /speckit.plan first to create the implementation plan." 
>&2 + exit 1 +fi + +# Check for tasks.md if required +if $REQUIRE_TASKS && [[ ! -f "$TASKS" ]]; then + echo "ERROR: tasks.md not found in $FEATURE_DIR" >&2 + echo "Run /speckit.tasks first to create the task list." >&2 + exit 1 +fi + +# Build list of available documents +docs=() + +# Always check these optional docs +[[ -f "$RESEARCH" ]] && docs+=("research.md") +[[ -f "$DATA_MODEL" ]] && docs+=("data-model.md") + +# Check contracts directory (only if it exists and has files) +if [[ -d "$CONTRACTS_DIR" ]] && [[ -n "$(ls -A "$CONTRACTS_DIR" 2>/dev/null)" ]]; then + docs+=("contracts/") +fi + +[[ -f "$QUICKSTART" ]] && docs+=("quickstart.md") + +# Include tasks.md if requested and it exists +if $INCLUDE_TASKS && [[ -f "$TASKS" ]]; then + docs+=("tasks.md") +fi + +# Output results +if $JSON_MODE; then + # Build JSON array of documents + if [[ ${#docs[@]} -eq 0 ]]; then + json_docs="[]" + else + json_docs=$(printf '"%s",' "${docs[@]}") + json_docs="[${json_docs%,}]" + fi + + printf '{"FEATURE_DIR":"%s","AVAILABLE_DOCS":%s}\n' "$FEATURE_DIR" "$json_docs" +else + # Text output + echo "FEATURE_DIR:$FEATURE_DIR" + echo "AVAILABLE_DOCS:" + + # Show status of each potential document + check_file "$RESEARCH" "research.md" + check_file "$DATA_MODEL" "data-model.md" + check_dir "$CONTRACTS_DIR" "contracts/" + check_file "$QUICKSTART" "quickstart.md" + + if $INCLUDE_TASKS; then + check_file "$TASKS" "tasks.md" + fi +fi diff --git a/flaredb/.specify/scripts/bash/common.sh b/flaredb/.specify/scripts/bash/common.sh new file mode 100755 index 0000000..2c3165e --- /dev/null +++ b/flaredb/.specify/scripts/bash/common.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# Common functions and variables for all scripts + +# Get repository root, with fallback for non-git repositories +get_repo_root() { + if git rev-parse --show-toplevel >/dev/null 2>&1; then + git rev-parse --show-toplevel + else + # Fall back to script location for non-git repos + local script_dir="$(CDPATH="" cd "$(dirname 
"${BASH_SOURCE[0]}")" && pwd)" + (cd "$script_dir/../../.." && pwd) + fi +} + +# Get current branch, with fallback for non-git repositories +get_current_branch() { + # First check if SPECIFY_FEATURE environment variable is set + if [[ -n "${SPECIFY_FEATURE:-}" ]]; then + echo "$SPECIFY_FEATURE" + return + fi + + # Then check git if available + if git rev-parse --abbrev-ref HEAD >/dev/null 2>&1; then + git rev-parse --abbrev-ref HEAD + return + fi + + # For non-git repos, try to find the latest feature directory + local repo_root=$(get_repo_root) + local specs_dir="$repo_root/specs" + + if [[ -d "$specs_dir" ]]; then + local latest_feature="" + local highest=0 + + for dir in "$specs_dir"/*; do + if [[ -d "$dir" ]]; then + local dirname=$(basename "$dir") + if [[ "$dirname" =~ ^([0-9]{3})- ]]; then + local number=${BASH_REMATCH[1]} + number=$((10#$number)) + if [[ "$number" -gt "$highest" ]]; then + highest=$number + latest_feature=$dirname + fi + fi + fi + done + + if [[ -n "$latest_feature" ]]; then + echo "$latest_feature" + return + fi + fi + + echo "main" # Final fallback +} + +# Check if we have git available +has_git() { + git rev-parse --show-toplevel >/dev/null 2>&1 +} + +check_feature_branch() { + local branch="$1" + local has_git_repo="$2" + + # For non-git repos, we can't enforce branch naming but still provide output + if [[ "$has_git_repo" != "true" ]]; then + echo "[specify] Warning: Git repository not detected; skipped branch validation" >&2 + return 0 + fi + + if [[ ! "$branch" =~ ^[0-9]{3}- ]]; then + echo "ERROR: Not on a feature branch. 
Current branch: $branch" >&2 + echo "Feature branches should be named like: 001-feature-name" >&2 + return 1 + fi + + return 0 +} + +get_feature_dir() { echo "$1/specs/$2"; } + +# Find feature directory by numeric prefix instead of exact branch match +# This allows multiple branches to work on the same spec (e.g., 004-fix-bug, 004-add-feature) +find_feature_dir_by_prefix() { + local repo_root="$1" + local branch_name="$2" + local specs_dir="$repo_root/specs" + + # Extract numeric prefix from branch (e.g., "004" from "004-whatever") + if [[ ! "$branch_name" =~ ^([0-9]{3})- ]]; then + # If branch doesn't have numeric prefix, fall back to exact match + echo "$specs_dir/$branch_name" + return + fi + + local prefix="${BASH_REMATCH[1]}" + + # Search for directories in specs/ that start with this prefix + local matches=() + if [[ -d "$specs_dir" ]]; then + for dir in "$specs_dir"/"$prefix"-*; do + if [[ -d "$dir" ]]; then + matches+=("$(basename "$dir")") + fi + done + fi + + # Handle results + if [[ ${#matches[@]} -eq 0 ]]; then + # No match found - return the branch name path (will fail later with clear error) + echo "$specs_dir/$branch_name" + elif [[ ${#matches[@]} -eq 1 ]]; then + # Exactly one match - perfect! + echo "$specs_dir/${matches[0]}" + else + # Multiple matches - this shouldn't happen with proper naming convention + echo "ERROR: Multiple spec directories found with prefix '$prefix': ${matches[*]}" >&2 + echo "Please ensure only one spec directory exists per numeric prefix." 
>&2 + echo "$specs_dir/$branch_name" # Return something to avoid breaking the script + fi +} + +get_feature_paths() { + local repo_root=$(get_repo_root) + local current_branch=$(get_current_branch) + local has_git_repo="false" + + if has_git; then + has_git_repo="true" + fi + + # Use prefix-based lookup to support multiple branches per spec + local feature_dir=$(find_feature_dir_by_prefix "$repo_root" "$current_branch") + + cat </dev/null) ]] && echo " ✓ $2" || echo " ✗ $2"; } + diff --git a/flaredb/.specify/scripts/bash/create-new-feature.sh b/flaredb/.specify/scripts/bash/create-new-feature.sh new file mode 100755 index 0000000..592dab2 --- /dev/null +++ b/flaredb/.specify/scripts/bash/create-new-feature.sh @@ -0,0 +1,305 @@ +#!/usr/bin/env bash + +set -e + +JSON_MODE=false +SHORT_NAME="" +BRANCH_NUMBER="" +ARGS=() +i=1 +while [ $i -le $# ]; do + arg="${!i}" + case "$arg" in + --json) + JSON_MODE=true + ;; + --short-name) + if [ $((i + 1)) -gt $# ]; then + echo 'Error: --short-name requires a value' >&2 + exit 1 + fi + i=$((i + 1)) + next_arg="${!i}" + # Check if the next argument is another option (starts with --) + if [[ "$next_arg" == --* ]]; then + echo 'Error: --short-name requires a value' >&2 + exit 1 + fi + SHORT_NAME="$next_arg" + ;; + --number) + if [ $((i + 1)) -gt $# ]; then + echo 'Error: --number requires a value' >&2 + exit 1 + fi + i=$((i + 1)) + next_arg="${!i}" + if [[ "$next_arg" == --* ]]; then + echo 'Error: --number requires a value' >&2 + exit 1 + fi + BRANCH_NUMBER="$next_arg" + ;; + --help|-h) + echo "Usage: $0 [--json] [--short-name ] [--number N] " + echo "" + echo "Options:" + echo " --json Output in JSON format" + echo " --short-name Provide a custom short name (2-4 words) for the branch" + echo " --number N Specify branch number manually (overrides auto-detection)" + echo " --help, -h Show this help message" + echo "" + echo "Examples:" + echo " $0 'Add user authentication system' --short-name 'user-auth'" + echo " $0 'Implement 
OAuth2 integration for API' --number 5" + exit 0 + ;; + *) + ARGS+=("$arg") + ;; + esac + i=$((i + 1)) +done + +FEATURE_DESCRIPTION="${ARGS[*]}" +if [ -z "$FEATURE_DESCRIPTION" ]; then + echo "Usage: $0 [--json] [--short-name ] [--number N] " >&2 + exit 1 +fi + +# Function to find the repository root by searching for existing project markers +find_repo_root() { + local dir="$1" + while [ "$dir" != "/" ]; do + if [ -d "$dir/.git" ] || [ -d "$dir/.specify" ]; then + echo "$dir" + return 0 + fi + dir="$(dirname "$dir")" + done + return 1 +} + +# Function to get highest number from specs directory +get_highest_from_specs() { + local specs_dir="$1" + local highest=0 + + if [ -d "$specs_dir" ]; then + for dir in "$specs_dir"/*; do + [ -d "$dir" ] || continue + dirname=$(basename "$dir") + number=$(echo "$dirname" | grep -o '^[0-9]\+' || echo "0") + number=$((10#$number)) + if [ "$number" -gt "$highest" ]; then + highest=$number + fi + done + fi + + echo "$highest" +} + +# Function to get highest number from git branches +get_highest_from_branches() { + local highest=0 + + # Get all branches (local and remote) + branches=$(git branch -a 2>/dev/null || echo "") + + if [ -n "$branches" ]; then + while IFS= read -r branch; do + # Clean branch name: remove leading markers and remote prefixes + clean_branch=$(echo "$branch" | sed 's/^[* ]*//; s|^remotes/[^/]*/||') + + # Extract feature number if branch matches pattern ###-* + if echo "$clean_branch" | grep -q '^[0-9]\{3\}-'; then + number=$(echo "$clean_branch" | grep -o '^[0-9]\{3\}' || echo "0") + number=$((10#$number)) + if [ "$number" -gt "$highest" ]; then + highest=$number + fi + fi + done <<< "$branches" + fi + + echo "$highest" +} + +# Function to check existing branches (local and remote) and return next available number +check_existing_branches() { + local short_name="$1" + local specs_dir="$2" + + # Fetch all remotes to get latest branch info (suppress errors if no remotes) + git fetch --all --prune 2>/dev/null || 
true + + # Find all branches matching the pattern using git ls-remote (more reliable) + local remote_branches=$(git ls-remote --heads origin 2>/dev/null | grep -E "refs/heads/[0-9]+-${short_name}$" | sed 's/.*\/\([0-9]*\)-.*/\1/' | sort -n) + + # Also check local branches + local local_branches=$(git branch 2>/dev/null | grep -E "^[* ]*[0-9]+-${short_name}$" | sed 's/^[* ]*//' | sed 's/-.*//' | sort -n) + + # Check specs directory as well + local spec_dirs="" + if [ -d "$specs_dir" ]; then + spec_dirs=$(find "$specs_dir" -maxdepth 1 -type d -name "[0-9]*-${short_name}" 2>/dev/null | xargs -n1 basename 2>/dev/null | sed 's/-.*//' | sort -n) + fi + + # Combine all sources and get the highest number + local max_num=0 + for num in $remote_branches $local_branches $spec_dirs; do + if [ "$num" -gt "$max_num" ]; then + max_num=$num + fi + done + + # Return next number + echo $((max_num + 1)) +} + +# Function to clean and format a branch name +clean_branch_name() { + local name="$1" + echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/-\+/-/g' | sed 's/^-//' | sed 's/-$//' +} + +# Resolve repository root. Prefer git information when available, but fall back +# to searching for repository markers so the workflow still functions in repositories that +# were initialised with --no-git. +SCRIPT_DIR="$(CDPATH="" cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if git rev-parse --show-toplevel >/dev/null 2>&1; then + REPO_ROOT=$(git rev-parse --show-toplevel) + HAS_GIT=true +else + REPO_ROOT="$(find_repo_root "$SCRIPT_DIR")" + if [ -z "$REPO_ROOT" ]; then + echo "Error: Could not determine repository root. Please run this script from within the repository." 
>&2 + exit 1 + fi + HAS_GIT=false +fi + +cd "$REPO_ROOT" + +SPECS_DIR="$REPO_ROOT/specs" +mkdir -p "$SPECS_DIR" + +# Function to generate branch name with stop word filtering and length filtering +generate_branch_name() { + local description="$1" + + # Common stop words to filter out + local stop_words="^(i|a|an|the|to|for|of|in|on|at|by|with|from|is|are|was|were|be|been|being|have|has|had|do|does|did|will|would|should|could|can|may|might|must|shall|this|that|these|those|my|your|our|their|want|need|add|get|set)$" + + # Convert to lowercase and split into words + local clean_name=$(echo "$description" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/ /g') + + # Filter words: remove stop words and words shorter than 3 chars (unless they're uppercase acronyms in original) + local meaningful_words=() + for word in $clean_name; do + # Skip empty words + [ -z "$word" ] && continue + + # Keep words that are NOT stop words AND (length >= 3 OR are potential acronyms) + if ! echo "$word" | grep -qiE "$stop_words"; then + if [ ${#word} -ge 3 ]; then + meaningful_words+=("$word") + elif echo "$description" | grep -q "\b${word^^}\b"; then + # Keep short words if they appear as uppercase in original (likely acronyms) + meaningful_words+=("$word") + fi + fi + done + + # If we have meaningful words, use first 3-4 of them + if [ ${#meaningful_words[@]} -gt 0 ]; then + local max_words=3 + if [ ${#meaningful_words[@]} -eq 4 ]; then max_words=4; fi + + local result="" + local count=0 + for word in "${meaningful_words[@]}"; do + if [ $count -ge $max_words ]; then break; fi + if [ -n "$result" ]; then result="$result-"; fi + result="$result$word" + count=$((count + 1)) + done + echo "$result" + else + # Fallback to original logic if no meaningful words found + local cleaned=$(clean_branch_name "$description") + echo "$cleaned" | tr '-' '\n' | grep -v '^$' | head -3 | tr '\n' '-' | sed 's/-$//' + fi +} + +# Generate branch name +if [ -n "$SHORT_NAME" ]; then + # Use provided short name, 
just clean it up + BRANCH_SUFFIX=$(clean_branch_name "$SHORT_NAME") +else + # Generate from description with smart filtering + BRANCH_SUFFIX=$(generate_branch_name "$FEATURE_DESCRIPTION") +fi + +# Determine branch number +if [ -z "$BRANCH_NUMBER" ]; then + if [ "$HAS_GIT" = true ]; then + # Check existing branches on remotes + BRANCH_NUMBER=$(check_existing_branches "$BRANCH_SUFFIX" "$SPECS_DIR") + else + # Fall back to local directory check + HIGHEST=$(get_highest_from_specs "$SPECS_DIR") + BRANCH_NUMBER=$((HIGHEST + 1)) + fi +fi + +FEATURE_NUM=$(printf "%03d" "$BRANCH_NUMBER") +BRANCH_NAME="${FEATURE_NUM}-${BRANCH_SUFFIX}" + +# GitHub enforces a 244-byte limit on branch names +# Validate and truncate if necessary +MAX_BRANCH_LENGTH=244 +if [ ${#BRANCH_NAME} -gt $MAX_BRANCH_LENGTH ]; then + # Calculate how much we need to trim from suffix + # Account for: feature number (3) + hyphen (1) = 4 chars + MAX_SUFFIX_LENGTH=$((MAX_BRANCH_LENGTH - 4)) + + # Truncate suffix at word boundary if possible + TRUNCATED_SUFFIX=$(echo "$BRANCH_SUFFIX" | cut -c1-$MAX_SUFFIX_LENGTH) + # Remove trailing hyphen if truncation created one + TRUNCATED_SUFFIX=$(echo "$TRUNCATED_SUFFIX" | sed 's/-$//') + + ORIGINAL_BRANCH_NAME="$BRANCH_NAME" + BRANCH_NAME="${FEATURE_NUM}-${TRUNCATED_SUFFIX}" + + >&2 echo "[specify] Warning: Branch name exceeded GitHub's 244-byte limit" + >&2 echo "[specify] Original: $ORIGINAL_BRANCH_NAME (${#ORIGINAL_BRANCH_NAME} bytes)" + >&2 echo "[specify] Truncated to: $BRANCH_NAME (${#BRANCH_NAME} bytes)" +fi + +if [ "$HAS_GIT" = true ]; then + git checkout -b "$BRANCH_NAME" +else + >&2 echo "[specify] Warning: Git repository not detected; skipped branch creation for $BRANCH_NAME" +fi + +FEATURE_DIR="$SPECS_DIR/$BRANCH_NAME" +mkdir -p "$FEATURE_DIR" + +TEMPLATE="$REPO_ROOT/.specify/templates/spec-template.md" +SPEC_FILE="$FEATURE_DIR/spec.md" +if [ -f "$TEMPLATE" ]; then cp "$TEMPLATE" "$SPEC_FILE"; else touch "$SPEC_FILE"; fi + +# Set the SPECIFY_FEATURE environment 
variable for the current session +export SPECIFY_FEATURE="$BRANCH_NAME" + +if $JSON_MODE; then + printf '{"BRANCH_NAME":"%s","SPEC_FILE":"%s","FEATURE_NUM":"%s"}\n' "$BRANCH_NAME" "$SPEC_FILE" "$FEATURE_NUM" +else + echo "BRANCH_NAME: $BRANCH_NAME" + echo "SPEC_FILE: $SPEC_FILE" + echo "FEATURE_NUM: $FEATURE_NUM" + echo "SPECIFY_FEATURE environment variable set to: $BRANCH_NAME" +fi diff --git a/flaredb/.specify/scripts/bash/setup-plan.sh b/flaredb/.specify/scripts/bash/setup-plan.sh new file mode 100755 index 0000000..d01c6d6 --- /dev/null +++ b/flaredb/.specify/scripts/bash/setup-plan.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +set -e + +# Parse command line arguments +JSON_MODE=false +ARGS=() + +for arg in "$@"; do + case "$arg" in + --json) + JSON_MODE=true + ;; + --help|-h) + echo "Usage: $0 [--json]" + echo " --json Output results in JSON format" + echo " --help Show this help message" + exit 0 + ;; + *) + ARGS+=("$arg") + ;; + esac +done + +# Get script directory and load common functions +SCRIPT_DIR="$(CDPATH="" cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/common.sh" + +# Get all paths and variables from common functions +eval $(get_feature_paths) + +# Check if we're on a proper feature branch (only for git repos) +check_feature_branch "$CURRENT_BRANCH" "$HAS_GIT" || exit 1 + +# Ensure the feature directory exists +mkdir -p "$FEATURE_DIR" + +# Copy plan template if it exists +TEMPLATE="$REPO_ROOT/.specify/templates/plan-template.md" +if [[ -f "$TEMPLATE" ]]; then + cp "$TEMPLATE" "$IMPL_PLAN" + echo "Copied plan template to $IMPL_PLAN" +else + echo "Warning: Plan template not found at $TEMPLATE" + # Create a basic plan file if template doesn't exist + touch "$IMPL_PLAN" +fi + +# Output results +if $JSON_MODE; then + printf '{"FEATURE_SPEC":"%s","IMPL_PLAN":"%s","SPECS_DIR":"%s","BRANCH":"%s","HAS_GIT":"%s"}\n' \ + "$FEATURE_SPEC" "$IMPL_PLAN" "$FEATURE_DIR" "$CURRENT_BRANCH" "$HAS_GIT" +else + echo "FEATURE_SPEC: $FEATURE_SPEC" + echo 
"IMPL_PLAN: $IMPL_PLAN" + echo "SPECS_DIR: $FEATURE_DIR" + echo "BRANCH: $CURRENT_BRANCH" + echo "HAS_GIT: $HAS_GIT" +fi + diff --git a/flaredb/.specify/scripts/bash/update-agent-context.sh b/flaredb/.specify/scripts/bash/update-agent-context.sh new file mode 100755 index 0000000..ff40c19 --- /dev/null +++ b/flaredb/.specify/scripts/bash/update-agent-context.sh @@ -0,0 +1,790 @@ +#!/usr/bin/env bash + +# Update agent context files with information from plan.md +# +# This script maintains AI agent context files by parsing feature specifications +# and updating agent-specific configuration files with project information. +# +# MAIN FUNCTIONS: +# 1. Environment Validation +# - Verifies git repository structure and branch information +# - Checks for required plan.md files and templates +# - Validates file permissions and accessibility +# +# 2. Plan Data Extraction +# - Parses plan.md files to extract project metadata +# - Identifies language/version, frameworks, databases, and project types +# - Handles missing or incomplete specification data gracefully +# +# 3. Agent File Management +# - Creates new agent context files from templates when needed +# - Updates existing agent files with new project information +# - Preserves manual additions and custom configurations +# - Supports multiple AI agent formats and directory structures +# +# 4. Content Generation +# - Generates language-specific build/test commands +# - Creates appropriate project directory structures +# - Updates technology stacks and recent changes sections +# - Maintains consistent formatting and timestamps +# +# 5. 
Multi-Agent Support +# - Handles agent-specific file paths and naming conventions +# - Supports: Claude, Gemini, Copilot, Cursor, Qwen, opencode, Codex, Windsurf, Kilo Code, Auggie CLI, Roo Code, CodeBuddy CLI, Amp, SHAI, or Amazon Q Developer CLI +# - Can update single agents or all existing agent files +# - Creates default Claude file if no agent files exist +# +# Usage: ./update-agent-context.sh [agent_type] +# Agent types: claude|gemini|copilot|cursor-agent|qwen|opencode|codex|windsurf|kilocode|auggie|shai|q|bob +# Leave empty to update all existing agent files + +set -e + +# Enable strict error handling +set -u +set -o pipefail + +#============================================================================== +# Configuration and Global Variables +#============================================================================== + +# Get script directory and load common functions +SCRIPT_DIR="$(CDPATH="" cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/common.sh" + +# Get all paths and variables from common functions +eval $(get_feature_paths) + +NEW_PLAN="$IMPL_PLAN" # Alias for compatibility with existing code +AGENT_TYPE="${1:-}" + +# Agent-specific file paths +CLAUDE_FILE="$REPO_ROOT/CLAUDE.md" +GEMINI_FILE="$REPO_ROOT/GEMINI.md" +COPILOT_FILE="$REPO_ROOT/.github/agents/copilot-instructions.md" +CURSOR_FILE="$REPO_ROOT/.cursor/rules/specify-rules.mdc" +QWEN_FILE="$REPO_ROOT/QWEN.md" +AGENTS_FILE="$REPO_ROOT/AGENTS.md" +WINDSURF_FILE="$REPO_ROOT/.windsurf/rules/specify-rules.md" +KILOCODE_FILE="$REPO_ROOT/.kilocode/rules/specify-rules.md" +AUGGIE_FILE="$REPO_ROOT/.augment/rules/specify-rules.md" +ROO_FILE="$REPO_ROOT/.roo/rules/specify-rules.md" +CODEBUDDY_FILE="$REPO_ROOT/CODEBUDDY.md" +AMP_FILE="$REPO_ROOT/AGENTS.md" +SHAI_FILE="$REPO_ROOT/SHAI.md" +Q_FILE="$REPO_ROOT/AGENTS.md" +BOB_FILE="$REPO_ROOT/AGENTS.md" + +# Template file +TEMPLATE_FILE="$REPO_ROOT/.specify/templates/agent-file-template.md" + +# Global variables for parsed plan data 
+NEW_LANG="" +NEW_FRAMEWORK="" +NEW_DB="" +NEW_PROJECT_TYPE="" + +#============================================================================== +# Utility Functions +#============================================================================== + +log_info() { + echo "INFO: $1" +} + +log_success() { + echo "✓ $1" +} + +log_error() { + echo "ERROR: $1" >&2 +} + +log_warning() { + echo "WARNING: $1" >&2 +} + +# Cleanup function for temporary files +cleanup() { + local exit_code=$? + rm -f /tmp/agent_update_*_$$ + rm -f /tmp/manual_additions_$$ + exit $exit_code +} + +# Set up cleanup trap +trap cleanup EXIT INT TERM + +#============================================================================== +# Validation Functions +#============================================================================== + +validate_environment() { + # Check if we have a current branch/feature (git or non-git) + if [[ -z "$CURRENT_BRANCH" ]]; then + log_error "Unable to determine current feature" + if [[ "$HAS_GIT" == "true" ]]; then + log_info "Make sure you're on a feature branch" + else + log_info "Set SPECIFY_FEATURE environment variable or create a feature first" + fi + exit 1 + fi + + # Check if plan.md exists + if [[ ! -f "$NEW_PLAN" ]]; then + log_error "No plan.md found at $NEW_PLAN" + log_info "Make sure you're working on a feature with a corresponding spec directory" + if [[ "$HAS_GIT" != "true" ]]; then + log_info "Use: export SPECIFY_FEATURE=your-feature-name or create a new feature first" + fi + exit 1 + fi + + # Check if template exists (needed for new files) + if [[ ! 
-f "$TEMPLATE_FILE" ]]; then + log_warning "Template file not found at $TEMPLATE_FILE" + log_warning "Creating new agent files will fail" + fi +} + +#============================================================================== +# Plan Parsing Functions +#============================================================================== + +extract_plan_field() { + local field_pattern="$1" + local plan_file="$2" + + grep "^\*\*${field_pattern}\*\*: " "$plan_file" 2>/dev/null | \ + head -1 | \ + sed "s|^\*\*${field_pattern}\*\*: ||" | \ + sed 's/^[ \t]*//;s/[ \t]*$//' | \ + grep -v "NEEDS CLARIFICATION" | \ + grep -v "^N/A$" || echo "" +} + +parse_plan_data() { + local plan_file="$1" + + if [[ ! -f "$plan_file" ]]; then + log_error "Plan file not found: $plan_file" + return 1 + fi + + if [[ ! -r "$plan_file" ]]; then + log_error "Plan file is not readable: $plan_file" + return 1 + fi + + log_info "Parsing plan data from $plan_file" + + NEW_LANG=$(extract_plan_field "Language/Version" "$plan_file") + NEW_FRAMEWORK=$(extract_plan_field "Primary Dependencies" "$plan_file") + NEW_DB=$(extract_plan_field "Storage" "$plan_file") + NEW_PROJECT_TYPE=$(extract_plan_field "Project Type" "$plan_file") + + # Log what we found + if [[ -n "$NEW_LANG" ]]; then + log_info "Found language: $NEW_LANG" + else + log_warning "No language information found in plan" + fi + + if [[ -n "$NEW_FRAMEWORK" ]]; then + log_info "Found framework: $NEW_FRAMEWORK" + fi + + if [[ -n "$NEW_DB" ]] && [[ "$NEW_DB" != "N/A" ]]; then + log_info "Found database: $NEW_DB" + fi + + if [[ -n "$NEW_PROJECT_TYPE" ]]; then + log_info "Found project type: $NEW_PROJECT_TYPE" + fi +} + +format_technology_stack() { + local lang="$1" + local framework="$2" + local parts=() + + # Add non-empty parts + [[ -n "$lang" && "$lang" != "NEEDS CLARIFICATION" ]] && parts+=("$lang") + [[ -n "$framework" && "$framework" != "NEEDS CLARIFICATION" && "$framework" != "N/A" ]] && parts+=("$framework") + + # Join with proper formatting + 
if [[ ${#parts[@]} -eq 0 ]]; then
+        echo ""
+    elif [[ ${#parts[@]} -eq 1 ]]; then
+        echo "${parts[0]}"
+    else
+        # Join multiple parts with " + "
+        local result="${parts[0]}"
+        for ((i=1; i<${#parts[@]}; i++)); do
+            result="$result + ${parts[i]}"
+        done
+        echo "$result"
+    fi
+}
+
+#==============================================================================
+# Template and Content Generation Functions
+#==============================================================================
+
+get_project_structure() {
+    local project_type="$1"
+
+    if [[ "$project_type" == *"web"* ]]; then
+        echo "backend/\\nfrontend/\\ntests/"
+    else
+        echo "src/\\ntests/"
+    fi
+}
+
+get_commands_for_language() {
+    local lang="$1"
+
+    # NOTE: these strings are later used as a sed replacement, where a bare "&"
+    # expands to the whole matched placeholder — every "&" must be escaped as \&.
+    case "$lang" in
+        *"Python"*)
+            echo "cd src \\&\\& pytest \\&\\& ruff check ."
+            ;;
+        *"Rust"*)
+            echo "cargo test \\&\\& cargo clippy"
+            ;;
+        *"JavaScript"*|*"TypeScript"*)
+            echo "npm test \\&\\& npm run lint"
+            ;;
+        *)
+            echo "# Add commands for $lang"
+            ;;
+    esac
+}
+
+get_language_conventions() {
+    local lang="$1"
+    echo "$lang: Follow standard conventions"
+}
+
+# Create a fresh agent context file at $2 from $TEMPLATE_FILE, substituting
+# template placeholders from the parsed plan globals. Returns non-zero on error.
+create_new_agent_file() {
+    local target_file="$1"
+    local temp_file="$2"
+    local project_name="$3"
+    local current_date="$4"
+
+    if [[ ! -f "$TEMPLATE_FILE" ]]; then
+        log_error "Template not found at $TEMPLATE_FILE"
+        return 1
+    fi
+
+    if [[ ! -r "$TEMPLATE_FILE" ]]; then
+        log_error "Template file is not readable: $TEMPLATE_FILE"
+        return 1
+    fi
+
+    log_info "Creating new agent context file from template..."
+
+    if ! cp "$TEMPLATE_FILE" "$temp_file"; then
+        log_error "Failed to copy template file"
+        return 1
+    fi
+
+    # Replace template placeholders
+    local project_structure
+    project_structure=$(get_project_structure "$NEW_PROJECT_TYPE")
+
+    local commands
+    commands=$(get_commands_for_language "$NEW_LANG")
+
+    local language_conventions
+    language_conventions=$(get_language_conventions "$NEW_LANG")
+
+    # Perform substitutions with error checking using safer approach
+    # Escape special characters for sed by using a different delimiter or escaping
+    local escaped_lang=$(printf '%s\n' "$NEW_LANG" | sed 's/[\[\.*^$()+{}|]/\\&/g')
+    local escaped_framework=$(printf '%s\n' "$NEW_FRAMEWORK" | sed 's/[\[\.*^$()+{}|]/\\&/g')
+    local escaped_branch=$(printf '%s\n' "$CURRENT_BRANCH" | sed 's/[\[\.*^$()+{}|]/\\&/g')
+
+    # Build technology stack and recent change strings conditionally
+    local tech_stack
+    if [[ -n "$escaped_lang" && -n "$escaped_framework" ]]; then
+        tech_stack="- $escaped_lang + $escaped_framework ($escaped_branch)"
+    elif [[ -n "$escaped_lang" ]]; then
+        tech_stack="- $escaped_lang ($escaped_branch)"
+    elif [[ -n "$escaped_framework" ]]; then
+        tech_stack="- $escaped_framework ($escaped_branch)"
+    else
+        tech_stack="- ($escaped_branch)"
+    fi
+
+    local recent_change
+    if [[ -n "$escaped_lang" && -n "$escaped_framework" ]]; then
+        recent_change="- $escaped_branch: Added $escaped_lang + $escaped_framework"
+    elif [[ -n "$escaped_lang" ]]; then
+        recent_change="- $escaped_branch: Added $escaped_lang"
+    elif [[ -n "$escaped_framework" ]]; then
+        recent_change="- $escaped_branch: Added $escaped_framework"
+    else
+        recent_change="- $escaped_branch: Added"
+    fi
+
+    local substitutions=(
+        "s|\[PROJECT NAME\]|$project_name|"
+        "s|\[DATE\]|$current_date|"
+        "s|\[EXTRACTED FROM ALL PLAN.MD FILES\]|$tech_stack|"
+        "s|\[ACTUAL STRUCTURE FROM PLANS\]|$project_structure|g"
+        "s|\[ONLY COMMANDS FOR ACTIVE TECHNOLOGIES\]|$commands|"
+        "s|\[LANGUAGE-SPECIFIC, ONLY FOR LANGUAGES IN USE\]|$language_conventions|"
+        "s|\[LAST 3 FEATURES AND WHAT THEY ADDED\]|$recent_change|"
+    )
+
+    for substitution in "${substitutions[@]}"; do
+        if ! sed -i.bak -e "$substitution" "$temp_file"; then
+            log_error "Failed to perform substitution: $substitution"
+            rm -f "$temp_file" "$temp_file.bak"
+            return 1
+        fi
+    done
+
+    # Convert literal \n sequences to actual newlines. A $(printf '\n') command
+    # substitution strips trailing newlines (yields ""), so use awk, which is
+    # also portable across GNU/BSD sed replacement-newline differences.
+    awk '{gsub(/\\n/, "\n"); print}' "$temp_file" > "$temp_file.bak2" && mv "$temp_file.bak2" "$temp_file"
+
+    # Clean up backup files
+    rm -f "$temp_file.bak" "$temp_file.bak2"
+
+    return 0
+}
+
+
+
+
+# Rewrite an existing agent context file in place: append new tech entries to
+# "## Active Technologies", prepend the new change to "## Recent Changes"
+# (keeping only the 2 most recent prior entries), and refresh the timestamp.
+update_existing_agent_file() {
+    local target_file="$1"
+    local current_date="$2"
+
+    log_info "Updating existing agent context file..."
+
+    # Use a single temporary file for atomic update
+    local temp_file
+    temp_file=$(mktemp) || {
+        log_error "Failed to create temporary file"
+        return 1
+    }
+
+    # Process the file in one pass
+    local tech_stack=$(format_technology_stack "$NEW_LANG" "$NEW_FRAMEWORK")
+    local new_tech_entries=()
+    local new_change_entry=""
+
+    # Prepare new technology entries
+    if [[ -n "$tech_stack" ]] && ! grep -q "$tech_stack" "$target_file"; then
+        new_tech_entries+=("- $tech_stack ($CURRENT_BRANCH)")
+    fi
+
+    if [[ -n "$NEW_DB" ]] && [[ "$NEW_DB" != "N/A" ]] && [[ "$NEW_DB" != "NEEDS CLARIFICATION" ]] && ! grep -q "$NEW_DB" "$target_file"; then
+        new_tech_entries+=("- $NEW_DB ($CURRENT_BRANCH)")
+    fi
+
+    # Prepare new change entry
+    if [[ -n "$tech_stack" ]]; then
+        new_change_entry="- $CURRENT_BRANCH: Added $tech_stack"
+    elif [[ -n "$NEW_DB" ]] && [[ "$NEW_DB" != "N/A" ]] && [[ "$NEW_DB" != "NEEDS CLARIFICATION" ]]; then
+        new_change_entry="- $CURRENT_BRANCH: Added $NEW_DB"
+    fi
+
+    # Check if sections exist in the file
+    local has_active_technologies=0
+    local has_recent_changes=0
+
+    if grep -q "^## Active Technologies" "$target_file" 2>/dev/null; then
+        has_active_technologies=1
+    fi
+
+    if grep -q "^## Recent Changes" "$target_file" 2>/dev/null; then
+        has_recent_changes=1
+    fi
+
+    # Process file line by line
+    local in_tech_section=false
+    local in_changes_section=false
+    local tech_entries_added=false
+    local changes_entries_added=false
+    local existing_changes_count=0
+    local file_ended=false
+
+    while IFS= read -r line || [[ -n "$line" ]]; do
+        # Handle Active Technologies section
+        if [[ "$line" == "## Active Technologies" ]]; then
+            echo "$line" >> "$temp_file"
+            in_tech_section=true
+            continue
+        elif [[ $in_tech_section == true ]] && [[ "$line" =~ ^##[[:space:]] ]]; then
+            # Add new tech entries before closing the section
+            if [[ $tech_entries_added == false ]] && [[ ${#new_tech_entries[@]} -gt 0 ]]; then
+                printf '%s\n' "${new_tech_entries[@]}" >> "$temp_file"
+                tech_entries_added=true
+            fi
+            echo "$line" >> "$temp_file"
+            in_tech_section=false
+            continue
+        elif [[ $in_tech_section == true ]] && [[ -z "$line" ]]; then
+            # Add new tech entries before empty line in tech section
+            if [[ $tech_entries_added == false ]] && [[ ${#new_tech_entries[@]} -gt 0 ]]; then
+                printf '%s\n' "${new_tech_entries[@]}" >> "$temp_file"
+                tech_entries_added=true
+            fi
+            echo "$line" >> "$temp_file"
+            continue
+        fi
+
+        # Handle Recent Changes section
+        if [[ "$line" == "## Recent Changes" ]]; then
+            echo "$line" >> "$temp_file"
+            # Add new change entry right after the heading
+            if [[ -n "$new_change_entry" ]]; then
+                echo "$new_change_entry" >> "$temp_file"
+            fi
+            in_changes_section=true
+            changes_entries_added=true
+            continue
+        elif [[ $in_changes_section == true ]] && [[ "$line" =~ ^##[[:space:]] ]]; then
+            echo "$line" >> "$temp_file"
+            in_changes_section=false
+            continue
+        elif [[ $in_changes_section == true ]] && [[ "$line" == "- "* ]]; then
+            # Keep only first 2 existing changes
+            if [[ $existing_changes_count -lt 2 ]]; then
+                echo "$line" >> "$temp_file"
+                # Plain assignment: ((var++)) exits 1 when var is 0, fatal under set -e
+                existing_changes_count=$((existing_changes_count + 1))
+            fi
+            continue
+        fi
+
+        # Update timestamp
+        if [[ "$line" =~ \*\*Last\ updated\*\*:.*[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
+            echo "$line" | sed "s/[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]/$current_date/" >> "$temp_file"
+        else
+            echo "$line" >> "$temp_file"
+        fi
+    done < "$target_file"
+
+    # Post-loop check: if we're still in the Active Technologies section and haven't added new entries
+    if [[ $in_tech_section == true ]] && [[ $tech_entries_added == false ]] && [[ ${#new_tech_entries[@]} -gt 0 ]]; then
+        printf '%s\n' "${new_tech_entries[@]}" >> "$temp_file"
+        tech_entries_added=true
+    fi
+
+    # If sections don't exist, add them at the end of the file
+    if [[ $has_active_technologies -eq 0 ]] && [[ ${#new_tech_entries[@]} -gt 0 ]]; then
+        echo "" >> "$temp_file"
+        echo "## Active Technologies" >> "$temp_file"
+        printf '%s\n' "${new_tech_entries[@]}" >> "$temp_file"
+        tech_entries_added=true
+    fi
+
+    if [[ $has_recent_changes -eq 0 ]] && [[ -n "$new_change_entry" ]]; then
+        echo "" >> "$temp_file"
+        echo "## Recent Changes" >> "$temp_file"
+        echo "$new_change_entry" >> "$temp_file"
+        changes_entries_added=true
+    fi
+
+    # Move temp file to target atomically
+    if ! mv "$temp_file" "$target_file"; then
+        log_error "Failed to update target file"
+        rm -f "$temp_file"
+        return 1
+    fi
+
+    return 0
+}
+#==============================================================================
+# Main Agent File Update Function
+#==============================================================================
+
+# Create or update the context file for one agent; dispatches to
+# create_new_agent_file or update_existing_agent_file as appropriate.
+update_agent_file() {
+    local target_file="$1"
+    local agent_name="$2"
+
+    if [[ -z "$target_file" ]] || [[ -z "$agent_name" ]]; then
+        log_error "update_agent_file requires target_file and agent_name parameters"
+        return 1
+    fi
+
+    log_info "Updating $agent_name context file: $target_file"
+
+    local project_name
+    project_name=$(basename "$REPO_ROOT")
+    local current_date
+    current_date=$(date +%Y-%m-%d)
+
+    # Create directory if it doesn't exist
+    local target_dir
+    target_dir=$(dirname "$target_file")
+    if [[ ! -d "$target_dir" ]]; then
+        if ! mkdir -p "$target_dir"; then
+            log_error "Failed to create directory: $target_dir"
+            return 1
+        fi
+    fi
+
+    if [[ ! -f "$target_file" ]]; then
+        # Create new file from template
+        local temp_file
+        temp_file=$(mktemp) || {
+            log_error "Failed to create temporary file"
+            return 1
+        }
+
+        if create_new_agent_file "$target_file" "$temp_file" "$project_name" "$current_date"; then
+            if mv "$temp_file" "$target_file"; then
+                log_success "Created new $agent_name context file"
+            else
+                log_error "Failed to move temporary file to $target_file"
+                rm -f "$temp_file"
+                return 1
+            fi
+        else
+            log_error "Failed to create new agent file"
+            rm -f "$temp_file"
+            return 1
+        fi
+    else
+        # Update existing file
+        if [[ ! -r "$target_file" ]]; then
+            log_error "Cannot read existing file: $target_file"
+            return 1
+        fi
+
+        if [[ ! -w "$target_file" ]]; then
+            log_error "Cannot write to existing file: $target_file"
+            return 1
+        fi
+
+        if update_existing_agent_file "$target_file" "$current_date"; then
+            log_success "Updated existing $agent_name context file"
+        else
+            log_error "Failed to update existing agent file"
+            return 1
+        fi
+    fi
+
+    return 0
+}
+
+#==============================================================================
+# Agent Selection and Processing
+#==============================================================================
+
+update_specific_agent() {
+    local agent_type="$1"
+
+    case "$agent_type" in
+        claude)
+            update_agent_file "$CLAUDE_FILE" "Claude Code"
+            ;;
+        gemini)
+            update_agent_file "$GEMINI_FILE" "Gemini CLI"
+            ;;
+        copilot)
+            update_agent_file "$COPILOT_FILE" "GitHub Copilot"
+            ;;
+        cursor-agent)
+            update_agent_file "$CURSOR_FILE" "Cursor IDE"
+            ;;
+        qwen)
+            update_agent_file "$QWEN_FILE" "Qwen Code"
+            ;;
+        opencode)
+            update_agent_file "$AGENTS_FILE" "opencode"
+            ;;
+        codex)
+            update_agent_file "$AGENTS_FILE" "Codex CLI"
+            ;;
+        windsurf)
+            update_agent_file "$WINDSURF_FILE" "Windsurf"
+            ;;
+        kilocode)
+            update_agent_file "$KILOCODE_FILE" "Kilo Code"
+            ;;
+        auggie)
+            update_agent_file "$AUGGIE_FILE" "Auggie CLI"
+            ;;
+        roo)
+            update_agent_file "$ROO_FILE" "Roo Code"
+            ;;
+        codebuddy)
+            update_agent_file "$CODEBUDDY_FILE" "CodeBuddy CLI"
+            ;;
+        amp)
+            update_agent_file "$AMP_FILE" "Amp"
+            ;;
+        shai)
+            update_agent_file "$SHAI_FILE" "SHAI"
+            ;;
+        q)
+            update_agent_file "$Q_FILE" "Amazon Q Developer CLI"
+            ;;
+        bob)
+            update_agent_file "$BOB_FILE" "IBM Bob"
+            ;;
+        *)
+            log_error "Unknown agent type '$agent_type'"
+            log_error "Expected: claude|gemini|copilot|cursor-agent|qwen|opencode|codex|windsurf|kilocode|auggie|roo|codebuddy|amp|shai|q|bob"
+            exit 1
+            ;;
+    esac
+}
+
+update_all_existing_agents() {
+    local found_agent=false
+
+    # Check each possible agent file and update if it exists
+    if [[ -f "$CLAUDE_FILE" ]]; then
+        update_agent_file "$CLAUDE_FILE" "Claude Code"
+        found_agent=true
+    fi
+
+    if [[ -f "$GEMINI_FILE" ]]; then
+        update_agent_file "$GEMINI_FILE" "Gemini CLI"
+        found_agent=true
+    fi
+
+    if [[ -f "$COPILOT_FILE" ]]; then
+        update_agent_file "$COPILOT_FILE" "GitHub Copilot"
+        found_agent=true
+    fi
+
+    if [[ -f "$CURSOR_FILE" ]]; then
+        update_agent_file "$CURSOR_FILE" "Cursor IDE"
+        found_agent=true
+    fi
+
+    if [[ -f "$QWEN_FILE" ]]; then
+        update_agent_file "$QWEN_FILE" "Qwen Code"
+        found_agent=true
+    fi
+
+    if [[ -f "$AGENTS_FILE" ]]; then
+        update_agent_file "$AGENTS_FILE" "Codex/opencode"
+        found_agent=true
+    fi
+
+    if [[ -f "$WINDSURF_FILE" ]]; then
+        update_agent_file "$WINDSURF_FILE" "Windsurf"
+        found_agent=true
+    fi
+
+    if [[ -f "$KILOCODE_FILE" ]]; then
+        update_agent_file "$KILOCODE_FILE" "Kilo Code"
+        found_agent=true
+    fi
+
+    if [[ -f "$AUGGIE_FILE" ]]; then
+        update_agent_file "$AUGGIE_FILE" "Auggie CLI"
+        found_agent=true
+    fi
+
+    if [[ -f "$ROO_FILE" ]]; then
+        update_agent_file "$ROO_FILE" "Roo Code"
+        found_agent=true
+    fi
+
+    if [[ -f "$CODEBUDDY_FILE" ]]; then
+        update_agent_file "$CODEBUDDY_FILE" "CodeBuddy CLI"
+        found_agent=true
+    fi
+
+    # Amp was handled by update_specific_agent but previously missing here
+    if [[ -f "$AMP_FILE" ]]; then
+        update_agent_file "$AMP_FILE" "Amp"
+        found_agent=true
+    fi
+
+    if [[ -f "$SHAI_FILE" ]]; then
+        update_agent_file "$SHAI_FILE" "SHAI"
+        found_agent=true
+    fi
+
+    if [[ -f "$Q_FILE" ]]; then
+        update_agent_file "$Q_FILE" "Amazon Q Developer CLI"
+        found_agent=true
+    fi
+
+    if [[ -f "$BOB_FILE" ]]; then
+        update_agent_file "$BOB_FILE" "IBM Bob"
+        found_agent=true
+    fi
+
+    # If no agent files exist, create a default Claude file
+    if [[ "$found_agent" == false ]]; then
+        log_info "No existing agent files found, creating default Claude file..."
+ update_agent_file "$CLAUDE_FILE" "Claude Code" + fi +} +print_summary() { + echo + log_info "Summary of changes:" + + if [[ -n "$NEW_LANG" ]]; then + echo " - Added language: $NEW_LANG" + fi + + if [[ -n "$NEW_FRAMEWORK" ]]; then + echo " - Added framework: $NEW_FRAMEWORK" + fi + + if [[ -n "$NEW_DB" ]] && [[ "$NEW_DB" != "N/A" ]]; then + echo " - Added database: $NEW_DB" + fi + + echo + + log_info "Usage: $0 [claude|gemini|copilot|cursor-agent|qwen|opencode|codex|windsurf|kilocode|auggie|codebuddy|shai|q|bob]" +} + +#============================================================================== +# Main Execution +#============================================================================== + +main() { + # Validate environment before proceeding + validate_environment + + log_info "=== Updating agent context files for feature $CURRENT_BRANCH ===" + + # Parse the plan file to extract project information + if ! parse_plan_data "$NEW_PLAN"; then + log_error "Failed to parse plan data" + exit 1 + fi + + # Process based on agent type argument + local success=true + + if [[ -z "$AGENT_TYPE" ]]; then + # No specific agent provided - update all existing agent files + log_info "No agent specified, updating all existing agent files..." + if ! update_all_existing_agents; then + success=false + fi + else + # Specific agent provided - update only that agent + log_info "Updating specific agent: $AGENT_TYPE" + if ! 
update_specific_agent "$AGENT_TYPE"; then + success=false + fi + fi + + # Print summary + print_summary + + if [[ "$success" == true ]]; then + log_success "Agent context update completed successfully" + exit 0 + else + log_error "Agent context update completed with errors" + exit 1 + fi +} + +# Execute main function if script is run directly +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi + diff --git a/flaredb/.specify/templates/agent-file-template.md b/flaredb/.specify/templates/agent-file-template.md new file mode 100644 index 0000000..4cc7fd6 --- /dev/null +++ b/flaredb/.specify/templates/agent-file-template.md @@ -0,0 +1,28 @@ +# [PROJECT NAME] Development Guidelines + +Auto-generated from all feature plans. Last updated: [DATE] + +## Active Technologies + +[EXTRACTED FROM ALL PLAN.MD FILES] + +## Project Structure + +```text +[ACTUAL STRUCTURE FROM PLANS] +``` + +## Commands + +[ONLY COMMANDS FOR ACTIVE TECHNOLOGIES] + +## Code Style + +[LANGUAGE-SPECIFIC, ONLY FOR LANGUAGES IN USE] + +## Recent Changes + +[LAST 3 FEATURES AND WHAT THEY ADDED] + + + diff --git a/flaredb/.specify/templates/checklist-template.md b/flaredb/.specify/templates/checklist-template.md new file mode 100644 index 0000000..806657d --- /dev/null +++ b/flaredb/.specify/templates/checklist-template.md @@ -0,0 +1,40 @@ +# [CHECKLIST TYPE] Checklist: [FEATURE NAME] + +**Purpose**: [Brief description of what this checklist covers] +**Created**: [DATE] +**Feature**: [Link to spec.md or relevant documentation] + +**Note**: This checklist is generated by the `/speckit.checklist` command based on feature context and requirements. 
+ + + +## [Category 1] + +- [ ] CHK001 First checklist item with clear action +- [ ] CHK002 Second checklist item +- [ ] CHK003 Third checklist item + +## [Category 2] + +- [ ] CHK004 Another category item +- [ ] CHK005 Item with specific criteria +- [ ] CHK006 Final item in this category + +## Notes + +- Check items off as completed: `[x]` +- Add comments or findings inline +- Link to relevant resources or documentation +- Items are numbered sequentially for easy reference diff --git a/flaredb/.specify/templates/plan-template.md b/flaredb/.specify/templates/plan-template.md new file mode 100644 index 0000000..6a8bfc6 --- /dev/null +++ b/flaredb/.specify/templates/plan-template.md @@ -0,0 +1,104 @@ +# Implementation Plan: [FEATURE] + +**Branch**: `[###-feature-name]` | **Date**: [DATE] | **Spec**: [link] +**Input**: Feature specification from `/specs/[###-feature-name]/spec.md` + +**Note**: This template is filled in by the `/speckit.plan` command. See `.specify/templates/commands/plan.md` for the execution workflow. 
+ +## Summary + +[Extract from feature spec: primary requirement + technical approach from research] + +## Technical Context + + + +**Language/Version**: [e.g., Python 3.11, Swift 5.9, Rust 1.75 or NEEDS CLARIFICATION] +**Primary Dependencies**: [e.g., FastAPI, UIKit, LLVM or NEEDS CLARIFICATION] +**Storage**: [if applicable, e.g., PostgreSQL, CoreData, files or N/A] +**Testing**: [e.g., pytest, XCTest, cargo test or NEEDS CLARIFICATION] +**Target Platform**: [e.g., Linux server, iOS 15+, WASM or NEEDS CLARIFICATION] +**Project Type**: [single/web/mobile - determines source structure] +**Performance Goals**: [domain-specific, e.g., 1000 req/s, 10k lines/sec, 60 fps or NEEDS CLARIFICATION] +**Constraints**: [domain-specific, e.g., <200ms p95, <100MB memory, offline-capable or NEEDS CLARIFICATION] +**Scale/Scope**: [domain-specific, e.g., 10k users, 1M LOC, 50 screens or NEEDS CLARIFICATION] + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +[Gates determined based on constitution file] + +## Project Structure + +### Documentation (this feature) + +```text +specs/[###-feature]/ +├── plan.md # This file (/speckit.plan command output) +├── research.md # Phase 0 output (/speckit.plan command) +├── data-model.md # Phase 1 output (/speckit.plan command) +├── quickstart.md # Phase 1 output (/speckit.plan command) +├── contracts/ # Phase 1 output (/speckit.plan command) +└── tasks.md # Phase 2 output (/speckit.tasks command - NOT created by /speckit.plan) +``` + +### Source Code (repository root) + + +```text +# [REMOVE IF UNUSED] Option 1: Single project (DEFAULT) +src/ +├── models/ +├── services/ +├── cli/ +└── lib/ + +tests/ +├── contract/ +├── integration/ +└── unit/ + +# [REMOVE IF UNUSED] Option 2: Web application (when "frontend" + "backend" detected) +backend/ +├── src/ +│ ├── models/ +│ ├── services/ +│ └── api/ +└── tests/ + +frontend/ +├── src/ +│ ├── components/ +│ ├── pages/ +│ └── services/ +└── tests/ + +# 
[REMOVE IF UNUSED] Option 3: Mobile + API (when "iOS/Android" detected) +api/ +└── [same as backend above] + +ios/ or android/ +└── [platform-specific structure: feature modules, UI flows, platform tests] +``` + +**Structure Decision**: [Document the selected structure and reference the real +directories captured above] + +## Complexity Tracking + +> **Fill ONLY if Constitution Check has violations that must be justified** + +| Violation | Why Needed | Simpler Alternative Rejected Because | +|-----------|------------|-------------------------------------| +| [e.g., 4th project] | [current need] | [why 3 projects insufficient] | +| [e.g., Repository pattern] | [specific problem] | [why direct DB access insufficient] | diff --git a/flaredb/.specify/templates/spec-template.md b/flaredb/.specify/templates/spec-template.md new file mode 100644 index 0000000..c67d914 --- /dev/null +++ b/flaredb/.specify/templates/spec-template.md @@ -0,0 +1,115 @@ +# Feature Specification: [FEATURE NAME] + +**Feature Branch**: `[###-feature-name]` +**Created**: [DATE] +**Status**: Draft +**Input**: User description: "$ARGUMENTS" + +## User Scenarios & Testing *(mandatory)* + + + +### User Story 1 - [Brief Title] (Priority: P1) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently - e.g., "Can be fully tested by [specific action] and delivers [specific value]"] + +**Acceptance Scenarios**: + +1. **Given** [initial state], **When** [action], **Then** [expected outcome] +2. **Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +### User Story 2 - [Brief Title] (Priority: P2) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently] + +**Acceptance Scenarios**: + +1. 
**Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +### User Story 3 - [Brief Title] (Priority: P3) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently] + +**Acceptance Scenarios**: + +1. **Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +[Add more user stories as needed, each with an assigned priority] + +### Edge Cases + + + +- What happens when [boundary condition]? +- How does system handle [error scenario]? + +## Requirements *(mandatory)* + + + +### Functional Requirements + +- **FR-001**: System MUST [specific capability, e.g., "allow users to create accounts"] +- **FR-002**: System MUST [specific capability, e.g., "validate email addresses"] +- **FR-003**: Users MUST be able to [key interaction, e.g., "reset their password"] +- **FR-004**: System MUST [data requirement, e.g., "persist user preferences"] +- **FR-005**: System MUST [behavior, e.g., "log all security events"] + +*Example of marking unclear requirements:* + +- **FR-006**: System MUST authenticate users via [NEEDS CLARIFICATION: auth method not specified - email/password, SSO, OAuth?] 
+- **FR-007**: System MUST retain user data for [NEEDS CLARIFICATION: retention period not specified] + +### Key Entities *(include if feature involves data)* + +- **[Entity 1]**: [What it represents, key attributes without implementation] +- **[Entity 2]**: [What it represents, relationships to other entities] + +## Success Criteria *(mandatory)* + + + +### Measurable Outcomes + +- **SC-001**: [Measurable metric, e.g., "Users can complete account creation in under 2 minutes"] +- **SC-002**: [Measurable metric, e.g., "System handles 1000 concurrent users without degradation"] +- **SC-003**: [User satisfaction metric, e.g., "90% of users successfully complete primary task on first attempt"] +- **SC-004**: [Business metric, e.g., "Reduce support tickets related to [X] by 50%"] diff --git a/flaredb/.specify/templates/tasks-template.md b/flaredb/.specify/templates/tasks-template.md new file mode 100644 index 0000000..60f9be4 --- /dev/null +++ b/flaredb/.specify/templates/tasks-template.md @@ -0,0 +1,251 @@ +--- + +description: "Task list template for feature implementation" +--- + +# Tasks: [FEATURE NAME] + +**Input**: Design documents from `/specs/[###-feature-name]/` +**Prerequisites**: plan.md (required), spec.md (required for user stories), research.md, data-model.md, contracts/ + +**Tests**: The examples below include test tasks. Tests are OPTIONAL - only include them if explicitly requested in the feature specification. + +**Organization**: Tasks are grouped by user story to enable independent implementation and testing of each story. + +## Format: `[ID] [P?] 
[Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (e.g., US1, US2, US3) +- Include exact file paths in descriptions + +## Path Conventions + +- **Single project**: `src/`, `tests/` at repository root +- **Web app**: `backend/src/`, `frontend/src/` +- **Mobile**: `api/src/`, `ios/src/` or `android/src/` +- Paths shown below assume single project - adjust based on plan.md structure + + + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Project initialization and basic structure + +- [ ] T001 Create project structure per implementation plan +- [ ] T002 Initialize [language] project with [framework] dependencies +- [ ] T003 [P] Configure linting and formatting tools + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Core infrastructure that MUST be complete before ANY user story can be implemented + +**⚠️ CRITICAL**: No user story work can begin until this phase is complete + +Examples of foundational tasks (adjust based on your project): + +- [ ] T004 Setup database schema and migrations framework +- [ ] T005 [P] Implement authentication/authorization framework +- [ ] T006 [P] Setup API routing and middleware structure +- [ ] T007 Create base models/entities that all stories depend on +- [ ] T008 Configure error handling and logging infrastructure +- [ ] T009 Setup environment configuration management + +**Checkpoint**: Foundation ready - user story implementation can now begin in parallel + +--- + +## Phase 3: User Story 1 - [Title] (Priority: P1) 🎯 MVP + +**Goal**: [Brief description of what this story delivers] + +**Independent Test**: [How to verify this story works on its own] + +### Tests for User Story 1 (OPTIONAL - only if tests requested) ⚠️ + +> **NOTE: Write these tests FIRST, ensure they FAIL before implementation** + +- [ ] T010 [P] [US1] Contract test for [endpoint] in tests/contract/test_[name].py +- [ ] T011 [P] [US1] 
Integration test for [user journey] in tests/integration/test_[name].py + +### Implementation for User Story 1 + +- [ ] T012 [P] [US1] Create [Entity1] model in src/models/[entity1].py +- [ ] T013 [P] [US1] Create [Entity2] model in src/models/[entity2].py +- [ ] T014 [US1] Implement [Service] in src/services/[service].py (depends on T012, T013) +- [ ] T015 [US1] Implement [endpoint/feature] in src/[location]/[file].py +- [ ] T016 [US1] Add validation and error handling +- [ ] T017 [US1] Add logging for user story 1 operations + +**Checkpoint**: At this point, User Story 1 should be fully functional and testable independently + +--- + +## Phase 4: User Story 2 - [Title] (Priority: P2) + +**Goal**: [Brief description of what this story delivers] + +**Independent Test**: [How to verify this story works on its own] + +### Tests for User Story 2 (OPTIONAL - only if tests requested) ⚠️ + +- [ ] T018 [P] [US2] Contract test for [endpoint] in tests/contract/test_[name].py +- [ ] T019 [P] [US2] Integration test for [user journey] in tests/integration/test_[name].py + +### Implementation for User Story 2 + +- [ ] T020 [P] [US2] Create [Entity] model in src/models/[entity].py +- [ ] T021 [US2] Implement [Service] in src/services/[service].py +- [ ] T022 [US2] Implement [endpoint/feature] in src/[location]/[file].py +- [ ] T023 [US2] Integrate with User Story 1 components (if needed) + +**Checkpoint**: At this point, User Stories 1 AND 2 should both work independently + +--- + +## Phase 5: User Story 3 - [Title] (Priority: P3) + +**Goal**: [Brief description of what this story delivers] + +**Independent Test**: [How to verify this story works on its own] + +### Tests for User Story 3 (OPTIONAL - only if tests requested) ⚠️ + +- [ ] T024 [P] [US3] Contract test for [endpoint] in tests/contract/test_[name].py +- [ ] T025 [P] [US3] Integration test for [user journey] in tests/integration/test_[name].py + +### Implementation for User Story 3 + +- [ ] T026 [P] [US3] Create 
[Entity] model in src/models/[entity].py +- [ ] T027 [US3] Implement [Service] in src/services/[service].py +- [ ] T028 [US3] Implement [endpoint/feature] in src/[location]/[file].py + +**Checkpoint**: All user stories should now be independently functional + +--- + +[Add more user story phases as needed, following the same pattern] + +--- + +## Phase N: Polish & Cross-Cutting Concerns + +**Purpose**: Improvements that affect multiple user stories + +- [ ] TXXX [P] Documentation updates in docs/ +- [ ] TXXX Code cleanup and refactoring +- [ ] TXXX Performance optimization across all stories +- [ ] TXXX [P] Additional unit tests (if requested) in tests/unit/ +- [ ] TXXX Security hardening +- [ ] TXXX Run quickstart.md validation + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies - can start immediately +- **Foundational (Phase 2)**: Depends on Setup completion - BLOCKS all user stories +- **User Stories (Phase 3+)**: All depend on Foundational phase completion + - User stories can then proceed in parallel (if staffed) + - Or sequentially in priority order (P1 → P2 → P3) +- **Polish (Final Phase)**: Depends on all desired user stories being complete + +### User Story Dependencies + +- **User Story 1 (P1)**: Can start after Foundational (Phase 2) - No dependencies on other stories +- **User Story 2 (P2)**: Can start after Foundational (Phase 2) - May integrate with US1 but should be independently testable +- **User Story 3 (P3)**: Can start after Foundational (Phase 2) - May integrate with US1/US2 but should be independently testable + +### Within Each User Story + +- Tests (if included) MUST be written and FAIL before implementation +- Models before services +- Services before endpoints +- Core implementation before integration +- Story complete before moving to next priority + +### Parallel Opportunities + +- All Setup tasks marked [P] can run in parallel +- All Foundational tasks marked [P] can run in 
parallel (within Phase 2) +- Once Foundational phase completes, all user stories can start in parallel (if team capacity allows) +- All tests for a user story marked [P] can run in parallel +- Models within a story marked [P] can run in parallel +- Different user stories can be worked on in parallel by different team members + +--- + +## Parallel Example: User Story 1 + +```bash +# Launch all tests for User Story 1 together (if tests requested): +Task: "Contract test for [endpoint] in tests/contract/test_[name].py" +Task: "Integration test for [user journey] in tests/integration/test_[name].py" + +# Launch all models for User Story 1 together: +Task: "Create [Entity1] model in src/models/[entity1].py" +Task: "Create [Entity2] model in src/models/[entity2].py" +``` + +--- + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Setup +2. Complete Phase 2: Foundational (CRITICAL - blocks all stories) +3. Complete Phase 3: User Story 1 +4. **STOP and VALIDATE**: Test User Story 1 independently +5. Deploy/demo if ready + +### Incremental Delivery + +1. Complete Setup + Foundational → Foundation ready +2. Add User Story 1 → Test independently → Deploy/Demo (MVP!) +3. Add User Story 2 → Test independently → Deploy/Demo +4. Add User Story 3 → Test independently → Deploy/Demo +5. Each story adds value without breaking previous stories + +### Parallel Team Strategy + +With multiple developers: + +1. Team completes Setup + Foundational together +2. Once Foundational is done: + - Developer A: User Story 1 + - Developer B: User Story 2 + - Developer C: User Story 3 +3. 
Stories complete and integrate independently + +--- + +## Notes + +- [P] tasks = different files, no dependencies +- [Story] label maps task to specific user story for traceability +- Each user story should be independently completable and testable +- Verify tests fail before implementing +- Commit after each task or logical group +- Stop at any checkpoint to validate story independently +- Avoid: vague tasks, same file conflicts, cross-story dependencies that break independence diff --git a/flaredb/Cargo.toml b/flaredb/Cargo.toml new file mode 100644 index 0000000..884636f --- /dev/null +++ b/flaredb/Cargo.toml @@ -0,0 +1,66 @@ +[workspace] +members = [ + "crates/flaredb-types", + "crates/flaredb-proto", + "crates/flaredb-storage", + "crates/flaredb-raft", + "crates/flaredb-server", + "crates/flaredb-pd", + "crates/flaredb-client", + "crates/flaredb-cli", +] +resolver = "2" + +[workspace.package] +version = "0.1.0" +edition = "2021" +license = "MIT OR Apache-2.0" +rust-version = "1.75" + +[workspace.dependencies] +# Async runtime +tokio = { version = "1.40", features = ["full"] } +tokio-stream = "0.1" +futures = "0.3" +async-trait = "0.1" + +# Distributed Consensus +openraft = { version = "0.9", features = ["serde"] } + +# Storage +rocksdb = { version = "0.24", default-features = false, features = ["multi-threaded-cf", "zstd", "lz4", "snappy"] } + +# gRPC +tonic = "0.12" +tonic-build = "0.12" +tonic-health = "0.12" +prost = "0.13" +prost-types = "0.13" + +# Serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +bincode = "1.3" + +# Logging & Tracing +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } + +# Error handling +thiserror = "1.0" +anyhow = "1.0" + +# CLI +clap = { version = "4.5", features = ["derive"] } + +# Utilities +sha2 = "0.10" +bytes = "1.5" + +# Testing +tempfile = "3" + +[profile.release] +lto = true +opt-level = 3 +codegen-units = 1 diff --git a/flaredb/advice.md 
b/flaredb/advice.md new file mode 100644 index 0000000..7d90d61 --- /dev/null +++ b/flaredb/advice.md @@ -0,0 +1,124 @@ +ざっくり結論 + +* **Chainfire**は、Raft+RocksDB+gRPC+Gossip(SWIM/foca)で「etcd 風の分散KV+Watch」を狙う設計。Rust のワークスペース分割もきれいで、API/ストレージ/ウォッチ/ゴシップ/ラフトがモジュール化されている。ただし**Raft の対外RPCがまだ未配線(in‑memory/ダミー)**で、本当の多ノードクラスタとしては未完成。単一ノードやプロセス内検証には十分使える段階。 +* **FlareDB**は、PD(Placement Driver)+TSO(単調増加タイムスタンプ)+KV(Raw/CAS)+Raftサービス+リージョン/マルチラフトの下地+Merkle(整合性検査の雛形)まで手が入っており、**実験用の分散ストレージ最小系**としてよくまとまっている。CI/テスト項目・Quickstart・検証スクリプトもあり、開発者体験が良い。実運用には、マルチラフトの完成度・レプリケーション/再配置・フォロワリード/線形化リード・トランザクションなど**次の一歩**が必要。 + +--- + +## Chainfire:何ができていて、どこが足りないか + +**できていること(コードから確認できる実体)** + +* Rust Workspace でAPI/サーバ/ストレージ/ラフト/ゴシップ/ウォッチが分離。依存は `openraft`(Raft)・`foca`(SWIM Gossip)・`rocksdb`・`tonic/prost`(gRPC)に整理済み。 +* Raft 設定は OpenRaft の典型値で初期化(心拍/選挙タイムアウト/スナップショット方針等)し、ユニットテストもあり。 +* gRPC の **KV / Watch / Cluster / (内部)Raft** サービスを一つのTonicサーバに束ねて起動する作り。 +* **Watch** は双方向ストリームで、内部のWatchRegistryとつながるちゃんとした実装。クライアント側の受信ハンドルも用意済み。 +* RocksDB をCF分割で利用。スナップショットのビルド/適用テストあり(データ転送の下地)。 + +**詰めが甘い/未完成な点(現状の制約)** + +* **Raft RPCが未配線**:`RaftRpcClient` は “gRPC実装を後で差す” 前提のトレイトのまま。ノード生成時も **Dummy/In‑memory のクライアント**が使われており、実ノード間通信になっていない。これだと**単一プロセス内での検証**はできるが、別プロセス/別ホストにまたぐクラスタは動かない。 +* **Raft用ポートの扱い**:ログには Raft用アドレスを出しているが、実際のTonicサーバは **APIアドレスでまとめて** `RaftService` も公開している。ポート分離・セキュリティ/ネットワーク設計が未整理。 +* クラスタメンバーシップ変更(joint consensus)や、線形化読み取り(ReadIndex)、スナップショット転送の堅牢化など、Raft運用の“本番ポイント”は未記述/未配線に見える(設計としてはOpenRaftが担保可能)。 + +**今の実用性(どこで役に立つ?)** + +* **研究/検証・単一ノードのメタデータKV**としては十分。“etcd互換風のAPI+Watch”の感触を掴むには良い。 +* **本番クラスタ**やフェイルオーバを求める用途では、**Raft RPC配線とメンバーシップ管理**が入るまで待ちが必要。 + +**短期で刺さる改善(着手順)** + +1. **RaftのgRPCクライアント**を `internal_proto` に基づいて実装し、`RaftRpcClient` に差し込む。 +2. **Raft用ポート分離**:`api_addr` と `raft_addr` を別サーバで起動し、TLS/認証の下地も確保。 +3. **Gossip⇔Raft連携**:focaでの生存監視をトリガに、メンバー自動追加/離脱をRaftのjoint‑consensusに流す。依存は既にワークスペースにある。 +4. **線形化Read/ReadIndex**実装、**フォロワリード**(許容するなら条件付き)を整理。 +5. 
**ウォッチの厳密な順序/Revision**保証をStateMachineの適用と一体化(watch_txの結線)。 +6. **スナップショット転送の実戦投入**(チャンク/再送/検証)。テストは下地あり。 +7. **メトリクス/トレース**(Prometheus/OpenTelemetry)と**障害注入テスト**。 +8. Docker/Helm/Flakeの梱包をCIに載せる。 + +--- + +## FlareDB:何ができていて、どこが足りないか + +**できていること(コードから確認できる実体)** + +* **PD+TSO** の独立プロセス。**Quickstart**に起動順とCLI操作(TSO/Raw Put/Get/CAS)が書かれており、User StoryのチェックリストにもTSO達成が明記。 +* **サーバ側サービス**:`KvRaw`/`KvCas`/`RaftService` を同一 gRPC サーバで提供。 +* **PD連携のハートビート/再接続・リージョン更新ループ**の骨格がある(起動後に定期HB→失敗時は再接続、リージョン情報を同期)。 +* **Merkle**(領域ハッシュの雛形)で後々のアンチエントロピー/整合性検査を意識。 +* **テストと仕様フォルダが豊富**:レプリケーション/マルチリージョン/スプリット/整合性などのテスト群、spec・scripts で動作確認の導線がある。 + +**詰めが甘い/未完成な点(現状の制約)** + +* **マルチラフトの完成度**:リージョン分割・再配置・投票者/ラーナ/学習者の遷移、PDのスケジューリング(リバランス/ホットキー対策)の“運用アルゴリズム”はこれから。ディレクトリやspecはあるが、本番相当の道具立ては未完成。 +* **リードパスの整理**:強整合/フォロワリード/ReadIndexの選択や遅延観測の制御が未整備に見える。 +* **トランザクション(MVCC)**:TSOはあるが、二相コミットや悲観/楽観制御、ロールバック/ロック解放の実働コードはこれから(CASはある)。 +* **障害時挙動と耐久性**:スナップショット/ログの回復・リージョンマージ・アンチエントロピー(Merkle駆動)のバックグラウンドジョブは雛形段階。 + +**今の実用性** + +* 研究用途・PoC として**単一~少数ノードのKV(Raw/CAS)**を回し、PD/TSO連携やリージョンの概念を試すには充分。 +* フル機能の分散トランザクショナルKV/SQL バックエンドを**本番投入**するには、マルチラフト/リージョン管理/トランザクション/可観測性などの整備が必要。 + +**短期で刺さる改善(着手順)** + +1. **マルチラフトの完成**:リージョンスプリットのトリガ(サイズ/負荷)→新リージョンのRaft起動→PDのメタ更新→クライアントのRegion Cache更新をE2Eでつなぐ。テスト骨子は既にある。 +2. **フォロワリード/線形化Read**の切替を導入(読み取りSLAと一貫性を両立)。 +3. **MVCC+2PC**:TSO を commit_ts/read_ts に使い、Prewrite/Commit(TiKV流) or OCC を追加。Quickstart のCASを土台に昇華。 +4. **Merkleベースのアンチエントロピー**:バックグラウンドでリージョンのMerkle葉を比較し、差分レンジを修復。 +5. **PDのスケジューラ**:移動コスト・ホットキー・障害隔離を考慮した配置。 +6. **メトリクス/トレース/プロファイリング**と**YCSB/Jepsen系テスト**で性能と安全性を可視化。 + +--- + +## さらに高みへ(共通の設計指針) + +1. **制御面(Chainfire)×データ面(FlareDB)の分業を明確化** + Chainfire を“クラスタ制御の中枢”(ノードメタ/アロケーション/設定/ウォッチ)に、FlareDB を“データ平面”に寄せる。Gossipの生存情報→ChainfireのKV→FlareDB PDへの反映という**単一路**を敷くと運用が楽になる。 + +2. **アドレス解決とメンバーシップの一元管理** + ChainfireのCluster APIに Raft peer の `BasicNode` 情報を登録/取得する経路を作り、`NetworkFactory` がそこから**動的にダイヤル**できるようにする。現状はトレイトとFactoryが揃っているので配線だけで前進する。 + +3. 
**明示的なポート分離とゼロトラスト前提** + Client API(KV/Watch)と Peer RPC(Raft)を分離配信し、mTLS+認可を段階導入。今は一つのTonicサーバに同居している。 + +4. **線形化の“契約”をドキュメント化** + Watch の順序/Revision と Read の一貫性(ReadIndex/フォロワ/リーダ)をモード化して明示する。API層は既に独立しているので拡張しやすい。 + +5. **スナップショットと再構築の運用設計** + 既存のスナップショット構造を基に**chunked streaming/再送/検証**を実装し、ローリングアップグレードと迅速なリカバリを可能に。 + +6. **MVCC+TSOで“トランザクション対応のFlareDB”へ** + まずは単一リージョンで2PC/OCCを成立させ、その後リージョンを跨ぐ分散トランザクションへ。Quickstart とタスク表に沿って前進できる。 + +7. **可観測性と安全性** + すべてのRaft RPC・適用・スナップショット・Gossipイベントに**トレースID**を通し、Prometheus でSLOを監視。障害注入テスト(ネット分断/ディスク遅延)を回す。 + +--- + +## 小さく始めるチェックリスト(1–2スプリント) + +**Chainfire** + +* [ ] `RaftRpcClient` の gRPC 実装を追加(`internal_proto` をクライアント化)し、`Dummy` を置き換え。 +* [ ] `api_addr` と `raft_addr` を別 `Server` で `serve`。ログ出力と一致させる。 +* [ ] Gossip からメンバーの up/down を拾い、Cluster API経由でRaft構成変更に反映。 + +**FlareDB** + +* [ ] `verify-multiraft.sh` とテスト群に合わせ、リージョンスプリット→新ラフト起動→PD更新→クライアントRegion Cache更新の一連をE2E化。 +* [ ] フォロワリード/線形化Readの切替をサービスに実装。 +* [ ] TSO を使った MVCC の最小実装(単一リージョン)を追加してから、2PCへ拡張。 + +--- + +## まとめ(現実的な採用ライン) + +* **今すぐの実用**:Chainfire はシングルノードの構成管理KV+Watchに、FlareDB は PD/TSO を含む実験用KV に向く。 +* **クラスタ本番**:Chainfire は **Raft RPC 配線+メンバーシップ**実装が最初の関門。FlareDB は **マルチラフト/リージョン運用+MVCC/2PC** が到達点。 +* 設計の方向性はどちらも筋が良い。エンジンは積んだ、あとは**配線と路面(運用)**を敷けば走れる、という状態だ。 + +> 補足:この評価はアップロードされたリポジトリのソース構成・実装・Spec/Quickstart に基づいています(抜粋例:依存関係・OpenRaft設定・サーバ起動配線・Dummy/In‑memory RPC・PD/TSO・テスト/スクリプトなど)。必要なら、具体ファイル/行も辿れるように示しました。 + +この先、どのユースケース(たとえばK8sのコントロールプレーン用KV/大規模Key-Valueの裏側/学術実験)を主眼にするかで実装の優先度は変わります。用途を教えてくれれば、必要機能の優先順位表まで落とし込みます。 diff --git a/flaredb/chat.md b/flaredb/chat.md new file mode 100644 index 0000000..c650823 --- /dev/null +++ b/flaredb/chat.md @@ -0,0 +1,1935 @@ +# You Asked: + +Apache Cassandraの運用が大変という話は良く聞きますが、何がそんなに苦痛なのでしょうか? 
+ +# Gemini Responded: + +Apache Cassandraはその圧倒的な書き込み性能と可用性の高さで知られていますが、運用エンジニアからは**「猛獣」**や**「じゃじゃ馬」**に例えられるほど、管理が難しいことでも有名です。 + +その「苦痛」の原因は、主に**Java VM (JVM) の挙動**、**分散システム特有のデータ整合性維持**、そして**独特なデータ削除の仕組み**の3点に集約されます。 + +具体的に何がそれほど大変なのか、現場のエンジニアを悩ませるポイントを整理しました。 + +--- + +### 1. JVMとガベージコレクション(GC)の悪夢 +CassandraはJavaで動作するため、メモリ管理はJVMに依存します。これが最大の運用課題となることが多いです。 + +* **Stop-the-World (STW):** + JVMがメモリ整理(ガベージコレクション)を行う際、アプリケーションの動作を一瞬停止させることがあります。Cassandraのような高負荷データベースでは、ヒープメモリが巨大になりがちで、この停止時間が長引く(数秒〜数十秒)ことがあります。 +* **「死んだ」と誤判定される:** + GCで停止している間、Cassandraは他のノードへの応答(Heartbeat)ができなくなります。すると、周囲のノードは**「こいつはダウンした」と判定**して切り離してしまいます。GCが終わって復帰しても、クラスタの状態が不安定になり、アクセスエラーが多発します。 +* **チューニングの迷宮:** + これを防ぐためのJVMチューニング(New/Old領域の比率、GCアルゴリズムの選定など)は非常に高度なスキルを要し、「秘伝のタレ」化しがちです。 + +### 2. 「コンパクション」と「リペア」によるリソース枯渇 +Cassandraは「書き込み」を高速化するために、バックグラウンドで重い処理を行います。これが突然サーバーのリソースを食いつぶすことがあります。 + + + +* **コンパクション (Compaction):** + データは不変のファイル(SSTable)として次々とディスクに書き込まれます。放っておくとファイル数が膨大になり、**読み込み速度が劇的に低下**します。 + これを防ぐため、バックグラウンドで複数のファイルを結合(コンパクション)し続けますが、この処理がCPUとディスクI/Oを強烈に消費し、本来のサービスの性能を落とすことがあります。 +* **リペア (Repair):** + Cassandraは「結果整合性」を採用しているため、ノード間でデータのズレが生じます。これを直すために定期的に `nodetool repair` というコマンドを実行する必要がありますが、これがまた重い処理です。失敗したり、時間がかかりすぎたりして、運用者を疲弊させます。 + +### 3. 「削除」が実は「書き込み」である (Tombstones問題) +Cassandraでデータを削除しても、すぐにはディスクから消えません。 + +* **墓石 (Tombstone) を置く:** + 削除命令を送ると、「ここは削除済み」というマーク(Tombstone)を**書き込み**ます。 +* **ゾンビデータの復活:** + もし定期的な「リペア」をサボったり、ノードが長期間ダウンして復帰したりすると、**削除したはずのデータが復活**することがあります(Tombstone自体が消えてしまうため)。 +* **読み込み時の罠:** + 「10件データを取ってくる」というクエリを投げた際、Cassandraが内部で**「10万個のTombstone(削除済みマーク)をスキャンして、やっと生きている10件を見つける」**という状況に陥ることがあります。これによりクエリがタイムアウトし、「データはあるのに読めない」という事象が発生します。 + +### 4. スケールアウト/インの遅さ +「リニアにスケールする」と言われますが、ノードの追加や削除はボタン一つで瞬時に終わるわけではありません。 + +* **データの再配置 (Streaming):** + 新しいノードを追加すると、他のノードから担当分のデータをネットワーク越しにコピー(ストリーミング)し始めます。テラバイト級のデータがある場合、これに数時間〜数日かかり、その間ネットワーク帯域が圧迫されます。 + +--- + +### まとめ:なぜ苦痛なのか? 
+ +一言で言えば、**「データベースが自律的にやってくれるはずの面倒な処理(掃除、整頓、ゴミ捨て)を、人間が手動やスクリプトで絶妙にコントロールしてあげないと、自重で倒壊する」**からです。 + +> **注記:** 近年のバージョン(Cassandra 4.0以降)では、JavaのZGC/Shenandoah GCへの対応や、ストリーミングの改善により、以前よりはかなり運用が楽になっています。また、C++で書き直された互換DBである **ScyllaDB** への移行や、AWS Keyspacesなどのマネージドサービスを利用することで、この苦痛を回避するケースも増えています。 + +**次のステップ:** +もし現在、具体的な技術選定をされているのであれば、「Cassandraのメリットが活きるユースケース(書き込み特化)」と「運用の楽な代替案(ScyllaDB等)」の比較について整理しましょうか? + +--- + +# You Asked: + +Rustで極めて単純で高パフォーマンスなKVSのようなDBを書き、その上のレイヤーにSQLのような高度なクエリのできるものを乗せるということを考えています。これを作るに当たり、考えることはどのようなことがあるでしょうか。 + +# Gemini Responded: + +Rustで「KVS(ストレージエンジン) + SQLレイヤー(クエリエンジン)」という2層構造のDBを作るのは、**CockroachDB (RocksDB/Pebbleの上に構築)** や **TiDB (TiKVの上に構築)**、**FoundationDB** などで採用されている非常にモダンで拡張性の高いアーキテクチャです。 + +この構成をRustで実装する際に直面する技術的課題と、設計上の重要な決定事項を、下のレイヤーから順に整理します。 + +--- + +### 1. KVSレイヤー(ストレージエンジン)の設計 +「極めて単純で高パフォーマンス」を目指す場合、アーキテクチャの選定が全てです。 + +#### A. データ構造の選定:B-Tree vs LSM-Tree +* **LSM-Tree (Log-Structured Merge-tree):** + * **特徴:** 書き込み(Write)が非常に高速。データは追記のみで、後でバックグラウンドでマージ(コンパクション)する。RocksDBやLevelDBのアプローチ。 + * **Rustでの課題:** コンパクション処理の実装が複雑。また、読み込み時に複数のSSTableを探すため、Bloom Filterの実装が必須。 +* **B+Tree:** + * **特徴:** 読み込み(Read)が安定して高速。更新はインプレース(場所を指定して書き換え)。従来のリレーショナルDB(PostgreSQL, MySQL/InnoDB)のアプローチ。 + * **Rustでの課題:** ページキャッシュ(Buffer Pool)の管理が難しい。Rustの借用チェッカーと「相互参照するページ構造」は相性が悪く、`unsafe` や `Rc>`、あるいはインデックス(`Arena`アロケータ)を使ったグラフ構造の工夫が必要です。 + + + +#### B. 永続化と安全性 (WAL) +KVSが落ちてもデータを失わないために、**WAL (Write Ahead Log)** の実装が必要です。 +* メモリ(MemTable)に書く前に、必ずディスク上のログファイルに追記する。 +* **課題:** ディスクへの `fsync` のタイミング。毎回の書き込みでやると遅くなるため、バッチ処理や `io_uring` (Linux) を使った非同期I/Oの活用がパフォーマンスの鍵になります。 + +### 2. インターフェース層(KVSとSQLの接着剤) +ここがこのアーキテクチャの肝です。リレーショナルなデータ(テーブル、行、列)を、どうやってフラットなバイト列(Key-Value)に落とし込むか? 
+ +#### キーエンコーディング (Key Encoding) +SQLの `SELECT * FROM users WHERE id > 100` のような**範囲検索**を高速に行うには、KVSのキーが**辞書順でソート**されている必要があります。 + +* **キーの設計例:** + 単純な連番ではなく、以下のようなバイナリ形式にする必要があります。 + `[TableID: 4byte] [IndexID: 4byte] [PrimaryKey: Variable]` +* **課題:** + * 可変長データ(文字列など)をキーに含める場合、ソート順を壊さないエンコーディング(Memcomparable format)の実装が必要です。 + * Rustには `byteorder` クレートなどがありますが、カスタムシリアライザを書くことになるでしょう。 + +### 3. トランザクション制御 (MVCC) +「高パフォーマンス」かつ「SQL」となると、並行処理制御は避けて通れません。単純なロック(Mutex)では性能が出ないため、**MVCC (Multi-Version Concurrency Control)** が必須になります。 + +* **実装イメージ:** + KVSのキーにタイムスタンプやバージョン番号を含めます。 + * Key: `User:1` -> Value: `{"name": "Alice"}` (Version 10) + * 読み込み時は「Version 10時点のスナップショット」を見る。 +* **Rustでの課題:** + 古いバージョン(ガベージ)をいつ消すか(Vacuum/GC)の管理。Rustのチャネル(`tokio::sync`や`crossbeam`)を使った非同期なGCスレッドの設計が求められます。 + +### 4. SQLレイヤー(クエリエンジン) +ここは「車輪の再発明」を避けるか、あえてするかで開発工数が劇的に変わります。 + +#### A. パーサーとプランナー +* **sqlparser-rs:** SQLのパース(AST構築)には、デファクトスタンダードである `sqlparser` クレートを使うのが賢明です。 +* **クエリオプティマイザ:** ここが一番難しい部分です。「JOINの順序」や「インデックスを使うべきか全件走査すべきか」の判断ロジック。最初は「ルールベース(RBO)」で作り、余裕があれば「コストベース(CBO)」へ進化させるのが一般的です。 + +#### B. 実行モデル (Executor) +* **Volcano Model (Iterator):** `next()` を呼ぶと1行返ってくる。実装は簡単だが、関数呼び出しのオーバーヘッドが大きい。 +* **Vectorized Execution:** 一度に数千行(バッチ)を処理する。**Apache Arrow (Rust版)** を内部データ形式として使うと、SIMD命令が効きやすく、爆速になります。Rustはここが得意です。 + +--- + +### Rustのエコシステム活用案 + +ゼロから書く楽しみを損なわない範囲で、以下のクレートを調査・参考にすると良いでしょう。 + +| コンポーネント | 使うべき/参考にすべきクレート | 備考 | +| :--- | :--- | :--- | +| **I/O** | `tokio-uring` / `glommio` | Linuxのio_uringを活用してI/O待ちを極限まで減らす | +| **SQL Parser** | `sqlparser` | 自作するとSQL標準への追従だけで数年かかるため | +| **データ形式** | `arrow-rs` / `parquet` | カラムナフォーマットやベクトル処理の参考に | +| **KVエンジン** | `sled` / `agatedb` | Rust製KVSのソースコードは宝の山 | + +### 開発の第一歩としての提案 + +いきなりSQL層まで作ると挫折しやすいため、まずは**「KVS層だけで、リレーショナルなスキャンができるか?」**を検証するプロトタイプ作成をお勧めします。 + +1. `BTreeMap` (メモリオンリー) をバックエンドにする。 +2. その上に「タプルをバイナリキーに変換するエンコーダー」を作る。 +3. 
`scan_prefix(TableID)` でテーブル全件取得ができるか試す。 + +**次のステップ:** +具体的に、**「LSM-TreeとB-Treeのどちらで実装するか」**、あるいは**「MVCC(トランザクション)の設計」**について、詳細な実装戦略を掘り下げてみましょうか? + +--- + +# You Asked: + +- KVSを基盤とした分散型超高速DB +  - FoundationDBのようなものを目指す +    - Rustで書く +    - 多分ストレージ部分にはRocksDBを使うこととなるだろう +    - 良い感じのモジュール化により、柔軟な構成が取れるように目指す。 +      - Simple by default, Powerful when neededという感じ +    - KVS部分の最終的な処理を変えることにより、強整合性と結果整合性を切り替えることが可能 +  - 想定用途 +    - S3的なオブジェクトストレージのための分散DB +      - 超高速である必要がある。また、ちゃんとしたクエリができる必要もある(リストアップするようなクエリが必要になるだろう)。 +    - SNSのバックエンドDB +      - まあSNSに限った話ではないかも知れない。超高速のDBとしてイベント等を処理できるようにしたい。 +  - EC2のようなものから、接続路もボトルネックが出ないように接続すべき +    - APIの接続部がボトルネックにならないようにしたい。 +    - VM基盤は今作っているものがあるので、それと良い感じにメッシュで接続するなどしたい。 + +このような感じのものを作ろうとしています。全体のアーキテクチャを考えていきたいですが、どのようにするとよいでしょうか。 + +# Gemini Responded: + +FoundationDB(FDB)のアーキテクチャ(ステートレスなトランザクション層とステートフルなストレージ層の分離)を参考にしつつ、Rustで「強整合性と結果整合性を切り替え可能」かつ「VM基盤とメッシュ接続」という野心的な要件を満たすアーキテクチャを提案します。 + +「S3のメタデータ管理」と「SNSのイベント処理」は、実はアクセスパターンが異なります(前者はRange Scan重視、後者はWrite Throughput重視)。これを両立するための**モジュラーなアーキテクチャ**が必要です。 + +以下に、4つのレイヤーに分けた設計案を提示します。 + +----- + +### 全体アーキテクチャ図(概念) + +```mermaid +graph TD + Client[VM / Client App] -->|Mesh / Shared Mem| Proxy[Network Proxy / Sidecar] + Proxy -->|Route Request| TLog[Transaction / Query Layer] + + subgraph "Control Plane (Placement Driver)" + PD[Metadata & Shard Manager] + end + + subgraph "Storage Layer (Distributed)" + Node1[Storage Node 1] + Node2[Storage Node 2] + Node3[Storage Node 3] + end + + TLog -->|gRPC / QUIC| Node1 + TLog -->|gRPC / QUIC| Node2 + PD -.->|Cluster Info| TLog + PD -.->|Rebalancing| Node1 +``` + +----- + +### 1\. 
Storage Node(データの永続化と局所性) + +RocksDBをラップし、ネットワーク越しの読み書きに応答する最下層です。ここで「整合性モデル」の切り替えを吸収します。 + +#### **モジュール設計: `ConsensusTrait` の導入** + +RustのTraitシステムを活用し、コンセンサスアルゴリズムをプラガブルにします。 + + * **Core Trait:** + ```rust + trait ConsensusEngine { + async fn propose(&self, batch: WriteBatch) -> Result<()>; + async fn read(&self, key: Key) -> Result<Value>; + } + ``` + * **Mode A: Strong Consistency (S3 Metadata向け)** + * **実装:** Multi-Raft (TiKV方式)。データを「Region(例えば96MBごとの範囲)」に分割し、RegionごとにRaftグループを作ります。 + * **Rustライブラリ:** `raft-rs` (TiKVで使用) や `openraft`。 + * **特徴:** 常に最新のデータを読める。Range Scanが得意。 + * **Mode B: Eventual Consistency (SNS/ログ向け)** + * **実装:** Consistent Hashing + Vector Clocks。リーダーレスで、任意のノードに書き込み可能。 + * **特徴:** 書き込みレイテンシが極小。一時的な不整合を許容。 + +#### **RocksDBの活用** + +RocksDBは単なるエンジンとして使い、その上の\*\*「キーのプレフィックス設計」\*\*でマルチテナントやテーブル構造を表現します。 + +----- + +## 2\. Transaction / Query Layer(頭脳) + +FoundationDBの最大の特徴は、この層が\*\*ステートレス(状態を持たない)\*\*であることです。これにより、計算リソース(CPU)だけを個別にスケールアウトできます。 + + * **役割:** + * SQLやKV操作のパース。 + * どのノード(Shard/Region)にデータがあるかの解決(Placement Driverへの問い合わせ)。 + * **分散トランザクション制御 (2PC / OCC):** + * 強整合性モードでは、ここでタイムスタンプを取得し、衝突判定を行います。 + * **S3リストアップへの対応:** + * S3の `LIST` 操作は重いため、単純なスキャンではなく、RocksDBの `Iterator` を活用し、\*\*「Pagination付きのRange Scan」\*\*として実装します。 + +----- + +### 3\. Network & VM Mesh Integration(ボトルネックの排除) + +ここがあなたの「VM基盤を作っている」という強みが活きる部分です。TCP/IPスタックのオーバーヘッドを極限まで削ります。 + +#### **A. 接続方式: VM基盤との統合** + +VM(EC2相当)からDBへの接続を高速化するために、以下の3段階の接続方式を用意します。 + +1. **Standard (TCP/QUIC):** 外部からの通常の接続。`tokio` + `quinn` (QUIC) で実装。 +2. **Internal Mesh (eBPF/XDP):** 同一データセンター内の通信。Linuxカーネルのネットワークスタックをバイパスし、パケット転送を高速化。 +3. **Co-location (vhost-user / Shared Memory):** + * もしVMホストOS上でDBプロキシが動いているなら、VM(ゲスト)とホスト間で**共有メモリ**を使って通信します。 + * Rustの `vhost` クレートなどを使い、virtio経由でゼロコピー通信を行えば、ネットワーク遅延はほぼゼロになります。 + +#### **B. API Gateway** + + * **プロトコル:** gRPC (Protobuf) を基本としつつ、パフォーマンスが必要な箇所には **FlatBuffers** や **rkyv** (Rust向けのゼロコピーデシリアライゼーション) を使用。 + +----- + +### 4\.
S3 & SNS 向けのデータモデリング戦略 + +KVSの上に構築する場合、キーの設計がパフォーマンスの9割を決めます。 + +#### **Case 1: S3 Compatible Object Storage (Metadata)** + +S3の「フォルダのような階層構造」をフラットなKVSで表現し、かつ高速にリストアップする必要があります。 + + * **Key Design:** + `[BucketID] [0x00] [ParentPathHash] [0x00] [FileName]` + * **解説:** + * `ParentPathHash` を含めることで、特定のフォルダ直下のファイル一覧(`ls`コマンド)が、KVS上の連続した領域へのシーケンシャルアクセス(Range Scan)になり、爆速化します。 + * FoundationDBのDirectory Layerの考え方が参考になります。 + +#### **Case 2: SNS Backend (Timeline/Events)** + + * **Key Design:** + `[UserID] [ReverseTimestamp] [EventID]` + * **解説:** + * タイムスタンプを逆順(最大値 - 現在時刻)にすることで、最新の投稿を取得する際に「先頭から読むだけ」になり、RocksDBのキャッシュ効率が最大化されます。 + +----- + +### 開発ロードマップの提案 + +まず「Simple by default」を実現するための最小構成から始めるのが良いでしょう。 + +1. **Phase 1: "Local" Distributed DB** + * `raft-rs` と `rocksdb` を組み合わせて、ローカルの複数プロセスで動く「RaftベースのKVS」を作る。 + * この時点で `StorageTrait` を定義し、APIを確定させる。 +2. **Phase 2: Network & Protocol** + * `tonic` (gRPC) を導入し、クライアントからの `Put`, `Get`, `Scan` を実装。 +3. **Phase 3: S3 Metadata Implementation** + * Phase 2のKVSを使って、S3の `ListObjects` 相当のロジックを実装し、ベンチマークを取る。 +4. **Phase 4: Optimization (VM Mesh)** + * 自作VM基盤との接続部分(共有メモリやカスタムプロトコル)の実装。 + +**次のステップ:** +特にどの部分が一番の懸念点、あるいは興味深いポイントですか? +例えば、\*\*「RustでのRaftの実装とステートマシンの設計」**について詳しく掘り下げるか、**「VMとDB間のゼロコピー通信(vhost-user等)の設計」\*\*について議論しますか? 
+ +--- + +# You Asked: + +もう一つの案として、強整合性は完全に諦めてオブジェクトストレージ側に整合性チェックを作るという案もあります。どちらが良いか比較してみてください。 + +# Gemini Responded: + +「DB側で強整合性を捨て、アプリ(オブジェクトストレージ)側で整合性を担保する」というアプローチは、Amazon Dynamoや初期のCassandra、Riakなどが採用した**「AP型(可用性・分断耐性重視)」**の設計思想です。 + +結論から言うと、S3のようなオブジェクトストレージを作る場合、**「メタデータ管理はDB側で強整合性を持たせ、実データ(Blob)は結果整合性でも良い」**とするのが現代的な正解に近いと考えられます。すべてをアプリ側で担保するのは、**S3互換としての正しさを保証する難易度が跳ね上がる**からです。 + +両者のアプローチを比較・分析します。 + +--- + +### 比較:A案「DBが強整合性を保証」 vs B案「アプリが整合性をチェック」 + +| 比較項目 | A案:DBが強整合性 (CP型) | B案:アプリがチェック (AP型) | +| :--- | :--- | :--- | +| **代表例** | FoundationDB, TiKV, CockroachDB | DynamoDB, Cassandra, Riak | +| **書き込み速度** | **中〜高** (Raft/Paxosの合意待ちが発生) | **極めて高速** (合意不要、Quorumのみ) | +| **読み込み速度** | **高速** (最新が確定している) | **中** (複数ノードから読み、アプリで比較・修復が必要) | +| **アプリの実装** | **シンプル** (DBを信じれば良い) | **極めて複雑** (競合解決、リトライ、ベクタークロック管理が必要) | +| **S3互換性** | **高い** (現在のAWS S3は強整合性) | **低い** (Read-After-Writeの保証が困難) | +| **スケーラビリティ**| シャード単位ではリニアだが、ホットスポットに弱い | 理論上無限にスケールしやすい | + + + +[Image of CAP theorem consistency trade-offs] + + +--- + +### なぜ B案(アプリ側チェック)が「S3用途」で茨の道なのか? + +一見、B案は高速で魅力的に見えますが、オブジェクトストレージ(特にメタデータ)の実装において、以下の**「3つの罠」**が待ち受けています。 + +#### 1. "Read-After-Write" の保証が極めて難しい +ユーザーが「ファイルをアップロード(Put)し、直後に一覧表示(List)する」という操作をしたとします。 +* **B案(AP型):** 書き込みが全ノードに行き渡る前にListが走ると、「ファイルが無い」と返してしまいます。これをアプリ側で防ぐには、「自分が書いた直後のデータは、必ず書き込んだノードを見に行く」などの複雑なルーティング制御(Sticky Session的なもの)が必要になります。 +* **現状:** 2020年以降、本家AWS S3は**強整合性**になりました。これに慣れたユーザーやツールは、結果整合性のストレージを使うとエラーを起こす可能性があります。 + +#### 2. CAS (Compare-And-Swap) が実装できない +「ファイルが存在しない場合のみ作成する」「バージョンがXならYに更新する」といった排他制御が必要です。 +* **罠:** DB自体が強整合性(アトミックなトランザクション)を持っていない場合、アプリ側だけで厳密な排他制御を行うことは**論理的に不可能**です(分散ロックなどの別の協調システムが必要になり、結局それがボトルネックになる)。 +* **影響:** 同じファイル名で同時にアップロードされた際、どちらが勝つか予測不能(Last Write Wins)になり、データロストのリスクがあります。 + +#### 3. 
「ディレクトリのリネーム/削除」の地獄 +S3にはディレクトリはありませんが、擬似的にフォルダとして扱います。 +「フォルダごと削除」する場合、内部的には数千〜数万のオブジェクトのメタデータを書き換える必要があります。 +* **B案(AP型):** 途中で通信が切れたり競合したりすると、「一部のファイルだけ消えて、一部は残っている(しかもゾンビデータ)」という不整合な状態になり、それを直すための「リペアバッチ」を自作する羽目になります。これは最初のCassandraの例で挙げた「運用の苦痛」そのものです。 + +--- + +### では、B案(アプリ側チェック)が輝く場所は? + +B案が完全にダメなわけではありません。**「実データ(Blob/Chunk)」**と**「SNSのタイムライン」**には最適です。 + +* **Immutableなデータ:** 一度書いたら書き換えないデータ(画像の実体など)は、競合が起きないため、B案の「投げっぱなし書き込み」で超高速に処理できます。 +* **SNSの「いいね」やログ:** 「いいね」の数が一時的にズレていても誰も困りません。ここではB案の書き込み性能が最大の武器になります。 + +--- + +### 推奨アーキテクチャ:ハイブリッド戦略 + +Rustで自作されるのであれば、どちらか一つに絞るのではなく、**「データの中身によって使い分ける」**のが最も賢い戦略です。 + +ご自身の「モジュール化」の構想を活かし、以下のように階層を分けます。 + +1. **メタデータ層 (Namespace/File Index)** -> **強整合性モード (Raft)** + * ファイル名、サイズ、権限、ディレクトリ構造。 + * データ量は小さいので、Raftのオーバーヘッドは許容範囲。 + * これにより「S3互換の挙動」と「排他制御」をシンプルに実装できます。 + +2. **ブロックストレージ層 (Object Chunks)** -> **結果整合性モード (Consistent Hashing + Replication)** + * ファイルのバイトデータそのもの。 + * Raftは使わず、Dynamoのように「N個のノードに並列書き込みし、W個成功したらOK」とする。 + * これで帯域をフルに使った爆速アップロードが可能です。 + +### 結論 + +**「メタデータ(管理情報)に関しては、DB側で強整合性を持つ(A案)」方が、トータルの開発コストと運用コストは圧倒的に安くなります。** + +アプリ側で分散システムの整合性を担保するコードを書くのは、**DBエンジンを自作するよりもデバッグが難しく、バグが出た時の被害甚大**だからです。 + +**次のステップ:** +この「メタデータは強整合性、データは結果整合性」という**分離型アーキテクチャ**の具体的なデータフロー(Putリクエストが来た時、どうやって2つの層に書き分けるか)を図解やシーケンスで整理してみましょうか? + +--- + +# You Asked: + +まあS3の部分はやろうと思えばできると思うので、とにかくKVSやDBの部分を具体的な仕様に詰めていきたいと思っています。 + +# Gemini Responded: + +承知しました。S3やSNSのバックエンドとして耐えうる、\*\*「Rust製・RocksDBベース・強整合性/結果整合性切り替え可能」\*\*な分散DBの具体的な仕様策定に入りましょう。 + +FoundationDBやTiKVのアーキテクチャをベースにしつつ、Rustのエコシステムを最大限に活かした設計案(コードレベルに近い仕様)を提案します。 + +プロジェクト名を仮に **"RustDB"** と呼びます。 + +----- + +# RustDB Architecture Specification + +全体を以下の4つのモジュール(Crate)に分割して開発することをお勧めします。 + +1. **`rdb-storage`**: ローカルストレージエンジン (RocksDB wrapper) +2. **`rdb-consensus`**: 分散合意・レプリケーション層 (Raft / Async Replication) +3. **`rdb-meta`**: クラスタ管理・スケジューリング (Placement Driver) +4. **`rdb-proxy`**: トランザクション・クエリ層 (Stateless) + +----- + +## 1\. 
`rdb-storage`: ストレージエンジン層 + +単一ノードでの読み書きを担当します。 + + * **Backend:** `rocksdb` crate (Rust bindings for RocksDB) + * **Key Design:** + * **Column Families (CF) の活用:** データを論理的に分離し、物理的なIOを最適化します。 + * `default`: 実際のデータ (Value) + * `lock`: 分散トランザクション用のロック情報 + * `write`: トランザクションのコミット情報(MVCC用) + * `raft`: Raftのログ自体を保存 + +### インターフェース定義 (Trait) + +「強整合性」と「結果整合性」を切り替えるため、ストレージへの書き込みは直接行わず、以下のTraitを通します。 + +```rust +#[async_trait] +pub trait StorageEngine: Send + Sync { + // スナップショット読み込み + async fn get(&self, key: &[u8], version: u64) -> Result<Option<Vec<u8>>>; + + // スキャン (Iterator) + async fn scan(&self, start: &[u8], end: &[u8], version: u64) -> Result<Vec<(Vec<u8>, Vec<u8>)>>; + + // アトミック書き込み (Batch) + async fn write(&self, batch: WriteBatch) -> Result<()>; +} +``` + +----- + +## 2\. `rdb-consensus`: 分散合意層 (The Core) + +ここが最も重要です。**データの断片(Region/Shard)** 単位でコンセンサスグループを作ります。 + + * **採用ライブラリ:** `raft-rs` (TiKVが作っている、プロダクションレディな実装) + * **データの分割単位:** `Region` + * デフォルトサイズ: 96MB + * 各Regionは `[StartKey, EndKey)` の範囲を担当。 + +### 整合性モードの切り替え仕様 + +設定ファイルやテーブル作成時の指定で、以下の2つのモードをRegion単位で適用します。 + +#### **Mode A: Strong Consistency (Raft)** + +S3メタデータ用。 + + * **Write:** Leaderノードにリクエスト -\> Log Replication -\> Quorum達成 -\> Apply -\> Clientへ応答。 + * **Read:** + * **ReadIndex:** Leaderが「自分がまだLeaderであること」を確認してから返す(線形化可能性の保証)。 + * **Lease Read:** 時間ベースのリースを使って、Leader確認をスキップし高速化(クロック同期依存)。 + +#### **Mode B: High Throughput (Async Replication)** + +SNSタイムライン / Blobデータ用。 + + * **Write:** \* クライアントはハッシュリング上の担当ノード(Primary)に書き込む。 + * Primaryはローカルに書いて即座にOKを返す(Ack=1)。 + * バックグラウンドでReplicaへ非同期転送。 + * **Read:** + * 最新の書き込みが見えない可能性があるが、レイテンシはディスクI/Oのみに依存。 + +----- + +## 3\. `rdb-proxy`: トランザクション層 (Transaction) + +クライアント(API Gateway)からのリクエストを受け、トランザクションを制御します。ここはステートレスです。 + + * **プロトコル:** Google Percolatorモデル(TiKVと同じ)を採用するのが、性能と実装難易度のバランスが良いです。 + * **MVCC (Multi-Version Concurrency Control) の実装:** + Keyには常に8バイトのタイムスタンプが付与されます。 + +### トランザクションフロー (S3 PutObjectの例) + +1. **Start:** Proxyが `rdb-meta` からタイムスタンプ(StartTS)を取得。 +2\.
**Prewrite:** \* 対象のKey(メタデータ)に対して、Primary Lockを書き込む。 + * データ本体も `lock` CF に仮書き込み。 + * 他のトランザクションとの衝突をチェック。 +3. **Commit:** + * 全てのPrewriteが成功したら、コミットタイムスタンプ(CommitTS)を取得。 + * Primary Lockを解除し、正式なデータとして `write` CF に記録。 +4. **Cleanup:** + * 残りのLockを非同期で解除。 + +このロジックをRustのライブラリとして実装し、gRPCサーバーに組み込みます。 + +----- + +## 4\. `rdb-meta`: メタデータ管理 (Placement Driver) + +クラスタの脳です。ステートレスなProxyに対して「どのデータがどこにあるか」を教えます。 + + * **役割:** + 1. **Timestamp Oracle (TSO):** クラスタ全体で単調増加する一意なタイムスタンプ(論理時計 + 物理時計)を発行。これがないと分散トランザクションはできません。 + 2. **Region Scheduling:** ノード間の負荷分散。あるノードが過負荷なら、RaftのLeaderを別のノードに移動させたり、データを移動させたりする指令を出します。 + * **実装:** + * これ自体も高可用性が必要なため、`etcd` を組み込むか、自身をRaftグループ(3ノード構成など)として実装します。 + +----- + +## 技術スタックとライブラリ選定 (推奨) + +ゼロから作る部分と、既存を使う部分のメリハリをつけます。 + +| コンポーネント | Rust Crate / 技術 | 理由 | +| :--- | :--- | :--- | +| **通信 (RPC)** | `tonic` (gRPC) | 高速、型安全、エコシステム最強。 | +| **通信 (Internal)** | `quinn` (QUIC) | ノード間通信のレイテンシ削減。Head-of-Line blocking回避。 | +| **非同期ランタイム** | `tokio` | 事実上の標準。io\_uring対応(`tokio-uring`)も視野に。 | +| **ストレージ** | `rocksdb` | 信頼と実績。自作は泥沼。 | +| **Raft** | `raft-rs` | TiKVの実装。RustでRaftやるならこれ一択。 | +| **シリアライズ** | `bincode` or `rkyv` | `serde_json`は遅い。内部通信はゼロコピーを目指すなら`rkyv`。 | +| **メトリクス** | `metrics` + `Prometheus` | 運用に入ったらこれが全て。最初から入れておく。 | +| **エラー処理** | `thiserror` + `anyhow` | 鉄板構成。 | + +----- + +## 開発のフェーズ分け提案 + +いきなり分散は難しいので、以下のステップで進めるのが現実的です。 + +1. **Phase 1: スタンドアロンなトランザクションKVS** + + * Raftなし。単一のRocksDBプロセスに対し、PercolatorモデルのMVCCトランザクション(Prewrite/Commit)が動くことを確認する。 + * **ゴール:** 複数のクライアントから並列に書き込んでもACIDが守られること。 + +2. **Phase 2: Raftの組み込み (Multi-Raft)** + + * データを複数のRegionに分割し、`raft-rs` を使って複製する。 + * **ゴール:** ノードを1つkillしても、データが読める・書けること。 + +3. **Phase 3: PD (Placement Driver) の実装** + + * Regionの分割(Split)と結合(Merge)、移動(Rebalance)の実装。 + * **ゴール:** ノードを追加したら自動的にデータが均されること。 + +まずは **「Phase 1: MVCCトランザクションの実装」** から設計詳細を詰めましょうか? それとも **「Phase 2: Raft周りの設計」** が気になりますか? 
+ +--- + +# You Asked: + +MVCCトランザクションについてまずは教えてください。 + +# Gemini Responded: + +MVCC (Multi-Version Concurrency Control: 多版型同時実行制御) は、\*\*「データを上書きせず、新しいバージョンとして追記する」\*\*というシンプルな原則に基づいています。これにより、「書き込み中のデータを読み取りたい」というリクエストが来ても、ロック待ちをさせずに「一つ前の確定したバージョン」を返すことができます。 + +今回は、分散KVS(TiKVやCockroachDB)でデファクトスタンダードとなっている**Google Percolatorモデル**をベースに、Rust + RocksDBで実装するための具体的な仕組みを解説します。 + +----- + +### 1\. データの物理構造 (RocksDBの中身) + +RocksDBは単なるKey-Valueストアですが、MVCCを実現するために**3つのColumn Family (CF)** を使い分けます。 +ここが理解の肝です。 + +| Column Family | 役割 | Keyの形式 | Valueの中身 | +| :--- | :--- | :--- | :--- | +| **CF\_DEFAULT** | **実データ置き場** | `[Key] + [Start_TS]` | ユーザーデータ (Raw Data) | +| **CF\_LOCK** | **仮押さえ (Lock)** | `[Key]` | Lock情報 (Primary Keyの場所, Start\_TS, TTL) | +| **CF\_WRITE** | **コミット証書** | `[Key] + [Commit_TS]` | メタデータ (`Start_TS`, 種類: Put/Delete) | + +> **ポイント:** RocksDBはKeyでソートされるため、TimestampをKeyの末尾にビッグエンディアン(降順)で付与することで、常に「最新のバージョン」が先頭に来るようにします。 + +----- + +### 2\. 書き込みトランザクションの流れ (2-Phase Commit) + +例として、`Key="user:1"` に `Value="Alice"` を書き込むケースを考えます。 + +#### Step 0: 準備 + +トランザクション開始時、TSO (Timestamp Oracle) から **Start\_TS (例: 100)** を取得します。 + +#### Step 1: Prewrite (仮書き込み) + +まず、データの実体とロック情報を書き込みます。この時点ではまだ他の人からは見えません。 + +1. **CF\_DEFAULT** にデータを保存 + * Key: `user:1` + `100` (Start\_TS) + * Value: `"Alice"` +2. **CF\_LOCK** にロックを置く + * Key: `user:1` + * Value: `{ type: Put, start_ts: 100, primary: "user:1" }` + * *もしここで既に他人のロックがあれば、書き込みは失敗(Conflict)します。* + +#### Step 2: Commit (本書き込み) + +全てのPrewriteが成功したら、TSOから **Commit\_TS (例: 110)** を取得します。 +これが「確定した」合図になります。 + +1. **CF\_WRITE** にコミット証書を書く + * Key: `user:1` + `110` (Commit\_TS) + * Value: `{ start_ts: 100, type: Put }` + * *これがあることで、「110の時点でのデータの実体は、100で書かれたものだ」と紐付きます。* +2. **CF\_LOCK** のロックを消す (Cleanup) + * Key: `user:1` + * Value: (削除) + +----- + +### 3\. 読み込みの流れ (Snapshot Read) + +読み込みリクエストには、必ず **Read\_TS (例: 115)** が指定されます。「時刻115時点の世界を見せてくれ」という意味です。 + +1. 
**ロックのチェック (CF\_LOCK)** + * `user:1` にロックが存在し、かつその `start_ts` が 115 より小さい場合、誰かが書き込み途中です。 + * → 基本はバックオフ(少し待つ)します。 +2. **最新バージョンの特定 (CF\_WRITE)** + * `CF_WRITE` をスキャンし、`Commit_TS` が 115 以下の最新のレコードを探します。 + * 見つかったレコード: Key=`user:1`+`110`, Value=`{ start_ts: 100 }` +3. **実データの取得 (CF\_DEFAULT)** + * `CF_WRITE` に書いてあった `start_ts: 100` を使い、実体を取りに行きます。 + * `CF_DEFAULT` から Key=`user:1`+`100` を Get します。 + * → `"Alice"` を返却。 + +----- + +### 4\. Rustでの実装イメージ + +非常に単純化した実装コードです。 + +```rust +use rocksdb::{DB, WriteBatch, ColumnFamilyDescriptor}; +use byteorder::{BigEndian, WriteBytesExt}; + +// Keyのエンコーディング (Key + Timestamp) +fn encode_key(key: &[u8], ts: u64) -> Vec<u8> { + let mut encoded = Vec::with_capacity(key.len() + 8); + encoded.extend_from_slice(key); + // 降順にするためにビット反転などを入れるテクニックもあるが、ここでは単純に + encoded.write_u64::<BigEndian>(u64::MAX - ts).unwrap(); + encoded +} + +struct Transaction<'a> { + db: &'a DB, + start_ts: u64, + mutations: Vec<(Vec<u8>, Vec<u8>)>, // Buffer +} + +impl<'a> Transaction<'a> { + // 1. Prewrite Phase + fn prewrite(&self) -> Result<(), Error> { + let mut batch = WriteBatch::default(); + + for (key, value) in &self.mutations { + // Check Lock (他人のロックがないか確認) + let lock_cf = self.db.cf_handle("lock").unwrap(); + if self.db.get_cf(lock_cf, key)?.is_some() { + return Err(Error::Locked); + } + + // Put Data (CF_DEFAULT) + let default_cf = self.db.cf_handle("default").unwrap(); + batch.put_cf(default_cf, encode_key(key, self.start_ts), value); + + // Put Lock (CF_LOCK) + batch.put_cf(lock_cf, key, self.start_ts.to_be_bytes()); // 簡易的なLock情報 + } + + self.db.write(batch)?; + Ok(()) + } + + // 2\.
Commit Phase + fn commit(&self, commit_ts: u64) -> Result<(), Error> { + let mut batch = WriteBatch::default(); + + for (key, _) in &self.mutations { + // Write Commit Record (CF_WRITE) + let write_cf = self.db.cf_handle("write").unwrap(); + // Valueとして「実データはstart_tsにあるよ」というポインタを入れる + batch.put_cf(write_cf, encode_key(key, commit_ts), self.start_ts.to_be_bytes()); + + // Delete Lock (CF_LOCK) + let lock_cf = self.db.cf_handle("lock").unwrap(); + batch.delete_cf(lock_cf, key); + } + + self.db.write(batch)?; + Ok(()) + } +} +``` + +### この設計のメリット・デメリット + + * **メリット:** + * **Wait-free Reads:** 読み込みはロックを取らないため、書き込み負荷が高くても読み込み速度が落ちません。 + * **Snapshot Isolation:** トランザクション開始時点の整合性が保証されます。 + * **デメリット:** + * **書き込み増幅:** 1回の論理的な書き込みに対して、RocksDBへは最低でも2回(PrewriteとCommit)の書き込みが発生します。 + * **GCが必要:** 古いバージョン(`CF_WRITE`や`CF_DEFAULT`のゴミ)が残り続けるため、定期的に古いタイムスタンプのデータを削除するガベージコレクション(Compaction Filter)の実装が必要です。 + +**次のステップ:** +このMVCCの仕組みをベースに、\*\*「分散環境で複数のノードにまたがるトランザクション(Primary Lockの概念)」**について説明しましょうか? それとも、**「GCやコンパクションをRustでどう実装するか」\*\*へ進みますか? + +--- + +# You Asked: + +このMVCCトランザクションは、結果整合性が必要な場合も、強整合性が必要な場合も同じく必要なのでしょうか? + +# Gemini Responded: + +結論から申し上げますと、**いいえ、結果整合性のモードではこの重厚なMVCCトランザクション(特にロックと2フェーズコミット)は「不要」であり、むしろ「邪魔」になります。** + +高パフォーマンスを目指す今回のDBにおいて、ここは明確に分岐させるべきポイントです。 + +なぜなら、MVCCトランザクションは\*\*「安全性(ACID)」**を買うために**「性能(レイテンシとスループット)」\*\*を犠牲にする仕組みだからです。結果整合性(SNSの投稿やBlobデータの書き込み)で求められるのは、安全性よりも圧倒的な書き込み速度です。 + +具体的な違いを整理します。 + +----- + +### モード別の書き込みパスの違い + +#### 1\. 強整合性モード (S3メタデータ用) + +**→ MVCC + 分散トランザクション (必須)** + +先ほど解説した仕組みです。 + + * **目的:** 「誰かが書き込んでいる最中は読み込ませない」「途中で失敗したらロールバックする」。 + * **コスト:** 高い。 + * 最低2回のネットワーク往復(Prewrite, Commit)。 + * 最低3回のRocksDB書き込み(Lock, Data, Write)。 + * ロック競合時の待ち時間。 + +#### 2\. 
結果整合性モード (SNS/Blobデータ用) + +**→ Raw Write + LWW (Last Write Wins)** + +こちらが、あなたが求めている「超高速」な世界です。 + + * **目的:** とにかく早くディスクに永続化し、クライアントに「OK」を返す。 + * **仕組み:** + * ロック(CF\_LOCK)も、コミット証書(CF\_WRITE)も使いません。 + * **CF\_DEFAULT に直接、一発書き込みます。** + * **コスト:** 極小。 + * 1回のネットワーク往復。 + * 1回のRocksDB書き込み。 + +----- + +### 結果整合性モードでの「整合性」の保ち方 + +MVCCを使わずにどうやって「データの新しさ」を管理するかというと、単純な**LWW(Last Write Wins:後勝ち)戦略、あるいはVector Clock**を使います。 + +#### RocksDBでの実装イメージ (結果整合性版) + +MVCCのような複雑なKeyエンコーディングはせず、単純にタイムスタンプを埋め込むだけで十分です。 + +```rust +// 結果整合性モードの書き込み処理 +async fn raw_put(&self, key: &[u8], value: &[u8]) -> Result<()> { + // 1. タイムスタンプを取得 (サーバー時刻でOK、厳密なTSOは不要) + let ts = current_server_time(); + + // 2. 既存データより新しいかチェック (Read-modify-writeになってしまうが、Blobなら追記のみ等の最適化が可能) + // ※SNSのような追記型ならこのチェックすら不要で、単に新しいKey(UUID等)でInsertするだけ + + // 3. RocksDBに直接書き込み + // Key: [User:1][Timeline][TS] <- TSを含めることでソートされる + // Value: { body: "Hello World" } + self.db.put(encode_key(key, ts), value)?; + + Ok(()) +} +``` + +----- + +### アーキテクチャへの反映:パスの分離 + +RustのTraitシステムを使って、この2つのパスをきれいに切り替える設計をお勧めします。 + +```rust +// エンジンが持つべき機能の抽象化 +#[async_trait] +pub trait StorageEngine { + // 共通: 読み込み + async fn get(&self, key: &[u8]) -> Result>>; +} + +// 強整合性が必要な場面(S3メタデータ) +pub struct TransactionalEngine { + db: Arc, +} + +impl TransactionalEngine { + // 重厚なトランザクションを開始する + pub async fn begin_transaction(&self) -> Transaction { + Transaction::new(&self.db) + } +} + +// 結果整合性でいい場面(SNS/Blob) +pub struct RawEngine { + db: Arc, +} + +impl RawEngine { + // いきなり書き込む (Atomic性は単一Keyのみ保証) + pub async fn put_fast(&self, key: &[u8], value: &[u8]) -> Result<()> { + // ロックなし、検証なしの直接書き込み + self.db.put(key, value) + } +} +``` + +### どちらを選ぶかの判断基準 + +| 機能 | MVCC (強整合性) | Raw Write (結果整合性) | +| :--- | :--- | :--- | +| **Atomic更新** | 複数の行を同時にアトミック更新できる | **単一行(キー)のみ**アトミック | +| **衝突検知** | 先に書いた人が勝つ(後からはエラー) | **後で書いた人が勝つ**(上書き) | +| **読み込み** | 常に確定した最新データを保証 | 複製遅延により古いデータが見える可能性あり | +| **用途** | ユーザー残高、ファイル一覧、権限管理 | 画像データ、ログ、SNSタイムライン、「いいね」数 | + 
+### 結論 + +S3的なオブジェクトストレージを作る場合、以下のように完全に使い分けます。 + +1. **Buckets / Objects Table (メタデータ):** + * **MVCC必須。** ディレクトリ構造の一貫性を保つため。 +2. **Object Blocks (データ実体):** + * **MVCC不要。** ここにトランザクションを使うと遅すぎて使い物になりません。 + * データブロックは Immutable(不変)として扱い、「同じキー(ハッシュ)なら中身は同じ」と割り切るか、Raw Writeでガンガン書き込みます。 + +**次のステップ:** +トランザクションの仕組みの違いが整理できたところで、次は\*\*「この2つのモードを支える、分散合意(Raft)と非同期レプリケーションの切り替え」\*\*という、ネットワーク層の話に進みましょうか? + +--- + +# You Asked: + +ええ、そこを良く考えたいですね。 + +# Gemini Responded: + +「強整合性(Raft)」と「結果整合性(非同期レプリケーション)」を、同一のクラスタ、同一のバイナリ内で共存させるための、ネットワークとレプリケーション層の設計を掘り下げます。 + +これを実現する鍵は、**「Multi-Raft」アーキテクチャを拡張し、「Region(データの断片)」ごとにレプリケーション戦略をPluggableにする**ことです。 + +----- + +### 1\. 基本単位:「Region」の抽象化 + +TiKVやCockroachDBと同様、データを巨大な1枚岩ではなく、96MB〜数GB程度の\*\*「Region(シャード)」\*\*に分割します。 +このRegionが、レプリケーションの最小単位になります。 + +Rustのコードレベルでは、Regionは以下のような Enum で表現され、それぞれ異なる「ステートマシン」を持ちます。 + +```rust +enum RegionReplicationStrategy { + // Mode A: 強整合性 (S3メタデータ用) + RaftGroup { + leader_id: NodeId, + peers: Vec, + raft_log: RaftLog, // raft-rs state + }, + // Mode B: 結果整合性 (Blob/SNS用) + AsyncPrimaryBackup { + primary_node: NodeId, + backup_nodes: Vec, + pending_queue: VecDeque, // 未送信データ + }, +} +``` + +この設計により、\*\*「あるテーブル(キー範囲)はRaftでガチガチに守る」「あるテーブルは非同期で爆速で流す」\*\*という混在が可能になります。 + +----- + +### 2\. Mode A: 強整合性 (Multi-Raft) の設計 + +メタデータ管理用です。ここは `raft-rs` を素直に使いますが、パフォーマンスのために一工夫必要です。 + + * **仕組み:** + * 1つのノード上で、数百〜数千のRaftグループ(Region)が同時に動きます。 + * 各RegionのLeaderだけが書き込みを受け付けます。 + * **ネットワークの課題:** + * 単純に実装すると、RaftのHeartbeatだけでネットワークがパンクします。 + * **解決策: "Batching & Pipelining"** + * **メッセージの集約:** 異なるRegion宛てのRaftメッセージ(Vote, AppendEntries)を、**宛先ノードごとに1つのTCPパケットにまとめて**送信します。 + * これにより、システムコールとネットワークオーバーヘッドを劇的に減らします。 + +### 3\. Mode B: 結果整合性 (Chain / Async Replication) の設計 + +ここがあなたのDBの「超高速」を実現する部分です。Raftの合意プロセス(過半数のAck待ち)をスキップします。 + +#### 戦略: Primary-Backup with Hinted Handoff + +S3のBlobデータやSNSログ向けに、\*\*「W=1 (Write 1)」\*\*の設定を可能にします。 + +1. 
**書き込みフロー (Put):** + + * クライアントは、Consistent Hashingで計算された **Primary Node** にデータを送ります。 + * Primary Nodeは、ディスク(WAL/MemTable)に書いた瞬間、**即座に「OK」を返します**(これが爆速の理由)。 + * **裏側で:** Primary Nodeは、Backup Node(次のハッシュ担当ノード2つなど)へ、非同期でデータを転送します。 + +2. **障害時の対応 (Hinted Handoff):** + + * もしBackup Nodeがダウンしていたら? + * Primaryは「あいつが戻ってきたら渡すデータ」として、ローカルに一時保存(Hint)しておきます。 + * 復旧を検知したら、まとめて転送します。 + * *(CassandraやRiakで使われている手法です)* + +----- + +### 4\. ネットワークトポロジーとプロトコル + +「VM基盤とメッシュ接続」という要件を活かし、通信レイヤーを最適化します。 + +#### Unified Transport Layer (gRPC vs QUIC) + +全ての通信(Raftの投票も、Blobの転送も)を一本のパイプに通します。 + + * **採用技術:** **QUIC (`quinn` crate)** を強く推奨します。 + * **理由1: 多重化 (Multiplexing)** + * Raftの通信と、巨大なBlobの転送が1つのコネクション内で混在しても、BlobのせいでRaftのHeartbeatが遅延する(Head-of-Line Blocking)のを防げます。これだけでクラスタの安定性が段違いになります。 + * **理由2: 接続維持** + * VMの再起動やネットワークの瞬断に強いです。 + +#### ゼロコピー転送の設計 (Rust特有) + +Blobデータ(数MB〜数GB)を転送する際、シリアライズ/デシリアライズのコストがCPUを食いつぶします。 + + * **Raftメッセージ:** Protobuf等でしっかり構造化。 + * **Blobデータ:** \*\*「ヘッダだけ構造化し、ボディは生バイト列」\*\*として扱います。 + * Rustの `Bytes` クレートを使い、ネットワークバッファからディスク書き込みまで、メモリコピーなし(Zero-Copy)でスルーパスさせます。 + + + +```rust +// ネットワークメッセージのイメージ +struct NetworkPacket { + header: Header, // RegionID, MessageType (Raft or Blob), etc. + payload: Bytes, // 生データ(ここをコピーしない!) +} +``` + +----- + +### 5\. 全体のデータフロー図解 + +```mermaid +sequenceDiagram + participant Client + participant Node A (Primary/Leader) + participant Node B (Follower/Backup) + participant Node C (Follower/Backup) + + Note over Client, Node C: Mode A: 強整合性 (Raft) + Client->>Node A: Put(Meta) + Node A->>Node B: AppendEntries + Node A->>Node C: AppendEntries + Node B-->>Node A: Ack + Node C-->>Node A: Ack + Note right of Node A: 過半数のAckでCommit + Node A->>Client: OK + + Note over Client, Node C: Mode B: 結果整合性 (W=1) + Client->>Node A: Put(Blob) + Node A->>Disk: Write WAL + Node A->>Client: OK (即答) + par Async Replication + Node A->>Node B: Stream Data + Node A->>Node C: Stream Data + end +``` + +### 開発の落とし穴と対策 + +1. 
**「スプリットブレイン」の扱い:** + + * Mode A (Raft) では起きませんが、Mode Bでは「ネットワーク分断時に、別々のノードで同じキーに書き込まれる」ことが起きます。 + * **対策:** User Contextにもある通り、SNSなどの用途なら\*\*「LWW (Last Write Wins: タイムスタンプが新しい方を正とする)」\*\*で割り切るのが、実装コスト対効果で最強です。Vector Clockは実装が複雑になりすぎる傾向があります。 + +2. **帯域制御 (Rate Limiting):** + + * Mode Bのバックグラウンド転送がネットワーク帯域を使い切ると、Mode AのRaftがタイムアウトしてリーダー交代が頻発します。 + * **対策:** トークンバケットアルゴリズム(`governor` crate等)を導入し、\*\*「バックグラウンド転送は帯域の70%まで」\*\*といった制限をかけられるようにします。 + +**次のステップ:** +ネットワーク層の戦略が見えてきました。次は、これを制御するための頭脳である\*\*「Placement Driver (PD)」**、つまり**「どのデータを、どのノード(Region)に配置し、いつ分割するか」を決めるロジック\*\*について設計しますか? + +--- + +# You Asked: + +はい。お願いします。 + +# Gemini Responded: + +**Placement Driver (PD)** は、クラスタ全体の「CEO(最高経営責任者)」です。データノード(Storage Node)は現場の作業員であり、PDの指示に従ってデータを移動したり、リーダーを交代したりします。 + +FoundationDBやTiKVにおいて、このPDの賢さがデータベース全体の性能と安定性を決定づけます。 + +今回の「ハイブリッド整合性(S3メタデータ + Blob)」という特殊要件を満たすための、RustによるPDの設計仕様を提案します。 + +----- + +### PDの主要な役割 + +PDには大きく分けて3つの仕事があります。 + +1. **Timestamp Oracle (TSO):** 全ノードに一意で単調増加する時刻を配る(MVCC用)。 +2. **Cluster Topology:** どのノードが生きていて、どのRegion(データ断片)を持っているかを把握する。 +3. **Scheduling:** 「このノードはもうディスクがいっぱいだから、データをあっちへ動かせ」という命令を出す。 + +----- + +### 1\. Timestamp Oracle (TSO) の設計 + +MVCCトランザクション(強整合性モード)では、このTSOの性能が書き込み性能の上限(ボトルネック)になり得ます。 + + * **アルゴリズム:** **ハイブリッド論理時計 (Hybrid Logical Clock)** + * 物理時刻(ミリ秒)の下位ビットに、論理カウンタ(シーケンス番号)を足した `u64` を発行します。 + * `[ Physical Time (48bit) ] + [ Logical Counter (16bit) ]` + * **Rustでの実装:** + * PDのリーダーだけがメモリ上で `AtomicU64` をインクリメントして返します。 + * **最適化:** クライアント(Proxy)は毎回PDに問い合わせるのではなく、**「今後3秒分のタイムスタンプ」をバッチで予約**して持っていくようなキャッシュ戦略をとることも可能です(厳密性が少し緩むため、通常はバッチリクエストで対応します)。 + +----- + +### 2\. ハートビートとメタデータ収集 + +PDは能動的に監視に行くのではなく、\*\*データノードからの定期報告(Heartbeat)\*\*を受け取って状態を更新します。 + +#### A. `StoreHeartbeat` (ノード単位の報告: 10秒毎) + + * 「私は生きています」 + * **Payload:** ディスク使用率、CPU負荷、ネットワーク帯域使用量。 + * **PDの判断:** 一定期間来なければ「ダウン」と判定し、そのノードにあるデータの再複製(Repair)を開始します。 + +#### B. 
`RegionHeartbeat` (データ断片単位の報告: 60秒毎 or 変化時) + + * 「私の担当するRegion Xの状態です」 + * **Payload:** + * Leaderは誰か? Peer(複製先)は誰か? + * データサイズ(MB)、書き込みキー数。 + * **現在のモード:** `Mode::Strong` (Raft) か `Mode::Async` (Blob) か。 + * **重要:** ここで「モード」を報告させることで、PDはスケジューリング戦略を切り替えます。 + +----- + +### 3\. スケジューリング・ロジック (The Brain) + +ここが腕の見せ所です。Rustの構造体で「スケジューラ」を定義し、プラグイン可能な設計にします。 + +#### 戦略A: S3メタデータ用 (Raft Regions) + +CPU負荷の分散を最優先します。 + + * **Leader Balance:** 全ノードで「Raftのリーダー」の数が均等になるようにします。リーダーには書き込みと読み込み(ReadIndex)が集中するためです。 + * **Region Merge:** メタデータ削除等で空になったRegionを見つけ、隣接するRegionと結合させてオーバーヘッドを減らします。 + +#### 戦略B: Blob/SNSデータ用 (Async Regions) + +ディスク容量とI/O帯域の分散を最優先します。 + + * **Disk Usage Balance:** 「ディスク使用率が80%を超えたノード」から、「20%のノード」へ、巨大なBlobデータを強制移動(Move)させます。 + * **Cold Data Rebalancing:** アクセス頻度の低い(古い)Blobデータを、安価なHDDノード群へ移動させる(ティアリング)ロジックもここに組み込めます。 + +#### Rustでの実装イメージ + +```rust +trait Scheduler { + fn schedule(&self, cluster: &ClusterInfo) -> Option; +} + +struct DiskBalanceScheduler; +impl Scheduler for DiskBalanceScheduler { + fn schedule(&self, cluster: &ClusterInfo) -> Option { + // 1. 最もディスクを使っているノードを探す + let source_node = cluster.get_max_disk_usage_node(); + // 2. 最も空いているノードを探す + let target_node = cluster.get_min_disk_usage_node(); + + if source_node.usage - target_node.usage > THRESHOLD { + // 3. 移動命令(Operator)を生成 + return Some(Operator::MoveRegion { + region_id: select_region(source_node), + from: source_node.id, + to: target_node.id + }); + } + None + } +} +``` + +----- + +### 4\. PD自体の高可用性 (HA) + +PDが死ぬとシステム全体が停止するため、PD自体も3ノードでクラスタを組みます。 + + * **メタデータの保存:** + * ここでも `sled` (Rust製組み込みDB) や `rocksdb` を使い、その上に `raft-rs` を乗せて、PD自身のメタデータ(Regionの位置情報など)を複製します。 + * `etcd` を外部プロセスとして使うのが楽ですが、\*\*「Rust製の単一バイナリ」\*\*にこだわるなら、PD内部に「メタデータ専用のRaftグループ」を持つのが美しい設計です。 + +----- + +### 5\. 
VMメッシュ接続との連携 (Topology Awareness) + +ユーザーコンテキストにある「VM基盤とのメッシュ接続」を活かすため、PDに**物理トポロジー**を理解させます。 + + * **Labels (ラベル機能):** + * 各ノード起動時に `zone=us-east-1a`, `rack=rack-1`, `host=vm-host-5` といったラベルを付与してPDに報告させます。 + * **Isolation Rule (隔離ルール):** + * 「同じ `host` ラベルを持つノードには、絶対に同じRegionのレプリカを置かない」という制約をスケジューラに追加します。 + * これにより、物理ホストが1台死んでも、その上で動いている複数のVM(DBノード)が全滅しても、データロストを防げます。 + +----- + +### 開発のステップ + +PDは複雑になりがちなので、以下の順序で作るのが現実的です。 + +1. **Mock PD:** + * 固定のタイムスタンプを返し、スケジューリングを一切しない「ダミーPD」を作る。これを使ってデータノード(Storage)の開発を進める。 +2. **Basic TSO & ID Allocator:** + * 正しいタイムスタンプと、ユニークなID(RegionID, NodeID)を発行する機能だけ実装する。 +3. **Minimal Membership:** + * ノードのJoinを受け付け、リストとして保持する機能。 +4. **Simple Scheduling:** + * 「ディスクがいっぱいになったら移動」というルールを1つだけ実装する。 + +**次のステップ:** +これで「脳(PD)」と「体(Storage)」と「神経(Network)」の設計が出揃いました。 +次は、これらを動かすための\*\*「クライアントSDK(Rust製ライブラリ)」**、つまりアプリ開発者が最初に触る部分のインターフェースを設計しませんか? それとも**「S3互換APIレイヤー」\*\*の具体的な実装に入りますか? + +--- + +# You Asked: + +クライアントSDKをどうすべきかは気になりますね。 + +# Gemini Responded: + +クライアントSDKは、開発者(あなた自身も含め)がこのデータベースを「使いやすい」と感じるか、「使いにくい」と感じるかを決める最重要パーツです。また、パフォーマンスの観点では、SDKが賢く振る舞うことでサーバー側の負荷を劇的に下げることができます。 + +Rustで実装する\*\*「RustDB Client SDK」**の設計案を提案します。 +最大のポイントは、**「強整合性(Transaction)」と「結果整合性(Raw)」を、型システムを使って安全かつ明快に使い分ける\*\*点です。 + +----- + +### 1\. SDKの全体像:Smart Clientアーキテクチャ + +単にリクエストを投げるだけでなく、クライアント側で「データの場所(Region Cache)」を保持する\*\*Smart Client(シック・クライアント)\*\*として設計します。 + +これにより、毎回プロキシやPD(Placement Driver)に問い合わせるオーバーヘッドをゼロにし、\*\*アプリからデータノードへの「直結」\*\*を実現します。 + +### 2\. インターフェース設計 (Rustコード案) + +開発者が触れるAPIのイメージです。`tokio` ベースの非同期設計です。 + +#### A. 接続とクライアント生成 + +```rust +use rustdb::Client; + +// PD(管理ノード)のアドレスを指定して接続 +let client = Client::builder() + .pd_endpoints(vec!["10.0.0.1:2379", "10.0.0.2:2379"]) + .connect() + .await?; +``` + +#### B. 強整合性モード (S3メタデータ / ユーザー管理) + +ACIDトランザクションを提供します。クロージャを使うことで、\*\*「衝突時の自動リトライ」\*\*をSDK内部で隠蔽するのがモダンな設計です。 + +```rust +// キー "bucket:images" のメタデータを更新する例 +client.transaction(|txn| async move { + // 1. 
読み込み (Snapshot Read) + let key = "bucket:images"; + let meta_bytes = txn.get(key).await?; + + // 2. ロジック処理 (デシリアライズして変更) + let mut meta: BucketMeta = bincode::deserialize(&meta_bytes)?; + meta.object_count += 1; + + // 3. 書き込みバッファへ (まだ送信されない) + txn.put(key, bincode::serialize(&meta)?).await; + + // 4. コミット (ここで初めて2PCが走り、衝突なら自動リトライ) + Ok(()) +}).await?; +``` + +#### C. 結果整合性モード (SNSタイムライン / Blobデータ) + +トランザクションオブジェクトを作らず、クライアントから直接「投げっぱなし」にします。 + +```rust +// 高速書き込み (Write 1) +// 内部ではハッシュ計算 -> Primaryノード特定 -> QUICで送信 -> 即座に完了 +client.raw_put("timeline:user:123", event_data).await?; + +// 高速読み込み (Read Any) +// 最新である保証はないが、最もレイテンシの低いノードから読む +let data = client.raw_get("timeline:user:123").await?; +``` + +----- + +### 3\. SDK内部の重要機能 + +SDKの中身(`src/lib.rs`)で実装すべき、パフォーマンスの肝となる機能です。 + +#### ① Region Cache(地図のキャッシュ) + +これが「超高速」の鍵です。 + + * **仕組み:** + * クライアントは「Key `user:a` 〜 `user:z` は `Node-1` にある」という地図(Region Map)をメモリに持っておきます。 + * `client.get("user:k")` が呼ばれると、PDに聞かずに直接 `Node-1` へ通信します。 + * **無効化(Invalidation):** + * もし `Node-1` から「そのRegionはもう持ってないよ(NotLeader)」とエラーが返ってきたら、そのキャッシュを捨ててPDに問い合わせ直します。 + +#### ② Backoff & Retry(賢いリトライ) + +分散システムでは「一時的なネットワーク断」や「リーダー選出中」のエラーが日常的に起きます。 + + * 単純な `loop` ではなく、**Exponential Backoff(指数関数的バックオフ)** と **Jitter(ゆらぎ)** を入れたリトライロジックを標準装備します。これをアプリ開発者に書かせてはいけません。 + +#### ③ Connection Pooling (gRPC / QUIC) + + * `tonic` (gRPC) や `quinn` (QUIC) のチャンネル(コネクション)を内部でプールし、使い回します。 + * TCPハンドシェイクのコストを削減するため、KeepAlive設定も重要です。 + +----- + +### 4\. SQLレイヤーとの統合イメージ + +「SQLのような高度なクエリ」を乗せる場合、このSDKはどう使われるでしょうか? +SQLエンジン(プランナー)がこのSDKの利用者になります。 + +```rust +// SQL: SELECT * FROM users WHERE id > 100 LIMIT 10 + +// SQLエンジンの内部処理 +async fn execute_select(&self, plan: Plan) -> Result> { + // 1. スキャンの範囲を決定 + let start_key = encode_key(plan.table_id, 100); + let end_key = encode_key(plan.table_id, MAX); + + // 2. SDKのScan機能を使ってイテレータを取得 + // SDKは内部で複数のRegionに並列アクセスし、結果をストリームで返す + let mut stream = self.client.scan(start_key, end_key).await?; + + // 3. 
フィルタリングとリミット処理 + let mut rows = Vec::new(); + while let Some(kv) = stream.next().await { + if rows.len() >= 10 { break; } + rows.push(decode_row(kv)); + } + Ok(rows) +} +``` + +### 5\. 開発の進め方へのアドバイス + +SDKをリッチにしすぎると開発が終わらないので、フェーズを分けます。 + +1. **Phase 1: 「薄い」クライアント** + * 毎回PDに問い合わせる、キャッシュなしの実装。まずは動くことを目指す。 + * API(`transaction`, `raw_put`)の形だけ確定させる。 +2. **Phase 2: キャッシュの実装** + * `HashMap` を導入し、PDへのアクセスを減らす。 +3. **Phase 3: 非同期ストリーミングスキャン** + * `Stream` トレイトを実装し、巨大なリスト(S3のListObjects)をメモリを圧迫せずに返せるようにする。 + +**次のステップ:** +クライアントのイメージが固まったことで、システム全体の役者が揃いました。 +ここまでの議論(ストレージ、分散合意、PD、SDK)を踏まえて、\*\*「まず最初に書き始めるべき `Cargo.toml` の構成(ワークスペース構成)」**や、**「プロトタイプ第1弾として実装すべき最小機能セット」\*\*の定義に進みますか? + +--- + +# You Asked: + +このクライアントについてちょっとまだイメージできていないのですが、これが賢い制御を全てやるという認識なのですか?SQLのようなクエリに互換させるために、普通にSQLと互換性のあるクエリが飛んでくるとそれを返す部分が別にあると思っていました。 + +# Gemini Responded: + +そのご認識の通りです!私の説明が少し「SDK」の機能に寄りすぎていました。申し訳ありません。 + +**「賢い制御をやるSDK」は、SQLサーバーの中に「埋め込まれる部品(ライブラリ)」** という位置付けになります。 + +SQLを解釈して返す部分は、**「SQLゲートウェイ(またはSQLレイヤー)」** という別のサーバープロセス(あるいはモジュール)が存在します。 + +全体像を整理し直すと、以下の図のような3層構造になります。 + +----- + +### 正しいアーキテクチャ図 + +```mermaid +graph TD + subgraph "External World" + App1[Web App / CLI] + App2[BI Tool] + end + + subgraph "Layer 1: Interface Gateways (あなたが想定している部分)" + SQL_GW[SQL Gateway Node] + S3_GW[S3 Compatible Gateway] + end + + subgraph "Layer 2: RustDB Client SDK (賢い部品)" + SDK1[SDK (Embedded in SQL GW)] + SDK2[SDK (Embedded in S3 GW)] + end + + subgraph "Layer 3: Distributed Storage Cluster" + Node1[Storage Node 1] + Node2[Storage Node 2] + Node3[Storage Node 3] + PD[Placement Driver] + end + + App1 -- "SQL (Port 5432)" --> SQL_GW + App2 -- "S3 API (HTTP)" --> S3_GW + + SQL_GW -- "Parse & Plan" --> SDK1 + S3_GW -- "Parse XML/JSON" --> SDK2 + + SDK1 -- "Get/Put (KV)" --> Node1 + SDK1 -- "Get/Put (KV)" --> Node2 + SDK2 -- "Get/Put (KV)" --> Node3 + + SDK1 -.-> PD +``` + +----- + +### 各レイヤーの役割分担 + +ここで、それぞれの役割を明確にします。 + +#### 1\. 
Layer 1: SQL Gateway(SQL互換レイヤー) + +ここが、あなたが仰っていた\*\*「SQLのようなクエリが飛んでくるとそれを返す部分」\*\*です。 + + * **役割:** + * **プロトコル受信:** PostgreSQL (`pgwire`) や MySQL のプロトコルで通信を受け付けます。 + * **SQLパース:** `SELECT * FROM users WHERE id = 1` という文字列を分解します。 + * **クエリプランニング:** 「これはインデックスを使ったほうが速いな」と考え、実行計画を立てます。 + * **実行 (Executor):** 実際にデータを取るために、**Layer 2 の SDK を呼び出します**。 + * **ここで行うこと:** 「Key-Valueへの変換」です。「UsersテーブルのID=1」を、「Key: `t_users_1` を取ってこい」という命令に変換して、SDKに渡します。 + +#### 2\. Layer 2: Client SDK(賢い制御担当) + +ここは、SQLゲートウェイの**プロセス内部で動くライブラリ**です。 + + * **役割:** + * **ルーティング:** SQL層から「Key: `t_users_1` をくれ」と言われた時、「そのキーは今、Storage Node 3にある」と即座に判断します(キャッシュ活用)。 + * **通信の最適化:** 適切なノードへ、gRPCやQUICでリクエストを飛ばします。 + * **リトライ・リカバリ:** 通信エラーが起きても、SQL層にエラーを返す前に、こっそり別のノードへリトライして成功させます。 + * **なぜここが「賢い」必要があるか:** + SQL層の開発者が、「分散システムの複雑さ(どのデータがどこにあるか、誰がリーダーか)」を気にせず、\*\*「単なるローカルKVSを使っている感覚」\*\*で実装できるようにするためです。 + +#### 3\. Layer 3: Storage Nodes + + * **役割:** 言われたKey-Valueをディスクに読み書きするだけです。 + +----- + +### なぜこの構成にするのか?(S3とSQLの共存) + +この構成の最大のメリットは、**S3互換ゲートウェイも同じSDKを使って作れる**という点です。 + + * **SQLゲートウェイ:** SQLをパースして、SDKを叩く。 + * **S3ゲートウェイ:** HTTP/XMLをパースして、SDKを叩く。 + * **SNSバックエンド:** アプリから直接 SDK を叩く(最速)。 + +このように、**「分散DBの難しい部分」をSDKに封じ込める**ことで、その上に色々なインターフェース(SQL, S3, Redis互換など)を簡単に乗せられるようになります。これが冒頭で仰っていた「モジュール化」の真骨頂です。 + +### 結論 + + * **あなたの認識:** 正しいです。「SQLを受け付けて返す別プロセス」が必要です。 + * **SDKの正体:** そのSQLプロセスが、裏側でデータを取りに行くために使う\*\*「高性能なドライバー」\*\*のことです。 + +この「SQLゲートウェイ」部分(パーサーやプランナー)の実装について、もう少し詳しく掘り下げますか? それとも、まずは下の足場(StorageとSDK)を固める方針でしょうか? + +--- + +# You Asked: + +したの足場が、ゲートウェイ部分を作れるように強固に作られていれば大丈夫だと思っています。そういえば、劔という日本製のRDBMSがあります。書き込み、読み込みともにロックフリーで超高速ということを謳っていますが、これは一体どのような仕組みでなされているのでしょうか?もしできれば取り込みたいところです(ただ、これ自体はJava製なので使いたくはない)。 + +https://www.tsurugidb.com/ + +# Gemini Responded: + +**劔 (Tsurugi)** に目をつけられるとは、非常にお目が高いです! 
+劔は、従来のDB(PostgreSQLなど)が抱えていた「コア数が増えても性能が上がらない(スケーラビリティの限界)」という課題を解決するために、日本のNEDOプロジェクトで開発された、まさに**「メニーコア時代のモンスターエンジン」**です。 + +その「ロックフリーで超高速」の正体は、魔法ではなく、**「Shirakami(白神)」**と呼ばれるトランザクションエンジンと、**「Limestone」**と呼ばれるログ管理機構にあります。 + +今回のRustDBに取り入れられる要素は大いにあります。その仕組みを紐解き、RustDBへの移植案を提案します。 + +--- + +### 1. 劔の「速さ」の正体:3つのコア技術 + +劔が「ロックフリー」と謳っているのは、**「読み込み時にロックを取らない(Snapshot Isolation)」**だけでなく、書き込み時にも**「楽観的並行性制御 (OCC) の高度な最適化」**を行っているからです。 + +#### ① "Epoch" ベースのメモリ管理とコミット +劔は時間を「エポック」という数ミリ秒単位の区切りで管理します。 + +* **仕組み:** + * トランザクションID発行やガベージコレクション(GC)を、個別のロックではなく、この「エポック」の切り替わりでまとめて行います。 + * ログ(WAL)のディスク書き込みも、1件ずつ `fsync` するのではなく、**「エポック内のログをまとめてドン(Group Commit)」**します。 +* **RustDBへの応用:** + * これはRaftのログ書き込みに即応用できます。 + * リクエストが来るたびに `fsync` するのではなく、**「5ms待って、その間に来たリクエストをまとめて1回のRaftログとして書き込む(Batching)」**ことで、スループットが劇的に向上します。 + +#### ② 高度な OCC (Optimistic Concurrency Control) +従来のDB(2PL: Two-Phase Locking)は、「書き込む前にロックして、他の人を待たせる」方式でした。 +劔(Shirakami)は**OCC**を採用しています。 + +* **仕組み:** + 1. **Read/Compute:** データを読み込み、メモリ上で計算する(この間、ロックは一切取らない)。 + 2. **Validate:**コミット直前に「自分が読んだデータが、計算中に他の誰かに書き換えられていないか?」をチェックする。 + 3. **Write:** 問題なければ一瞬で書き込む。ダメならリトライ。 +* **特徴:** ロック待ちが発生しないため、CPUが常に全速力で動き続けられます。 + +#### ③ "Short" と "Long" トランザクションの分離 +これが劔の最大の発明です。OCCの弱点である「長いバッチ処理(Long TX)が、短い書き込み(Short TX)のせいで何度も失敗して飢える」という問題を解決しました。 + +* **仕組み:** + * **Long TX (バッチ):** 開始時に「これからこのエリアを触るぞ」と宣言(予約)させます。 + * **Short TX (Web):** 予約エリアには触れません。 + * これにより、バッチ処理とWebリクエストが互いに邪魔せず、ロックフリーで共存します。 + +--- + +### 2. RustDB への取り込み案 + +劔は「インメモリ・シングルノード」での最適化がメインですが、分散DBであるRustDBでも、その**トランザクション管理の思想**は借用できます。 + +現在想定している **Percolatorモデル(ロックあり)** と **劔モデル(OCC)** を比較し、導入案を示します。 + +#### 案A: S3メタデータ(ディレクトリ構造)への応用 +S3のメタデータ操作(PutObjectなど)は、**「同じキーに対する競合は少ないが、スループットは極限まで欲しい」**という特性があります。 +ここで、Percolatorの「ロックを書いて消す」コストは無駄になりがちです。 + +**→ RustDB版 OCCの実装アプローチ** + +1. **Read Phase:** + * クライアントSDKはデータを読み込み、バージョン(TS)を覚える。ロックはしない。 +2. **Buffer:** + * 変更内容をクライアント側のメモリに貯める。 +3. 
**Commit Phase (Single CAS):** + * 書き込み時、サーバー(Storage Node)に対して**「Compare-And-Swap (CAS)」**命令を送る。 + * 命令: `「もし Key A のバージョンが X なら、Y に更新してくれ。違ったらエラーで返して」` + * RocksDBには `MergeOperator` や `TransactionDB` という機能があり、これをアトミックに行えます。 + +これなら、ロックファイル(CF_LOCK)を作る `Prewrite` フェーズを省略でき、**通信回数が半分(1 RTT)**になります。 + +#### 案B: "Epoch" ベースの遅延永続化 +結果整合性モード(SNS等)において、劔の **Limestone (ログエンジン)** の思想を取り入れます。 + +* **実装:** + * `rdb-storage` 層に「書き込みバッファ(Ring Buffer)」を用意します。 + * `put` リクエストは、メモリ上のバッファに書いて即座に `Ok` を返します。 + * バックグラウンドのスレッドが、**数ミリ秒ごとにバッファを切り替え(Epoch Rotate)**、まとめてディスクに `write` & `fsync` します。 +* **効果:** + * I/O回数が `1/100` 以下になり、SSDの寿命も延び、性能は爆上がりします。 + * (ただし、電源断で数ミリ秒のデータロストは許容する設定になります) + +### 結論:何を取り入れるべきか? + +劔(Java実装)そのものを組み込むのではなく、**「設計思想」**を以下のようにRustDBのコードに落とし込むのがベストです。 + +1. **S3メタデータ処理には OCC (Optimistic Concurrency Control) を採用する** + * Percolator(悲観的ロックに近い)ではなく、CASベースのコミット処理にすることで、ロック管理のオーバーヘッドを消す。 + * 「競合したらSDKが自動でやり直す」という責務をクライアントSDKに持たせる。 + +2. **ログ書き込みに "Group Commit" を徹底する** + * Raftのログや、結果整合性モードの書き込みにおいて、リクエスト毎のI/Oを禁止し、必ずバッチ化する。 + +これで、RocksDBを使いながらも、劔が目指したような「ロック待ちの少ない、CPUキャッシュ効率の良い」DBに近づけることができます。 + +**次のステップ:** +トランザクション方式として、より高速な **「OCC (Optimistic) パターンでのAPI設計」** に舵を切りますか? +それとも、まずは堅実に **「Percolator (Pessimistic) パターン」** で作り、後からOCCを選べるようにしますか?(S3互換ならOCCの方が相性が良い可能性が高いです) + +--- + +# You Asked: + +OCCパターンにしたいところです。 + +# Gemini Responded: + +良い判断です。S3のような「オブジェクトストレージのメタデータ管理」において、**OCC(楽観的並行性制御)は最強の選択肢**です。 + +なぜなら、S3の操作(`PutObject`や`DeleteObject`)は「特定のファイル」に対する操作であり、**「同じファイル名のファイルを、1マイクロ秒の差で同時にアップロードし合う」ような激しい競合は、現実にはほぼ発生しない**からです。 + +競合が起きないなら、ロック(事前の場所取り)はただの無駄です。 +「劔」の思想を取り入れ、RustDBを\*\*「CAS (Compare-And-Swap) ベースの超高速エンジン」\*\*にするための設計仕様を提案します。 + +----- + +### RustDB-OCC Architecture + +これまでの重厚な `Prewrite` / `Commit` の2往復を廃止し、\*\*「1往復 (1 RTT)」\*\*でトランザクションを完了させます。 + +[Image of optimistic concurrency control transaction flow] + +#### 1\. 
Core Concept: "CAS" over Raft + +Raftのログとして流すコマンドを、単純な `Put` ではなく、条件付き書き込みである **`CompareAndSwap`** にします。 + + * **サーバー側のロジック:** + 「現在DBにある `Key` のバージョンが `expected_ver` と一致するなら、`new_value` に書き換える。違うならエラーを返す」 + これだけです。非常にシンプルで、CPU命令のように高速です。 + +#### 2\. データ構造の簡素化 (RocksDB) + +Percolatorモデルで必要だった `CF_LOCK`(ロック情報)は**不要**です。 +データ本体とバージョンを管理するだけで済みます。 + + * **CF\_DEFAULT (Data):** + * Key: `[TableID] [Key_Bytes]` + * Value: `[Metadata_Bytes] [Version(u64)]` + * ※以前のようにKeyにTimestampを含める必要すらありません(最新だけ持つなら)。履歴を持つなら含めますが、OCCなら「最新値」との比較がメインになります。 + +----- + +### 具体的な処理フロー (S3 PutObject の例) + +ユーザーが `bucket-a/image.jpg` をアップロードする場合のSDKとサーバーの動きです。 + +#### Step 1: Read (SDK) + +まず、現在の状態を確認します。 + + * SDK -\> Server: `Get("bucket-a/image.jpg")` + * Server -\> SDK: + * データなし (None) の場合: `Version = 0` + * データありの場合: `Version = 105` + +#### Step 2: Local Compute (SDK) + +SDK内部で書き込みデータを準備します。サーバーへの通信は発生しません。 + + * 「新しいメタデータを作成。サイズは5MB...」 + +#### Step 3: CAS Commit (SDK -\> Server) + +SDKは、「私が読んだ時はバージョン0(存在しない)だったので、新規作成します」というリクエストを投げます。 + +```rust +// ネットワークリクエストのイメージ +struct CasRequest { + key: Vec, + expected_version: u64, // 0 なら「新規作成」を意味する + new_value: Vec, +} +``` + +#### Step 4: Validation & Apply (Server / Raft) + +Raftリーダーはこのリクエストを受け取ると、ログに複製し、各ノードで適用(Apply)しようとします。 + + * **成功ケース:** + + * DB上の現在のバージョンが `0` である。 + * \-\> 書き込み実行。バージョンを `1` (またはTSOの時刻) に更新。 + * \-\> SDKに `Ok` を返す。 + + * **失敗ケース (競合発生):** + + * Step 1 と Step 3 の間に、別の誰かがアップロードを完了させており、バージョンが `108` になっていた。 + * \-\> `0 != 108` なので書き込み拒否。 + * \-\> SDKに `Error::Conflict { current: 108 }` を返す。 + +#### Step 5: Retry (SDK) + +エラーを受け取ったSDKは、\*\*「賢いクライアント」\*\*の本領を発揮します。 +ユーザーにエラーを返さず、内部でリトライします。 + + * 「おっと、誰かが先に書いたのか。じゃあバージョン `108` をベースにもう一度試そう(あるいは上書き設定ならそのまま上書きしよう)」 + +----- + +### Rustでの実装イメージ + +#### 1\. 
RPCメッセージ定義 (`rdb-proto`)
+
+```protobuf
+// gRPCの定義イメージ
+message CasRequest {
+    bytes key = 1;
+    uint64 expected_version = 2;
+    bytes value = 3;
+}
+
+message CasResponse {
+    bool success = 1;
+    uint64 current_version = 2; // 失敗時に最新バージョンを教える
+}
+```
+
+#### 2\. ストレージエンジンの実装 (`rdb-storage`)
+
+RocksDBの `Merge` 機能を使うか、単純に `Get` -\> `Put` をアトミックに行います。RocksDBの `TransactionDB` を使うと楽です。
+
+```rust
+impl StorageEngine {
+    pub fn compare_and_swap(&self, key: &[u8], expected: u64, val: &[u8]) -> Result<u64, Error> {
+        // RocksDBの楽観的トランザクションを開始
+        let txn = self.db.transaction();
+
+        // 1. 最新値を読む (Snapshot Read)
+        let current_val = txn.get_for_update(key, true)?;
+        let current_ver = extract_version(&current_val);
+
+        // 2. 検証
+        if current_ver != expected {
+            return Err(Error::Conflict(current_ver));
+        }
+
+        // 3. 書き込み (メモリバッファへ)
+        let new_version = generate_new_version();
+        txn.put(key, append_version(val, new_version))?;
+
+        // 4. コミット (ここでRocksDB内部で最終チェックが行われる)
+        txn.commit()?;
+
+        Ok(new_version)
+    }
+}
+```
+
+-----
+
+### 「劔」要素の取り込み: Group Commit & Epoch
+
+さらにパフォーマンスを上げるために、劔のアイデアをここで注入します。
+
+#### Group Commit (Raft層の最適化)
+
+OCCのリクエストは非常に軽量なので、1件ずつRaftのログに書いて `fsync` しているとディスクI/Oがボトルネックになります。
+
+  * **実装:**
+      * Raftリーダーのスレッドは、リクエストが来てもすぐにはディスクに書きません。
+      * \*\*「数ミリ秒のバス」\*\*を待機させます。
+      * バスが満員になるか、時間が来たら、**数十〜数百個のCASリクエストをまとめて1つのRaftログエントリ**として書き込みます。
+  * **効果:**
+      * IOPS(I/O回数)の限界を超えてスループットが出せます。
+
+#### Epoch-based GC (読み込みのノンブロッキング化)
+
+「読み込みロックフリー」を徹底するため、古いバージョンのデータをいつ消すか(GC)をEpochで管理します。
+
+  * 現在処理中のEpochより古いデータは、**「どのトランザクションからも参照されていない」ことが保証された瞬間**に、別スレッドでバッサリ削除します。
+  * これにより、メインの書き込みスレッドはGCのことを忘れて走り続けられます。
+
+-----
+
+### この設計のメリット・デメリット
+
+| 項目 | OCCパターン (CAS) | 従来のPercolatorパターン |
+| :--- | :--- | :--- |
+| **通信回数** | **1 RTT** (爆速) | 2 RTT (Prewrite -\> Commit) |
+| **ロック** | **なし** (サーバー負荷が低い) | あり (ロック情報の管理コスト大) |
+| **競合時の挙動** | リトライ地獄になる可能性あり | ロック待ちで解決できる |
+| **S3適性** | **最適** (同じKeyへの同時書き込みは稀) | オーバースペック |
+| **カウンター適性** | 不向き (100人が同時にカウントアップすると99人が失敗・リトライする) | 向いている (ロックで順序制御) |
+
+### 結論 + +**RustDBは「OCC (CAS) ベース」で行きましょう。** + +想定されている用途(S3、SNS、高速イベント処理)において、Percolatorのような悲観的ロックが必要なシーンは限定的です。 +もし将来的に「カウンターのような激しい競合」が必要になれば、その時だけ使える「悲観ロックAPI」を追加すれば良いのです(Redisの `INCR` のような専用コマンドを作るなど)。 + +**次のステップ:** +これでDBのコアエンジンの仕様が決まりました。 +いよいよ具体的な開発に入るために、\*\*「Rustのワークスペース構成(ディレクトリ構成)」**と**「最初に実装する `Hello World` 的なミニマムゴール」\*\*を提示しましょうか? + +--- + +# You Asked: + +そうですね、ロックの仕様も選べるようにしたいです。全体像(アーキテクチャ)と計画、各部で開発すべきものとそれぞれのゴールをまとめて、コーディングエージェントでもわかる程度の粒度にまとめてほしいです。 + +# Gemini Responded: + +承知しました。コーディングエージェント(AIや他の開発者)に渡しても実装が進められるレベルで、**RustDB** の全体アーキテクチャ定義、ディレクトリ構成、そして開発フェーズごとの詳細仕様を定義します。 + +方針として、**「Core (Storage/Net) は強固に、Logic (Consensus/Lock) はプラガブルに」** を徹底します。 + +----- + +# RustDB Project Definition + +## 1\. ハイレベル・アーキテクチャ + +システムは以下の4つの独立したコンポーネント(マイクロサービス)で構成されます。 + +```mermaid +graph TD + UserApp[User App / Gateway] -->|Uses| SDK[rdb-client (Smart SDK)] + + subgraph Control_Plane + PD[rdb-pd (Placement Driver)] + end + + subgraph Data_Plane_Cluster + Node1[rdb-server (Node 1)] + Node2[rdb-server (Node 2)] + Node3[rdb-server (Node 3)] + end + + SDK -.->|Metadata/TSO| PD + SDK ==|Direct Access (gRPC/QUIC)|==> Node1 + PD -.->|Heartbeat/Schedule| Node1 + Node1 <-->|Raft/Replication| Node2 +``` + +1. **`rdb-client` (Smart SDK):** アプリに埋め込まれるライブラリ。シャーディング情報のキャッシュ、リトライ、トランザクション制御(OCC/Pessimisticの使い分け)を担当。 +2. **`rdb-pd` (Placement Driver):** クラスタの脳。TSO(時刻配信)、トポロジー管理、データ再配置の指令を行う。 +3. **`rdb-server` (Storage Node):** データの保存と複製。`raft-rs` による合意形成、または非同期レプリケーションを実行。 +4. **`rdb-storage` (Engine):** RocksDBのラッパー。MVCC, CAS, Raw Writeの物理的な実行を担当。 + +----- + +## 2\. 
Workspace構成 (`Cargo.toml`) + +Monorepo構成を採用します。 + +```toml +[workspace] +members = [ + "rdb-proto", # Protocol Buffers / gRPC definitions + "rdb-common", # Shared utilities (Error types, Configs) + "rdb-storage", # RocksDB wrapper & Local Storage Engine + "rdb-server", # The main Data Node binary + "rdb-pd", # Placement Driver binary + "rdb-client", # Client SDK library + "rdb-cli", # Admin CLI tool +] + +[profile.release] +lto = true +opt-level = 3 +codegen-units = 1 +``` + +----- + +## 3\. モジュール別 詳細仕様と開発ゴール + +コーディングエージェントへの指示書として使える粒度です。 + +### Module 1: `rdb-proto` (インターフェース定義) + +全ての通信規約をここで定義します。 + + * **Tech Stack:** `tonic` (gRPC), `prost` + * **Definitions (`.proto`):** + * `meta.proto`: `Region`, `Peer`, `NodeInfo` 等の型定義。 + * `pdpb.proto`: `GetTsoRequest`, `HeartbeatRequest`, `AskSplitRequest` 等。 + * `kvrpc.proto`: データ操作用。 + * `RawPutRequest`: 結果整合性用(Key, Value)。 + * `CasRequest`: OCC用(Key, Value, ExpectedVersion)。 + * `LockRequest` / `UnlockRequest`: 悲観的ロック用(将来用)。 + * **Goal:** `cargo build` でRustのコードが自動生成されること。 + +### Module 2: `rdb-storage` (ストレージエンジン) + +分散を意識せず、\*\*「単体ノードでの最強のKVS」\*\*を作ります。 + + * **Tech Stack:** `rocksdb` binding + * **Architecture:** + * **Trait `StorageEngine`:** テスト容易性のため、`RocksEngine` と `MemoryEngine` (test用) を実装。 + * **Key Features:** + 1. **Column Families:** `default` (Data), `lock` (Pessimistic Lock用), `raft` (Raft Log用). + 2. **Method `compare_and_swap(key, expected_ver, new_val)`:** + * RocksDBの `Transaction` または `MergeOperator` を使用。 + * Atomicに「読み出し -\> 比較 -\> 書き込み」を行う。 + 3. 
**Method `put_raw(key, val)`:** + * WAL最適化(Syncなし)での高速書き込み。 + * **Goal:** ユニットテストで `compare_and_swap` が正しく競合検知(バージョン不一致エラー)できること。 + +### Module 3: `rdb-server` (分散ノード・合意形成) + +最難関パートです。`rdb-storage` をラップし、ネットワーク越しの複製を行います。 + + * **Tech Stack:** `raft-rs`, `tokio`, `quinn` (Data stream) + * **Internal Structure:** + * **`RegionRouter`:** RegionID を受け取り、対応する `Peer` (Raft Group) へメッセージを配送。 + * **`RaftStore`:** `raft-rs` の駆動ループ。Tick(時計), Step(メッセージ処理), Apply(ストレージ適用) を回す。 + * **Strategy Pattern for Consistency:** + * `ConsistencyMode::Strong`: Raftの `propose` を呼ぶ。 + * `ConsistencyMode::Eventual`: ローカルの `rdb-storage` に書き込み、非同期キューに積む。 + * **Optimization:** + * **Batch System:** 複数のClientリクエストを1つのRaft Log Entryにまとめる(劔/TiKVの思想)。 + * **Goal:** + * 3つのプロセスを立ち上げ、1つをKillしてもデータが読み書きできること(Raftモード)。 + * Rawモードで書き込み、非同期で他ノードに伝播すること。 + +### Module 4: `rdb-pd` (司令塔) + +ステートレスに見えて、自身のメタデータは堅牢に守る必要があります。 + + * **Tech Stack:** `axum` (HTTP API), `sled` (Embedded DB for metadata) + * **Features:** + 1. **TSO (Timestamp Oracle):** `AtomicU64` で単調増加IDを発行。 + 2. **ID Allocator:** ノードID、RegionIDの一意な発行。 + 3. **Memberlist:** 現在生きているノードのリスト保持。 + * **Goal:** HTTP/gRPC経由で「現在時刻」と「ノード一覧」が取得できること。 + +### Module 5: `rdb-client` (SDK) + +「賢い」ロジックの集約場所です。 + + * **Features:** + 1. **Region Cache:** `BTreeMap` で「どのキーがどこにあるか」を管理。 + 2. **Smart Routing:** キャッシュを見て直接ノードへgRPC。エラーならPDへ問い合わせてキャッシュ更新。 + 3. **Transactions (Optimistic):** + * `txn.commit()` 時に自動でバージョンの不整合をチェックし、指定回数リトライするロジック。 + 4. **Transactions (Pessimistic - Option):** + * `txn.lock_for_update(key)` を呼ぶと、サーバーへ `LockRequest` を投げる実装。 + * **Goal:** + * `client.cas_put("key", "val")` のようなシンプルなコードで、裏側の分散処理が隠蔽されていること。 + +----- + +## 4\. 
開発ロードマップ(Step-by-Step Plan) + +エージェントには以下の順序でタスクを割り振ります。 + +### Phase 1: 足場とストレージ (Week 1-2) + + * [ ] Workspaceの作成と `rdb-proto` の定義完了。 + * [ ] `rdb-storage` で RocksDB をラップし、`CAS` と `Raw Put` の単体テストをパスさせる。 + * [ ] `rdb-server` のスケルトン作成(gRPCサーバーが立ち上がり、`rdb-storage` に書き込めるだけの単一ノード版)。 + +### Phase 2: PDと基本通信 (Week 3) + + * [ ] `rdb-pd` の実装。TSOの実装。 + * [ ] `rdb-server` が起動時に `rdb-pd` へRegister(登録)し、Heartbeatを送る機能の実装。 + * [ ] `rdb-client` の作成。PDからノード情報を取得して接続する機能。 + +### Phase 3: Raftの実装 (Week 4-6) + + * [ ] `rdb-server` に `raft-rs` を組み込む。 + * [ ] 固定の1 Regionだけでいいので、3ノード間でのログ複製(Leader Election, Log Replication)を成功させる。 + * [ ] Clientからの `CAS` リクエストを Raft 経由で処理するパイプラインの構築。 + +### Phase 4: マルチモードとSDKの高度化 (Week 7-8) + + * [ ] `rdb-server` に「結果整合性モード(Async Replication)」の分岐を追加。 + * [ ] `rdb-client` に「Region Cache」と「リトライロジック」を実装。 + * [ ] **ロック仕様の追加:** ここで初めて `LockRequest` (悲観的ロック) の処理を `rdb-server` とSDKに追加する(オプション扱い)。 + +### Phase 5: VMメッシュ統合と最適化 (Week 9+) + + * [ ] `quinn` (QUIC) の導入による通信高速化。 + * [ ] PDへのスケジューリングロジック(ディスク容量バランス等)の実装。 + +----- + +## 5\. ロック仕様の選定について(補足) + +ご要望の「ロック仕様を選べるようにする」点は、SDKのメソッドで表現します。 + +```rust +// A. 楽観的ロック (デフォルト・推奨) +// サーバー側でロック保持せず、書き込み瞬間のCAS判定のみ。高速。 +let res = client.optimistic_txn(|txn| async move { + let val = txn.get("key").await?; + txn.put("key", update(val)); + Ok(()) // commit時に検証 +}).await; + +// B. 
悲観的ロック (オプション) +// サーバーのLock CFにレコードを書き、他をブロックする。 +// カウンターや厳密な在庫管理などに使用。 +let res = client.pessimistic_txn(|txn| async move { + txn.lock("key").await?; // ここでサーバーへLockRequestが飛ぶ + let val = txn.get("key").await?; + txn.put("key", update(val)); + Ok(()) +}).await; +``` + +この設計書をベースに、まずは **Phase 1 (Proto & Storage)** の実装を開始するのが良いでしょう。 + +--- + diff --git a/flaredb/crates/flaredb-cli/Cargo.toml b/flaredb/crates/flaredb-cli/Cargo.toml new file mode 100644 index 0000000..d55b902 --- /dev/null +++ b/flaredb/crates/flaredb-cli/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "flaredb-cli" +version.workspace = true +edition.workspace = true + +[dependencies] +flaredb-client = { path = "../flaredb-client" } +tokio.workspace = true +clap.workspace = true diff --git a/flaredb/crates/flaredb-cli/src/main.rs b/flaredb/crates/flaredb-cli/src/main.rs new file mode 100644 index 0000000..e9022cd --- /dev/null +++ b/flaredb/crates/flaredb-cli/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello from rdb-cli!"); +} diff --git a/flaredb/crates/flaredb-client/Cargo.toml b/flaredb/crates/flaredb-client/Cargo.toml new file mode 100644 index 0000000..cce116e --- /dev/null +++ b/flaredb/crates/flaredb-client/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "flaredb-client" +version.workspace = true +edition.workspace = true + +[dependencies] +flaredb-proto = { path = "../flaredb-proto" } +tokio.workspace = true +tonic.workspace = true +prost.workspace = true +clap.workspace = true + +[dev-dependencies] +tokio-stream.workspace = true diff --git a/flaredb/crates/flaredb-client/src/client.rs b/flaredb/crates/flaredb-client/src/client.rs new file mode 100644 index 0000000..75841fd --- /dev/null +++ b/flaredb/crates/flaredb-client/src/client.rs @@ -0,0 +1,264 @@ +use flaredb_proto::kvrpc::kv_cas_client::KvCasClient; +use flaredb_proto::kvrpc::kv_raw_client::KvRawClient; +use flaredb_proto::kvrpc::{ + CasRequest, DeleteRequest, GetRequest, RawDeleteRequest, RawGetRequest, RawPutRequest, + 
RawScanRequest, ScanRequest, +}; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::Mutex; +use tonic::transport::Channel; + +use flaredb_proto::pdpb::pd_client::PdClient; +use flaredb_proto::pdpb::tso_client::TsoClient; +use flaredb_proto::pdpb::{GetRegionRequest, TsoRequest}; + +use crate::region_cache::RegionCache; + +pub struct RdbClient { + // We need a map of addr -> Channel/Client to reuse connections + // Or just create on fly for MVP? Connection creation is expensive. + // Let's cache channels. + channels: Arc>>, + + // Clients for PD (fixed) + tso_client: TsoClient, + pd_client: PdClient, + + region_cache: RegionCache, + namespace: String, +} + +impl RdbClient { + pub async fn connect_with_pd( + _server_addr: String, + pd_addr: String, + ) -> Result { + Self::connect_with_pd_namespace(_server_addr, pd_addr, String::new()).await + } + + pub async fn connect_with_pd_namespace( + _server_addr: String, + pd_addr: String, + namespace: impl Into, + ) -> Result { + // server_addr is intentionally unused for now; once the region cache is populated we route via PD. 
+ + let pd_ep = format!("http://{}", pd_addr); + let pd_channel = Channel::from_shared(pd_ep).unwrap().connect().await?; + + Ok(Self { + channels: Arc::new(Mutex::new(HashMap::new())), + tso_client: TsoClient::new(pd_channel.clone()), + pd_client: PdClient::new(pd_channel), + region_cache: RegionCache::new(), + namespace: namespace.into(), + }) + } + + async fn resolve_addr(&self, key: &[u8]) -> Result { + if let Some(addr) = self.region_cache.get_store_addr(key).await { + return Ok(addr); + } + + let mut pd_c = self.pd_client.clone(); + let req = GetRegionRequest { key: key.to_vec() }; + let resp = pd_c.get_region(req).await?.into_inner(); + if let (Some(region), Some(leader)) = (resp.region, resp.leader) { + self.region_cache.update(region, leader.clone()).await; + Ok(leader.addr) + } else { + Err(tonic::Status::not_found("Region not found")) + } + } + + async fn get_channel(&self, addr: &str) -> Result { + let mut map = self.channels.lock().await; + if let Some(chan) = map.get(addr) { + return Ok(chan.clone()); + } + + let ep = format!("http://{}", addr); + let chan = Channel::from_shared(ep).unwrap().connect().await?; + map.insert(addr.to_string(), chan.clone()); + Ok(chan) + } + + pub async fn get_tso(&mut self) -> Result { + let req = TsoRequest { count: 1 }; + let resp = self.tso_client.get_timestamp(req).await?.into_inner(); + Ok(resp.timestamp) + } + + pub async fn raw_put(&mut self, key: Vec, value: Vec) -> Result<(), tonic::Status> { + let addr = self.resolve_addr(&key).await?; + let channel = self + .get_channel(&addr) + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; + let mut client = KvRawClient::new(channel); + + let req = RawPutRequest { + key, + value, + namespace: self.namespace.clone(), + }; + client.raw_put(req).await?; + Ok(()) + } + + pub async fn raw_get(&mut self, key: Vec) -> Result>, tonic::Status> { + let addr = self.resolve_addr(&key).await?; + let channel = self + .get_channel(&addr) + .await + .map_err(|e| 
tonic::Status::internal(e.to_string()))?; + let mut client = KvRawClient::new(channel); + let req = RawGetRequest { + key, + namespace: self.namespace.clone(), + }; + let resp = client.raw_get(req).await?.into_inner(); + if resp.found { + Ok(Some(resp.value)) + } else { + Ok(None) + } + } + + pub async fn raw_delete(&mut self, key: Vec) -> Result { + let addr = self.resolve_addr(&key).await?; + let channel = self + .get_channel(&addr) + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; + let mut client = KvRawClient::new(channel); + let req = RawDeleteRequest { + key, + namespace: self.namespace.clone(), + }; + let resp = client.raw_delete(req).await?.into_inner(); + Ok(resp.existed) + } + + /// Scan a range of keys in raw (eventual consistency) mode. + /// + /// Returns (keys, values, next_key if has_more). + pub async fn raw_scan( + &mut self, + start_key: Vec, + end_key: Vec, + limit: u32, + ) -> Result<(Vec>, Vec>, Option>), tonic::Status> { + let addr = self.resolve_addr(&start_key).await?; + let channel = self + .get_channel(&addr) + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; + let mut client = KvRawClient::new(channel); + let req = RawScanRequest { + start_key, + end_key, + limit, + namespace: self.namespace.clone(), + }; + let resp = client.raw_scan(req).await?.into_inner(); + let next = if resp.has_more { + Some(resp.next_key) + } else { + None + }; + Ok((resp.keys, resp.values, next)) + } + + pub async fn cas( + &mut self, + key: Vec, + value: Vec, + expected_version: u64, + ) -> Result<(bool, u64, u64), tonic::Status> { + let addr = self.resolve_addr(&key).await?; + + let channel = self + .get_channel(&addr) + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; + let mut client = KvCasClient::new(channel); + + let req = CasRequest { + key, + value, + expected_version, + namespace: self.namespace.clone(), + }; + let resp = client.compare_and_swap(req).await?.into_inner(); + Ok((resp.success, 
resp.current_version, resp.new_version)) + } + + pub async fn cas_get(&mut self, key: Vec) -> Result)>, tonic::Status> { + let addr = self.resolve_addr(&key).await?; + let channel = self + .get_channel(&addr) + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; + let mut client = KvCasClient::new(channel); + let req = GetRequest { + key, + namespace: self.namespace.clone(), + }; + let resp = client.get(req).await?.into_inner(); + if resp.found { + Ok(Some((resp.version, resp.value))) + } else { + Ok(None) + } + } + + pub async fn cas_scan( + &mut self, + start_key: Vec, + end_key: Vec, + limit: u32, + ) -> Result<(Vec<(Vec, Vec, u64)>, Option>), tonic::Status> { + let addr = self.resolve_addr(&start_key).await?; + let channel = self + .get_channel(&addr) + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; + let mut client = KvCasClient::new(channel); + let req = flaredb_proto::kvrpc::ScanRequest { + start_key, + end_key, + limit, + namespace: self.namespace.clone(), + }; + let resp = client.scan(req).await?.into_inner(); + let entries: Vec<(Vec, Vec, u64)> = resp + .entries + .into_iter() + .map(|kv| (kv.key, kv.value, kv.version)) + .collect(); + let next = if resp.has_more { Some(resp.next_key) } else { None }; + Ok((entries, next)) + } + + pub async fn cas_delete( + &mut self, + key: Vec, + expected_version: u64, + ) -> Result<(bool, u64, bool), tonic::Status> { + let addr = self.resolve_addr(&key).await?; + let channel = self + .get_channel(&addr) + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; + let mut client = KvCasClient::new(channel); + let req = DeleteRequest { + key, + expected_version, + namespace: self.namespace.clone(), + }; + let resp = client.delete(req).await?.into_inner(); + Ok((resp.success, resp.current_version, resp.existed)) + } +} diff --git a/flaredb/crates/flaredb-client/src/lib.rs b/flaredb/crates/flaredb-client/src/lib.rs new file mode 100644 index 0000000..48b2a2c --- /dev/null +++ 
b/flaredb/crates/flaredb-client/src/lib.rs @@ -0,0 +1,4 @@ +pub mod client; +pub mod region_cache; + +pub use client::RdbClient; diff --git a/flaredb/crates/flaredb-client/src/main.rs b/flaredb/crates/flaredb-client/src/main.rs new file mode 100644 index 0000000..a98a95f --- /dev/null +++ b/flaredb/crates/flaredb-client/src/main.rs @@ -0,0 +1,91 @@ +use clap::{Parser, Subcommand}; +use flaredb_client::RdbClient; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +struct Args { + #[arg(long, default_value = "127.0.0.1:50051")] + addr: String, + + #[arg(long, default_value = "127.0.0.1:2379")] + pd_addr: String, + + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + RawPut { + #[arg(long)] + key: String, + #[arg(long)] + value: String, + }, + RawGet { + #[arg(long)] + key: String, + }, + CasGet { + #[arg(long)] + key: String, + }, + Cas { + #[arg(long)] + key: String, + #[arg(long)] + value: String, + #[arg(long)] + expected: u64, + }, + Tso, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + let mut client = RdbClient::connect_with_pd(args.addr, args.pd_addr).await?; + + match args.command { + Commands::RawPut { key, value } => { + client.raw_put(key.into_bytes(), value.into_bytes()).await?; + println!("Success"); + } + Commands::RawGet { key } => match client.raw_get(key.into_bytes()).await? { + Some(val) => println!("{}", String::from_utf8_lossy(&val)), + None => { + eprintln!("Not found"); + std::process::exit(1); + } + }, + Commands::Cas { + key, + value, + expected, + } => { + let (success, current, new) = client + .cas(key.into_bytes(), value.into_bytes(), expected) + .await?; + if success { + println!("Success, Version: {}", new); + } else { + println!("Conflict! 
Current Version: {}", current); + } + } + Commands::Tso => { + let ts = client.get_tso().await?; + println!("Timestamp: {}", ts); + } + Commands::CasGet { key } => match client.cas_get(key.into_bytes()).await? { + Some((ver, val)) => { + println!("Version: {}, Value: {}", ver, String::from_utf8_lossy(&val)); + } + None => { + eprintln!("Not found"); + std::process::exit(1); + } + }, + } + + Ok(()) +} diff --git a/flaredb/crates/flaredb-client/src/region_cache.rs b/flaredb/crates/flaredb-client/src/region_cache.rs new file mode 100644 index 0000000..07a6e28 --- /dev/null +++ b/flaredb/crates/flaredb-client/src/region_cache.rs @@ -0,0 +1,48 @@ +use flaredb_proto::pdpb::{Region, Store}; +use std::sync::Arc; +use tokio::sync::RwLock; + +#[derive(Clone)] +pub struct RegionCache { + // Map: StartKey -> RegionInfo + // For MVP, strict match or prefix? + // Regions are: [start, end). + // We need to find a region where start <= key < end. + // BTreeMap is better for range search. But Rust BTreeMap is sync. + // Using Vec + Scan for MVP is fine (few regions). + regions: Arc>>, +} + +impl RegionCache { + pub fn new() -> Self { + Self { + regions: Arc::new(RwLock::new(Vec::new())), + } + } + + pub async fn update(&self, region: Region, leader: Store) { + let mut cache = self.regions.write().await; + // Remove overlaps? Or just append and filter? + // Ideally replace. 
+ cache.retain(|(r, _)| r.id != region.id); + cache.push((region, leader)); + } + + pub async fn get_store_addr(&self, key: &[u8]) -> Option { + let cache = self.regions.read().await; + for (region, store) in cache.iter() { + let start_ok = region.start_key.is_empty() || key >= ®ion.start_key[..]; + let end_ok = region.end_key.is_empty() || key < ®ion.end_key[..]; + if start_ok && end_ok { + return Some(store.addr.clone()); + } + } + None + } +} + +impl Default for RegionCache { + fn default() -> Self { + Self::new() + } +} diff --git a/flaredb/crates/flaredb-client/tests/test_rpc_connect.rs b/flaredb/crates/flaredb-client/tests/test_rpc_connect.rs new file mode 100644 index 0000000..dfbaabc --- /dev/null +++ b/flaredb/crates/flaredb-client/tests/test_rpc_connect.rs @@ -0,0 +1,297 @@ +use std::collections::HashMap; +use std::net::SocketAddr; +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; + +use flaredb_client::RdbClient; +use flaredb_proto::kvrpc::kv_cas_server::KvCas; +use flaredb_proto::kvrpc::kv_cas_server::KvCasServer; +use flaredb_proto::kvrpc::kv_raw_server::KvRaw; +use flaredb_proto::kvrpc::kv_raw_server::KvRawServer; +use flaredb_proto::kvrpc::{ + CasRequest, CasResponse, GetRequest, GetResponse, RawGetRequest, RawGetResponse, RawPutRequest, + RawPutResponse, RawScanRequest, RawScanResponse, ScanRequest, ScanResponse, +}; +use flaredb_proto::pdpb::pd_server::Pd; +use flaredb_proto::pdpb::pd_server::PdServer; +use flaredb_proto::pdpb::tso_server::Tso; +use flaredb_proto::pdpb::tso_server::TsoServer; +use flaredb_proto::pdpb::{ + GetRegionRequest, GetRegionResponse, ListRegionsRequest, ListRegionsResponse, Region, + RegisterStoreRequest, RegisterStoreResponse, Store, TsoRequest, TsoResponse, +}; +use tokio::net::TcpListener; +use tokio::sync::{oneshot, Mutex}; +use tokio_stream::wrappers::TcpListenerStream; +use tonic::transport::Server; +use tonic::{Request, Response, Status}; + +#[derive(Clone, Default)] +struct TestKvService { + raw: Arc, 
Vec>>>, + cas: Arc, (u64, Vec)>>>, +} + +#[tonic::async_trait] +impl KvRaw for TestKvService { + async fn raw_put( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let mut raw = self.raw.lock().await; + raw.insert(req.key, req.value); + Ok(Response::new(RawPutResponse { success: true })) + } + + async fn raw_get( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let raw = self.raw.lock().await; + if let Some(val) = raw.get(&req.key) { + Ok(Response::new(RawGetResponse { + found: true, + value: val.clone(), + })) + } else { + Ok(Response::new(RawGetResponse { + found: false, + value: Vec::new(), + })) + } + } + + async fn raw_scan( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(RawScanResponse { + keys: vec![], + values: vec![], + has_more: false, + next_key: vec![], + })) + } +} + +#[tonic::async_trait] +impl KvCas for TestKvService { + async fn compare_and_swap( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let mut cas = self.cas.lock().await; + let (current_version, _) = cas.get(&req.key).cloned().unwrap_or((0, Vec::new())); + + if current_version != req.expected_version { + return Ok(Response::new(CasResponse { + success: false, + current_version, + new_version: 0, + })); + } + + let new_version = current_version + 1; + cas.insert(req.key, (new_version, req.value)); + + Ok(Response::new(CasResponse { + success: true, + current_version, + new_version, + })) + } + + async fn get(&self, request: Request) -> Result, Status> { + let req = request.into_inner(); + let cas = self.cas.lock().await; + if let Some((ver, val)) = cas.get(&req.key) { + Ok(Response::new(GetResponse { + found: true, + value: val.clone(), + version: *ver, + })) + } else { + Ok(Response::new(GetResponse { + found: false, + value: Vec::new(), + version: 0, + })) + } + } + + async fn scan(&self, _request: Request) -> Result, Status> { + 
Ok(Response::new(ScanResponse { + entries: vec![], + has_more: false, + next_key: vec![], + })) + } +} + +#[derive(Clone)] +struct TestPdService { + region: Region, + leader: Store, +} + +#[tonic::async_trait] +impl Pd for TestPdService { + async fn register_store( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(RegisterStoreResponse { + store_id: self.leader.id, + cluster_id: 1, + })) + } + + async fn get_region( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(GetRegionResponse { + region: Some(self.region.clone()), + leader: Some(self.leader.clone()), + })) + } + + async fn list_regions( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(ListRegionsResponse { + regions: vec![self.region.clone()], + stores: vec![self.leader.clone()], + })) + } +} + +#[derive(Clone, Default)] +struct TestTsoService { + counter: Arc, +} + +#[tonic::async_trait] +impl Tso for TestTsoService { + async fn get_timestamp( + &self, + request: Request, + ) -> Result, Status> { + let count = request.into_inner().count.max(1) as u64; + let start = self.counter.fetch_add(count, Ordering::AcqRel) + 1; + Ok(Response::new(TsoResponse { + timestamp: start, + count: count as u32, + })) + } +} + +async fn start_kv_server( + service: TestKvService, +) -> Result< + (SocketAddr, oneshot::Sender<()>, tokio::task::JoinHandle<()>), + Box, +> { + let listener = TcpListener::bind("127.0.0.1:0").await?; + let addr = listener.local_addr()?; + let incoming = TcpListenerStream::new(listener); + let (tx, rx) = oneshot::channel(); + let raw_service = service.clone(); + let cas_service = service.clone(); + + let handle = tokio::spawn(async move { + Server::builder() + .add_service(KvRawServer::new(raw_service)) + .add_service(KvCasServer::new(cas_service)) + .serve_with_incoming_shutdown(incoming, async { + let _ = rx.await; + }) + .await + .unwrap(); + }); + + Ok((addr, tx, handle)) +} + +async fn start_pd_server( + region: Region, + 
leader: Store, +) -> Result< + (SocketAddr, oneshot::Sender<()>, tokio::task::JoinHandle<()>), + Box, +> { + let listener = TcpListener::bind("127.0.0.1:0").await?; + let addr = listener.local_addr()?; + let incoming = TcpListenerStream::new(listener); + let (tx, rx) = oneshot::channel(); + + let tso_service = TestTsoService::default(); + let pd_service = TestPdService { region, leader }; + + let handle = tokio::spawn(async move { + Server::builder() + .add_service(TsoServer::new(tso_service)) + .add_service(PdServer::new(pd_service)) + .serve_with_incoming_shutdown(incoming, async { + let _ = rx.await; + }) + .await + .unwrap(); + }); + + Ok((addr, tx, handle)) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_rpc_connect() -> Result<(), Box> { + let kv_service = TestKvService::default(); + let (kv_addr, kv_shutdown, kv_handle) = start_kv_server(kv_service).await?; + + let leader = Store { + id: 1, + addr: kv_addr.to_string(), + }; + let region = Region { + id: 1, + start_key: Vec::new(), + end_key: Vec::new(), + peers: vec![1], + leader_id: 1, + }; + + let (pd_addr, pd_shutdown, pd_handle) = start_pd_server(region, leader).await?; + + let mut client = RdbClient::connect_with_pd(kv_addr.to_string(), pd_addr.to_string()).await?; + + let ts = client.get_tso().await?; + assert!(ts > 0); + + client.raw_put(b"k1".to_vec(), b"v1".to_vec()).await?; + let got = client.raw_get(b"k1".to_vec()).await?; + assert_eq!(got, Some(b"v1".to_vec())); + + let (ok, current, new_version) = client.cas(b"cas_key".to_vec(), b"v1".to_vec(), 0).await?; + assert!(ok); + assert_eq!(current, 0); + assert_eq!(new_version, 1); + + let (ok2, current2, _) = client.cas(b"cas_key".to_vec(), b"v2".to_vec(), 0).await?; + assert!(!ok2); + assert_eq!(current2, 1); + + let cas_val = client.cas_get(b"cas_key".to_vec()).await?; + assert_eq!(cas_val, Some((1, b"v1".to_vec()))); + + let _ = kv_shutdown.send(()); + let _ = pd_shutdown.send(()); + kv_handle.await?; + pd_handle.await?; + + Ok(()) +} 
diff --git a/flaredb/crates/flaredb-pd/Cargo.toml b/flaredb/crates/flaredb-pd/Cargo.toml new file mode 100644 index 0000000..577547c --- /dev/null +++ b/flaredb/crates/flaredb-pd/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "flaredb-pd" +version.workspace = true +edition.workspace = true + +[dependencies] +flaredb-proto = { path = "../flaredb-proto" } +tokio.workspace = true +tonic.workspace = true +prost.workspace = true +clap.workspace = true diff --git a/flaredb/crates/flaredb-pd/src/cluster.rs b/flaredb/crates/flaredb-pd/src/cluster.rs new file mode 100644 index 0000000..19e253f --- /dev/null +++ b/flaredb/crates/flaredb-pd/src/cluster.rs @@ -0,0 +1,136 @@ +use flaredb_proto::pdpb::{Region, Store}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +#[derive(Clone)] +pub struct Cluster { + inner: Arc>, +} + +struct ClusterState { + stores: HashMap, + regions: Vec, // Sorted by start_key? Or just linear scan for MVP + region_leaders: HashMap, // region_id -> store_id + next_store_id: u64, + next_region_id: u64, +} + +impl Cluster { + pub fn new() -> Self { + Self { + inner: Arc::new(Mutex::new(ClusterState { + stores: HashMap::new(), + regions: Vec::new(), + region_leaders: HashMap::new(), + next_store_id: 1, + next_region_id: 1, + })), + } + } + + pub fn register_store(&self, addr: String) -> u64 { + let mut state = self.inner.lock().unwrap(); + + // Dedup check? For now, always new ID. + // In real system, check if addr exists. 
+ for store in state.stores.values() { + if store.addr == addr { + return store.id; + } + } + + let id = state.next_store_id; + state.next_store_id += 1; + + state.stores.insert(id, Store { id, addr }); + + // Bootstrap check: If we have 3 stores (Raft requires 3 for quorum), create Raft group + if state.stores.len() == 3 && state.regions.is_empty() { + let r1_id = state.next_region_id; + state.next_region_id += 1; + + let r1 = Region { + id: r1_id, + start_key: vec![], + end_key: vec![], // All keys + peers: Vec::new(), + leader_id: 0, + }; + + state.regions.push(r1); + + // Assign ALL 3 stores as peers (Leader = S1, but peers = [S1, S2, S3]) + // We need to update Region definition to include Peers? + // Currently Region struct in pdpb doesn't have peers field (it's commented out). + // We can just assign leader for routing. + // Real Raft: Server needs to know peers. + // PD should return Peers in GetRegionResponse. + + let ids: Vec = state.stores.keys().cloned().collect(); + let mut sorted_ids = ids; + sorted_ids.sort(); + + state.region_leaders.insert(r1_id, sorted_ids[0]); + + println!( + "Bootstrapped Raft Cluster: Region {} -> Leader S{}, Peers {:?}", + r1_id, sorted_ids[0], sorted_ids + ); + } else if state.stores.len() == 1 && state.regions.is_empty() { + // Single node: one region ["", "") + let r1_id = state.next_region_id; + state.next_region_id += 1; + + let r1 = Region { + id: r1_id, + start_key: vec![], + end_key: vec![], + peers: Vec::new(), + leader_id: 0, + }; + state.regions.push(r1); + + // Assign to this store + state.region_leaders.insert(r1_id, id); + println!("Bootstrapped Single Node: R1->S{}", id); + } + + id + } + + pub fn get_region_for_key(&self, key: &[u8]) -> Option<(Region, Store)> { + let state = self.inner.lock().unwrap(); + + for region in &state.regions { + let start_ok = region.start_key.is_empty() || key >= ®ion.start_key[..]; + let end_ok = region.end_key.is_empty() || key < ®ion.end_key[..]; + + if start_ok && end_ok { + // 
Found region + if let Some(leader_id) = state.region_leaders.get(®ion.id) { + if let Some(store) = state.stores.get(leader_id) { + let mut r = region.clone(); + r.leader_id = *leader_id; + r.peers = state.stores.keys().cloned().collect(); + return Some((r, store.clone())); + } + } + } + } + + None + } + + pub fn list_regions(&self) -> (Vec, Vec) { + let state = self.inner.lock().unwrap(); + let mut regions = Vec::new(); + for r in &state.regions { + let mut clone = r.clone(); + clone.leader_id = *state.region_leaders.get(&r.id).unwrap_or(&0); + clone.peers = state.stores.keys().cloned().collect(); + regions.push(clone); + } + let stores: Vec = state.stores.values().cloned().collect(); + (regions, stores) + } +} diff --git a/flaredb/crates/flaredb-pd/src/main.rs b/flaredb/crates/flaredb-pd/src/main.rs new file mode 100644 index 0000000..8f28fb7 --- /dev/null +++ b/flaredb/crates/flaredb-pd/src/main.rs @@ -0,0 +1,40 @@ +use clap::Parser; +use flaredb_proto::pdpb::pd_server::PdServer; +use flaredb_proto::pdpb::tso_server::TsoServer; +use std::sync::Arc; +use tonic::transport::Server; + +mod cluster; +mod pd_service; +mod service; +mod tso; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + #[arg(long, default_value = "127.0.0.1:2379")] + addr: String, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + let addr = args.addr.parse()?; + + println!("rdb-pd listening on {}", addr); + + let tso_oracle = Arc::new(tso::TsoOracle::new()); + // Use TsoServiceImpl for TsoServer (legacy) + let tso_service = service::TsoServiceImpl::new(tso_oracle.clone()); + + // Use PdServiceImpl for PdServer (new) + let pd_service = pd_service::PdServiceImpl::new(tso_oracle); + + Server::builder() + .add_service(TsoServer::new(tso_service)) + .add_service(PdServer::new(pd_service)) + .serve(addr) + .await?; + + Ok(()) +} diff --git a/flaredb/crates/flaredb-pd/src/pd_service.rs 
b/flaredb/crates/flaredb-pd/src/pd_service.rs new file mode 100644 index 0000000..d02d8fc --- /dev/null +++ b/flaredb/crates/flaredb-pd/src/pd_service.rs @@ -0,0 +1,78 @@ +use crate::cluster::Cluster; +use crate::tso::TsoOracle; +use flaredb_proto::pdpb::pd_server::Pd; +use flaredb_proto::pdpb::{ + GetRegionRequest, GetRegionResponse, ListRegionsRequest, ListRegionsResponse, + MoveRegionRequest, MoveRegionResponse, RegisterStoreRequest, RegisterStoreResponse, +}; +use std::sync::Arc; +use tonic::{Request, Response, Status}; + +pub struct PdServiceImpl { + _tso_oracle: Arc, + cluster: Cluster, +} + +impl PdServiceImpl { + pub fn new(tso_oracle: Arc) -> Self { + Self { + _tso_oracle: tso_oracle, + cluster: Cluster::new(), + } + } + + #[allow(dead_code)] + async fn list_regions( + &self, + _request: Request, + ) -> Result, Status> { + let (regions, stores) = self.cluster.list_regions(); + Ok(Response::new(ListRegionsResponse { regions, stores })) + } + + #[allow(dead_code)] + async fn move_region( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(MoveRegionResponse { accepted: true })) + } +} + +#[tonic::async_trait] +impl Pd for PdServiceImpl { + async fn register_store( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let store_id = self.cluster.register_store(req.addr); + Ok(Response::new(RegisterStoreResponse { + store_id, + cluster_id: 1, // fixed for now + })) + } + + async fn get_region( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + if let Some((region, leader)) = self.cluster.get_region_for_key(&req.key) { + Ok(Response::new(GetRegionResponse { + region: Some(region), + leader: Some(leader), + })) + } else { + Err(Status::not_found("Region not initialized yet")) + } + } + + async fn list_regions( + &self, + _request: Request, + ) -> Result, Status> { + let (regions, stores) = self.cluster.list_regions(); + Ok(Response::new(ListRegionsResponse { regions, 
stores })) + } +} diff --git a/flaredb/crates/flaredb-pd/src/service.rs b/flaredb/crates/flaredb-pd/src/service.rs new file mode 100644 index 0000000..4edbaf2 --- /dev/null +++ b/flaredb/crates/flaredb-pd/src/service.rs @@ -0,0 +1,31 @@ +use crate::tso::TsoOracle; +use flaredb_proto::pdpb::tso_server::Tso; +use flaredb_proto::pdpb::{TsoRequest, TsoResponse}; +use std::sync::Arc; +use tonic::{Request, Response, Status}; + +pub struct TsoServiceImpl { + oracle: Arc, +} + +impl TsoServiceImpl { + pub fn new(oracle: Arc) -> Self { + Self { oracle } + } +} + +#[tonic::async_trait] +impl Tso for TsoServiceImpl { + async fn get_timestamp( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let count = req.count; + let ts = self.oracle.get_timestamp(count); + Ok(Response::new(TsoResponse { + timestamp: ts, + count, + })) + } +} diff --git a/flaredb/crates/flaredb-pd/src/tso.rs b/flaredb/crates/flaredb-pd/src/tso.rs new file mode 100644 index 0000000..fc627a5 --- /dev/null +++ b/flaredb/crates/flaredb-pd/src/tso.rs @@ -0,0 +1,332 @@ +//! Timestamp Oracle (TSO) for FlareDB +//! +//! The TSO provides globally unique, monotonically increasing timestamps +//! used for transaction ordering and MVCC in the distributed KV store. +//! +//! # Timestamp Format +//! +//! Each timestamp is a 64-bit integer with the following structure: +//! - **Upper 48 bits**: Physical time in milliseconds since Unix epoch +//! - **Lower 16 bits**: Logical counter (0-65535) +//! +//! ```text +//! ┌──────────────────────────────────────────────────────────┬──────────────────┐ +//! │ Physical Time (48 bits) │ Logical (16 bits)│ +//! │ milliseconds since epoch │ counter 0-65535 │ +//! └──────────────────────────────────────────────────────────┴──────────────────┘ +//! ``` +//! +//! This format allows: +//! - Up to 65536 timestamps per millisecond +//! - Time-based ordering (most significant bits are physical time) +//! 
- Easy extraction of physical time: `ts >> 16` +//! - Easy extraction of logical counter: `ts & 0xFFFF` +//! +//! # Thread Safety +//! +//! The TSO is fully thread-safe and uses atomic operations to ensure +//! monotonicity even under high concurrency. +//! +//! # Usage +//! +//! ```rust +//! use flaredb_pd::tso::TsoOracle; +//! +//! let tso = TsoOracle::new(); +//! +//! // Get a single timestamp +//! let ts = tso.get_timestamp(1); +//! +//! // Extract components +//! let physical = TsoOracle::physical_time(ts); +//! let logical = TsoOracle::logical_counter(ts); +//! +//! // Get a batch of 100 timestamps (returns the first one) +//! let first_ts = tso.get_timestamp(100); +//! // The allocated range is [first_ts, first_ts + 99] +//! ``` + +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{SystemTime, UNIX_EPOCH}; + +/// Timestamp Oracle providing globally unique timestamps. +/// +/// The oracle maintains monotonicity by tracking the last issued timestamp +/// and advancing the logical counter or waiting for the next millisecond +/// when the counter would overflow. +pub struct TsoOracle { + /// Combined physical (upper 48 bits) and logical (lower 16 bits) timestamp. + /// Uses atomic operations to ensure thread safety. + last_tso: AtomicU64, +} + +impl Default for TsoOracle { + fn default() -> Self { + Self::new() + } +} + +impl TsoOracle { + /// The number of bits used for the logical counter. + pub const LOGICAL_BITS: u32 = 16; + + /// Maximum value for the logical counter (65535). + pub const MAX_LOGICAL: u64 = 0xFFFF; + + /// Create a new TSO oracle. + /// + /// The oracle starts with no previously issued timestamps. + pub fn new() -> Self { + Self { + last_tso: AtomicU64::new(0), + } + } + + /// Allocate `count` timestamps and return the first one. + /// + /// The caller is responsible for using timestamps in the range + /// `[returned_ts, returned_ts + count - 1]`. 
+ /// + /// # Arguments + /// + /// * `count` - Number of timestamps to allocate (1-65535) + /// + /// # Returns + /// + /// The first timestamp in the allocated batch. + /// + /// # Panics + /// + /// This function will spin-wait if the logical counter would overflow, + /// waiting for the next millisecond. In extreme cases (>65536 requests + /// per millisecond), this may cause delays. + pub fn get_timestamp(&self, count: u32) -> u64 { + debug_assert!(count > 0 && count <= Self::MAX_LOGICAL as u32); + + loop { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + + let current_tso = self.last_tso.load(Ordering::Acquire); + let current_physical = current_tso >> Self::LOGICAL_BITS; + let current_logical = current_tso & Self::MAX_LOGICAL; + + if now > current_physical { + // Time has advanced, reset logical counter + let start_logical = 0; + let end_logical = count as u64 - 1; + let new_val = (now << Self::LOGICAL_BITS) | end_logical; + + if self + .last_tso + .compare_exchange(current_tso, new_val, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + return (now << Self::LOGICAL_BITS) | start_logical; + } + } else { + // Same millisecond, advance logical counter + if current_logical + (count as u64) > Self::MAX_LOGICAL { + // Would overflow, wait for next millisecond + std::thread::yield_now(); + continue; + } + + let start_logical = current_logical + 1; + let end_logical = current_logical + (count as u64); + let new_val = (current_physical << Self::LOGICAL_BITS) | end_logical; + + if self + .last_tso + .compare_exchange(current_tso, new_val, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + return (current_physical << Self::LOGICAL_BITS) | start_logical; + } + } + } + } + + /// Extract the physical time (milliseconds since epoch) from a timestamp. 
+ /// + /// # Example + /// + /// ```rust + /// use flaredb_pd::tso::TsoOracle; + /// + /// let tso = TsoOracle::new(); + /// let ts = tso.get_timestamp(1); + /// let physical = TsoOracle::physical_time(ts); + /// // physical is approximately the current time in milliseconds + /// ``` + #[inline] + pub fn physical_time(ts: u64) -> u64 { + ts >> Self::LOGICAL_BITS + } + + /// Extract the logical counter from a timestamp. + /// + /// # Example + /// + /// ```rust + /// use flaredb_pd::tso::TsoOracle; + /// + /// let tso = TsoOracle::new(); + /// let ts = tso.get_timestamp(1); + /// let logical = TsoOracle::logical_counter(ts); + /// // logical is the sequence number within the millisecond (0-65535) + /// ``` + #[inline] + pub fn logical_counter(ts: u64) -> u16 { + (ts & Self::MAX_LOGICAL) as u16 + } + + /// Compose a timestamp from physical time and logical counter. + /// + /// # Arguments + /// + /// * `physical` - Physical time in milliseconds since epoch + /// * `logical` - Logical counter (0-65535) + /// + /// # Returns + /// + /// The composed timestamp. 
+ #[inline] + pub fn compose(physical: u64, logical: u16) -> u64 { + (physical << Self::LOGICAL_BITS) | (logical as u64) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashSet; + use std::sync::Arc; + use std::thread; + + #[test] + fn test_monotonic_tso() { + let tso = Arc::new(TsoOracle::new()); + let mut threads = vec![]; + + for _ in 0..10 { + let tso = tso.clone(); + threads.push(thread::spawn(move || { + let mut last = 0; + for _ in 0..1000 { + let ts = tso.get_timestamp(1); + assert!(ts > last, "TSO not monotonic: {} vs {}", ts, last); + last = ts; + } + })); + } + + for t in threads { + t.join().unwrap(); + } + } + + #[test] + fn test_batch_allocation() { + let tso = TsoOracle::new(); + + // Allocate batch of 10 + let first = tso.get_timestamp(10); + + // Next allocation should be after the batch + let next = tso.get_timestamp(1); + assert!( + next >= first + 10, + "next={} should be >= first+10={}", + next, + first + 10 + ); + } + + #[test] + fn test_batch_uniqueness() { + let tso = Arc::new(TsoOracle::new()); + let all_timestamps = Arc::new(std::sync::Mutex::new(HashSet::new())); + let mut handles = vec![]; + + // Spawn threads each allocating batches + for _ in 0..10 { + let tso = tso.clone(); + let timestamps = all_timestamps.clone(); + handles.push(thread::spawn(move || { + for _ in 0..100 { + let batch_size = 10u32; + let first = tso.get_timestamp(batch_size); + let mut ts_set = timestamps.lock().unwrap(); + for i in 0..batch_size as u64 { + let ts = first + i; + assert!(ts_set.insert(ts), "Duplicate timestamp detected: {}", ts); + } + } + })); + } + + for h in handles { + h.join().unwrap(); + } + + // Should have 10 threads * 100 batches * 10 per batch = 10000 unique timestamps + assert_eq!(all_timestamps.lock().unwrap().len(), 10000); + } + + #[test] + fn test_physical_logical_extraction() { + let tso = TsoOracle::new(); + let ts = tso.get_timestamp(1); + + let physical = TsoOracle::physical_time(ts); + let logical = 
TsoOracle::logical_counter(ts); + + // Reconstruct and verify + assert_eq!(TsoOracle::compose(physical, logical), ts); + + // Physical time should be recent (within 1 second of now) + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + assert!( + (now_ms - physical) < 1000, + "physical={} should be within 1s of now={}", + physical, + now_ms + ); + } + + #[test] + fn test_compose_roundtrip() { + // Test that compose and extract are inverses + let physical = 1234567890123u64; + let logical = 42u16; + + let ts = TsoOracle::compose(physical, logical); + assert_eq!(TsoOracle::physical_time(ts), physical); + assert_eq!(TsoOracle::logical_counter(ts), logical); + } + + #[test] + fn test_default_trait() { + let tso1 = TsoOracle::default(); + let tso2 = TsoOracle::new(); + + // Both should produce valid timestamps + let ts1 = tso1.get_timestamp(1); + let ts2 = tso2.get_timestamp(1); + assert!(ts1 > 0); + assert!(ts2 > 0); + } + + #[test] + fn test_constants() { + assert_eq!(TsoOracle::LOGICAL_BITS, 16); + assert_eq!(TsoOracle::MAX_LOGICAL, 0xFFFF); + assert_eq!(TsoOracle::MAX_LOGICAL, 65535); + } +} diff --git a/flaredb/crates/flaredb-proto/Cargo.toml b/flaredb/crates/flaredb-proto/Cargo.toml new file mode 100644 index 0000000..0bcc15e --- /dev/null +++ b/flaredb/crates/flaredb-proto/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "flaredb-proto" +version.workspace = true +edition.workspace = true + +[dependencies] +tonic.workspace = true +prost.workspace = true + +[build-dependencies] +tonic-build.workspace = true +protoc-bin-vendored = "3.2" diff --git a/flaredb/crates/flaredb-proto/build.rs b/flaredb/crates/flaredb-proto/build.rs new file mode 100644 index 0000000..5ca7c0a --- /dev/null +++ b/flaredb/crates/flaredb-proto/build.rs @@ -0,0 +1,18 @@ +fn main() -> Result<(), Box> { + let protoc_path = protoc_bin_vendored::protoc_bin_path()?; + std::env::set_var("PROTOC", protoc_path); + + tonic_build::configure() + 
.build_server(true) + .build_client(true) + .compile_protos( + &[ + "src/kvrpc.proto", + "src/pdpb.proto", + "src/raft_server.proto", + "src/chainfire.proto", + ], + &["src"], // proto_path + )?; + Ok(()) +} diff --git a/flaredb/crates/flaredb-proto/src/chainfire.proto b/flaredb/crates/flaredb-proto/src/chainfire.proto new file mode 100644 index 0000000..ef56985 --- /dev/null +++ b/flaredb/crates/flaredb-proto/src/chainfire.proto @@ -0,0 +1,319 @@ +syntax = "proto3"; + +package chainfire.v1; + +// Key-Value service +service KV { + // Range gets the keys in the range from the key-value store + rpc Range(RangeRequest) returns (RangeResponse); + + // Put puts the given key into the key-value store + rpc Put(PutRequest) returns (PutResponse); + + // Delete deletes the given range from the key-value store + rpc Delete(DeleteRangeRequest) returns (DeleteRangeResponse); + + // Txn processes multiple requests in a single transaction + rpc Txn(TxnRequest) returns (TxnResponse); +} + +// Watch service +service Watch { + // Watch watches for events happening or that have happened + rpc Watch(stream WatchRequest) returns (stream WatchResponse); +} + +// Cluster management service +service Cluster { + // MemberAdd adds a member into the cluster + rpc MemberAdd(MemberAddRequest) returns (MemberAddResponse); + + // MemberRemove removes an existing member from the cluster + rpc MemberRemove(MemberRemoveRequest) returns (MemberRemoveResponse); + + // MemberList lists all the members in the cluster + rpc MemberList(MemberListRequest) returns (MemberListResponse); + + // Status gets the status of the cluster + rpc Status(StatusRequest) returns (StatusResponse); +} + +// Response header included in all responses +message ResponseHeader { + // cluster_id is the ID of the cluster + uint64 cluster_id = 1; + // member_id is the ID of the responding member + uint64 member_id = 2; + // revision is the key-value store revision + int64 revision = 3; + // raft_term is the current Raft term + 
uint64 raft_term = 4; +} + +// Key-value pair +message KeyValue { + // key is the key in bytes + bytes key = 1; + // create_revision is the revision of last creation + int64 create_revision = 2; + // mod_revision is the revision of last modification + int64 mod_revision = 3; + // version is the version of the key + int64 version = 4; + // value is the value held by the key + bytes value = 5; + // lease is the ID of the lease attached to the key + int64 lease = 6; +} + +// ========== Range ========== + +message RangeRequest { + // key is the first key for the range + bytes key = 1; + // range_end is the upper bound on the requested range + bytes range_end = 2; + // limit is a limit on the number of keys returned + int64 limit = 3; + // revision is the point-in-time of the store to use + int64 revision = 4; + // keys_only when set returns only the keys and not the values + bool keys_only = 5; + // count_only when set returns only the count of the keys + bool count_only = 6; +} + +message RangeResponse { + ResponseHeader header = 1; + // kvs is the list of key-value pairs matched by the range request + repeated KeyValue kvs = 2; + // more indicates if there are more keys to return + bool more = 3; + // count is set to the number of keys within the range + int64 count = 4; +} + +// ========== Put ========== + +message PutRequest { + // key is the key to put + bytes key = 1; + // value is the value to put + bytes value = 2; + // lease is the lease ID to attach to the key + int64 lease = 3; + // prev_kv when set returns the previous key-value pair + bool prev_kv = 4; +} + +message PutResponse { + ResponseHeader header = 1; + // prev_kv is the key-value pair before the put + KeyValue prev_kv = 2; +} + +// ========== Delete ========== + +message DeleteRangeRequest { + // key is the first key to delete + bytes key = 1; + // range_end is the key following the last key to delete + bytes range_end = 2; + // prev_kv when set returns deleted key-value pairs + bool prev_kv = 3; 
+} + +message DeleteRangeResponse { + ResponseHeader header = 1; + // deleted is the number of keys deleted + int64 deleted = 2; + // prev_kvs holds the deleted key-value pairs + repeated KeyValue prev_kvs = 3; +} + +// ========== Transaction ========== + +message TxnRequest { + // compare is a list of predicates + repeated Compare compare = 1; + // success is a list of operations to apply if all comparisons succeed + repeated RequestOp success = 2; + // failure is a list of operations to apply if any comparison fails + repeated RequestOp failure = 3; +} + +message TxnResponse { + ResponseHeader header = 1; + // succeeded is set to true if all comparisons evaluated to true + bool succeeded = 2; + // responses is a list of responses corresponding to the results + repeated ResponseOp responses = 3; +} + +message Compare { + enum CompareResult { + EQUAL = 0; + GREATER = 1; + LESS = 2; + NOT_EQUAL = 3; + } + enum CompareTarget { + VERSION = 0; + CREATE = 1; + MOD = 2; + VALUE = 3; + } + CompareResult result = 1; + CompareTarget target = 2; + bytes key = 3; + oneof target_union { + int64 version = 4; + int64 create_revision = 5; + int64 mod_revision = 6; + bytes value = 7; + } +} + +message RequestOp { + oneof request { + RangeRequest request_range = 1; + PutRequest request_put = 2; + DeleteRangeRequest request_delete_range = 3; + } +} + +message ResponseOp { + oneof response { + RangeResponse response_range = 1; + PutResponse response_put = 2; + DeleteRangeResponse response_delete_range = 3; + } +} + +// ========== Watch ========== + +message WatchRequest { + oneof request_union { + WatchCreateRequest create_request = 1; + WatchCancelRequest cancel_request = 2; + WatchProgressRequest progress_request = 3; + } +} + +message WatchCreateRequest { + // key is the key to watch + bytes key = 1; + // range_end is the end of the range to watch + bytes range_end = 2; + // start_revision is an optional revision to start watching from + int64 start_revision = 3; + // 
progress_notify is set to true to enable progress notifications + bool progress_notify = 4; + // prev_kv when set includes previous key-value in events + bool prev_kv = 5; + // watch_id is the user-provided watch ID (0 for server-assigned) + int64 watch_id = 6; +} + +message WatchCancelRequest { + // watch_id is the watch ID to cancel + int64 watch_id = 1; +} + +message WatchProgressRequest {} + +message WatchResponse { + ResponseHeader header = 1; + // watch_id is the watch ID for this response + int64 watch_id = 2; + // created is set to true if this response is for a create request + bool created = 3; + // canceled is set to true if the watch was canceled + bool canceled = 4; + // compact_revision is the minimum revision the watcher may receive + int64 compact_revision = 5; + // cancel_reason indicates the reason for cancellation + string cancel_reason = 6; + // events is the list of events in this response + repeated Event events = 11; +} + +message Event { + enum EventType { + PUT = 0; + DELETE = 1; + } + // type is the kind of event + EventType type = 1; + // kv is the KeyValue affected by the event + KeyValue kv = 2; + // prev_kv is the KeyValue prior to the event + KeyValue prev_kv = 3; +} + +// ========== Cluster Management ========== + +message Member { + // ID is the member ID + uint64 id = 1; + // name is the human-readable name + string name = 2; + // peer_urls are URLs for Raft communication + repeated string peer_urls = 3; + // client_urls are URLs for client communication + repeated string client_urls = 4; + // is_learner indicates if member is a learner + bool is_learner = 5; +} + +message MemberAddRequest { + // peer_urls are the URLs to reach the new member + repeated string peer_urls = 1; + // is_learner indicates if the member is a learner + bool is_learner = 2; +} + +message MemberAddResponse { + ResponseHeader header = 1; + // member is the member information for the added member + Member member = 2; + // members is the list of all members 
after adding + repeated Member members = 3; +} + +message MemberRemoveRequest { + // ID is the member ID to remove + uint64 id = 1; +} + +message MemberRemoveResponse { + ResponseHeader header = 1; + // members is the list of all members after removing + repeated Member members = 2; +} + +message MemberListRequest {} + +message MemberListResponse { + ResponseHeader header = 1; + // members is the list of all members + repeated Member members = 2; +} + +message StatusRequest {} + +message StatusResponse { + ResponseHeader header = 1; + // version is the version of the server + string version = 2; + // db_size is the size of the database + int64 db_size = 3; + // leader is the member ID of the current leader + uint64 leader = 4; + // raft_index is the current Raft committed index + uint64 raft_index = 5; + // raft_term is the current Raft term + uint64 raft_term = 6; + // raft_applied_index is the current Raft applied index + uint64 raft_applied_index = 7; +} diff --git a/flaredb/crates/flaredb-proto/src/kvrpc.proto b/flaredb/crates/flaredb-proto/src/kvrpc.proto new file mode 100644 index 0000000..23d3ab0 --- /dev/null +++ b/flaredb/crates/flaredb-proto/src/kvrpc.proto @@ -0,0 +1,123 @@ +syntax = "proto3"; + +package kvrpc; + +// Raw (Eventual Consistency) Operations +service KvRaw { + rpc RawPut(RawPutRequest) returns (RawPutResponse); + rpc RawGet(RawGetRequest) returns (RawGetResponse); + rpc RawScan(RawScanRequest) returns (RawScanResponse); + rpc RawDelete(RawDeleteRequest) returns (RawDeleteResponse); +} + +message RawPutRequest { + bytes key = 1; + bytes value = 2; + string namespace = 3; +} + +message RawPutResponse { + bool success = 1; +} + +message RawGetRequest { + bytes key = 1; + string namespace = 2; +} + +message RawGetResponse { + bool found = 1; + bytes value = 2; +} + +// Raw scan request (eventual consistency) +message RawScanRequest { + bytes start_key = 1; // inclusive + bytes end_key = 2; // exclusive (empty = no upper bound) + uint32 limit = 
3; // max entries to return (0 = default 100) + string namespace = 4; +} + +// Raw scan response +message RawScanResponse { + repeated bytes keys = 1; + repeated bytes values = 2; + bool has_more = 3; // true if more entries exist + bytes next_key = 4; // next start_key for pagination +} + +message RawDeleteRequest { + bytes key = 1; + string namespace = 2; +} + +message RawDeleteResponse { + bool success = 1; + bool existed = 2; // true if key existed before deletion +} + +// CAS (Strong Consistency / Optimistic) Operations +service KvCas { + rpc CompareAndSwap(CasRequest) returns (CasResponse); + rpc Get(GetRequest) returns (GetResponse); + rpc Scan(ScanRequest) returns (ScanResponse); + rpc Delete(DeleteRequest) returns (DeleteResponse); +} + +message CasRequest { + bytes key = 1; + bytes value = 2; + uint64 expected_version = 3; // 0 implies "create if not exists" + string namespace = 4; +} + +message CasResponse { + bool success = 1; + uint64 current_version = 2; // Returns current version on failure (for retry) + uint64 new_version = 3; // Returns assigned version on success +} + +message GetRequest { + bytes key = 1; + string namespace = 2; +} + +message GetResponse { + bool found = 1; + bytes value = 2; + uint64 version = 3; +} + +// CAS scan request (strong consistency) +message ScanRequest { + bytes start_key = 1; // inclusive + bytes end_key = 2; // exclusive (empty = no upper bound) + uint32 limit = 3; // max entries to return (0 = default 100) + string namespace = 4; +} + +// Key-value with version for CAS scan +message VersionedKv { + bytes key = 1; + bytes value = 2; + uint64 version = 3; +} + +// CAS scan response +message ScanResponse { + repeated VersionedKv entries = 1; + bool has_more = 2; // true if more entries exist + bytes next_key = 3; // next start_key for pagination +} + +message DeleteRequest { + bytes key = 1; + uint64 expected_version = 2; // 0 implies "delete if exists" (no version check) + string namespace = 3; +} + +message 
DeleteResponse { + bool success = 1; + uint64 current_version = 2; // Returns current version on failure (for retry) + bool existed = 3; // true if key existed before deletion +} diff --git a/flaredb/crates/flaredb-proto/src/lib.rs b/flaredb/crates/flaredb-proto/src/lib.rs new file mode 100644 index 0000000..f86d76f --- /dev/null +++ b/flaredb/crates/flaredb-proto/src/lib.rs @@ -0,0 +1,16 @@ +// Re-export proto modules +pub mod kvrpc { + tonic::include_proto!("kvrpc"); +} + +pub mod pdpb { + tonic::include_proto!("pdpb"); +} + +pub mod raft_server { + tonic::include_proto!("raftpb"); +} + +pub mod chainfire { + tonic::include_proto!("chainfire.v1"); +} diff --git a/flaredb/crates/flaredb-proto/src/pdpb.proto b/flaredb/crates/flaredb-proto/src/pdpb.proto new file mode 100644 index 0000000..71dd342 --- /dev/null +++ b/flaredb/crates/flaredb-proto/src/pdpb.proto @@ -0,0 +1,78 @@ +syntax = "proto3"; + +package pdpb; + +// TSO Service +service Tso { + rpc GetTimestamp(TsoRequest) returns (TsoResponse); +} + +message TsoRequest { + uint32 count = 1; +} + +message TsoResponse { + uint64 timestamp = 1; // Physical << 16 | Logical + uint32 count = 2; +} + +// Cluster Management Service +service Pd { + // Store Registration + rpc RegisterStore(RegisterStoreRequest) returns (RegisterStoreResponse); + + // Region Discovery + rpc GetRegion(GetRegionRequest) returns (GetRegionResponse); + // Region Listing (bootstrap/refresh) + rpc ListRegions(ListRegionsRequest) returns (ListRegionsResponse); +} + +message RegisterStoreRequest { + string addr = 1; // e.g., "127.0.0.1:50051" +} + +message RegisterStoreResponse { + uint64 store_id = 1; + uint64 cluster_id = 2; // Verify cluster match +} + +message GetRegionRequest { + bytes key = 1; +} + +message GetRegionResponse { + Region region = 1; + Store leader = 2; +} + +// Move directives (simplified): add replica to target store, then remove source. 
+message MoveRegionRequest { + uint64 region_id = 1; + uint64 from_store = 2; + uint64 to_store = 3; +} + +message MoveRegionResponse { + bool accepted = 1; +} + +message ListRegionsRequest {} + +message ListRegionsResponse { + repeated Region regions = 1; + repeated Store stores = 2; +} + +message Region { + uint64 id = 1; + bytes start_key = 2; + bytes end_key = 3; // empty = infinity + repeated uint64 peers = 4; + uint64 leader_id = 5; + // In future: repeated Peer peers = 4; +} + +message Store { + uint64 id = 1; + string addr = 2; +} diff --git a/flaredb/crates/flaredb-proto/src/raft_server.proto b/flaredb/crates/flaredb-proto/src/raft_server.proto new file mode 100644 index 0000000..9d60043 --- /dev/null +++ b/flaredb/crates/flaredb-proto/src/raft_server.proto @@ -0,0 +1,127 @@ +syntax = "proto3"; + +package raftpb; + +message RaftMessage { + uint64 region_id = 1; + uint64 from_peer_id = 2; + uint64 to_peer_id = 3; + bytes message = 4; // Serialized raft::eraftpb::Message +} + +message RaftResponse {} + +service RaftService { + // Legacy raft-rs message passing + rpc Send(RaftMessage) returns (RaftResponse); + + // OpenRaft v2 RPCs + rpc VoteV2(OpenRaftVoteRequest) returns (OpenRaftVoteResponse); + rpc AppendEntriesV2(OpenRaftAppendEntriesRequest) returns (OpenRaftAppendEntriesResponse); + rpc InstallSnapshotV2(OpenRaftSnapshotRequest) returns (OpenRaftSnapshotResponse); + + // Application-level RPCs + rpc GetMode(GetModeRequest) returns (GetModeResponse); + rpc UpdateNamespaceMode(UpdateNamespaceModeRequest) returns (UpdateNamespaceModeResponse); + rpc ListNamespaceModes(ListNamespaceModesRequest) returns (ListNamespaceModesResponse); + rpc ForwardEventual(ForwardEventualRequest) returns (RaftResponse); + rpc GetMerkle(GetMerkleRequest) returns (GetMerkleResponse); + rpc FetchRange(FetchRangeRequest) returns (FetchRangeResponse); +} + +// OpenRaft v2 messages - using serialized JSON for flexibility +message OpenRaftVoteRequest { + uint64 region_id = 1; + 
bytes data = 2; // JSON-serialized VoteRequest +} + +message OpenRaftVoteResponse { + bytes data = 1; // JSON-serialized VoteResponse +} + +message OpenRaftAppendEntriesRequest { + uint64 region_id = 1; + bytes data = 2; // JSON-serialized AppendEntriesRequest +} + +message OpenRaftAppendEntriesResponse { + bytes data = 1; // JSON-serialized AppendEntriesResponse +} + +message OpenRaftSnapshotRequest { + uint64 region_id = 1; + bytes vote = 2; // JSON-serialized Vote + bytes meta = 3; // JSON-serialized SnapshotMeta + bytes data = 4; // Snapshot data +} + +message OpenRaftSnapshotResponse { + bytes data = 1; // JSON-serialized SnapshotResponse +} + +message GetModeRequest { + string namespace = 1; +} + +message GetModeResponse { + string mode = 1; // "strong" or "eventual" +} + +message UpdateNamespaceModeRequest { + string namespace = 1; + string mode = 2; // "strong" or "eventual" +} + +message UpdateNamespaceModeResponse { + NamespaceMode mode = 1; +} + +message ListNamespaceModesRequest {} + +message ListNamespaceModesResponse { + repeated NamespaceMode namespaces = 1; +} + +message NamespaceMode { + string namespace = 1; + uint32 id = 2; + string mode = 3; + bool from_default = 4; // true when created implicitly from default mode +} + +message ForwardEventualRequest { + uint64 region_id = 5; + uint32 namespace_id = 1; + bytes key = 2; + bytes value = 3; + uint64 ts = 4; +} + +message GetMerkleRequest { + uint64 region_id = 3; + uint32 namespace_id = 1; + uint32 chunk_size = 2; // number of keys per leaf; default server-side if zero +} + +message MerkleRange { + bytes start_key = 1; + bytes end_key = 2; + bytes hash = 3; // sha256 +} + +message GetMerkleResponse { + bytes root = 1; // sha256 root + repeated MerkleRange leaves = 2; +} + +message FetchRangeRequest { + uint64 region_id = 4; + uint32 namespace_id = 1; + bytes start_key = 2; // inclusive prefix key (namespaced) + bytes end_key = 3; // exclusive prefix end +} + +message FetchRangeResponse { + 
repeated bytes keys = 1; + repeated bytes values = 2; +} diff --git a/flaredb/crates/flaredb-raft/Cargo.toml b/flaredb/crates/flaredb-raft/Cargo.toml new file mode 100644 index 0000000..eda0d11 --- /dev/null +++ b/flaredb/crates/flaredb-raft/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "flaredb-raft" +version.workspace = true +edition.workspace = true + +[dependencies] +flaredb-types = { path = "../flaredb-types" } +flaredb-proto = { path = "../flaredb-proto" } +flaredb-storage = { path = "../flaredb-storage" } +tokio.workspace = true +tonic.workspace = true +openraft.workspace = true +serde.workspace = true +serde_json.workspace = true +bincode.workspace = true +tracing.workspace = true + +[dev-dependencies] +tempfile.workspace = true diff --git a/flaredb/crates/flaredb-raft/src/lib.rs b/flaredb/crates/flaredb-raft/src/lib.rs new file mode 100644 index 0000000..662841c --- /dev/null +++ b/flaredb/crates/flaredb-raft/src/lib.rs @@ -0,0 +1,15 @@ +//! OpenRaft implementation for FlareDB +//! +//! This crate provides the Raft consensus layer for FlareDB, built on top of OpenRaft. + +pub mod network; +pub mod persistent_storage; +pub mod raft_node; +pub mod storage; +pub mod types; + +pub use network::{FlareNetwork, FlareNetworkFactory}; +pub use persistent_storage::PersistentFlareStore; +pub use raft_node::{FlareRaftNode, FlareStorage}; +pub use storage::{FlareSnapshot, FlareStateMachine, FlareStore}; +pub use types::*; diff --git a/flaredb/crates/flaredb-raft/src/network.rs b/flaredb/crates/flaredb-raft/src/network.rs new file mode 100644 index 0000000..4c08057 --- /dev/null +++ b/flaredb/crates/flaredb-raft/src/network.rs @@ -0,0 +1,338 @@ +//! RaftNetwork implementation for FlareDB +//! +//! This module implements OpenRaft's RaftNetwork and RaftNetworkFactory traits +//! for inter-node communication using gRPC. 
+ +use crate::types::{FlareNode, FlareNodeId, FlareTypeConfig}; +use openraft::error::{Fatal, NetworkError, RPCError, RaftError, ReplicationClosed, StreamingError}; +use openraft::network::{RPCOption, RaftNetwork, RaftNetworkFactory}; +use openraft::raft::{ + AppendEntriesRequest, AppendEntriesResponse, InstallSnapshotRequest, InstallSnapshotResponse, + SnapshotResponse, VoteRequest, VoteResponse, +}; +use openraft::{OptionalSend, Snapshot, Vote}; +use flaredb_proto::raft_server::raft_service_client::RaftServiceClient; +use std::collections::HashMap; +use std::future::Future; +use std::sync::Arc; +use tokio::sync::RwLock; +use tonic::transport::Channel; +use tracing::{debug, error}; + +/// Type aliases for cleaner code +type FlareRPCError = RPCError>; +type FlareSnapshotError = StreamingError>; +type FlareInstallSnapshotError = + RPCError>; + +/// Network factory for creating per-node connections +pub struct FlareNetworkFactory { + local_id: FlareNodeId, + region_id: u64, + clients: Arc>>>, + node_addrs: Arc>>, +} + +impl FlareNetworkFactory { + pub fn new(local_id: FlareNodeId, region_id: u64) -> Self { + Self { + local_id, + region_id, + clients: Arc::new(RwLock::new(HashMap::new())), + node_addrs: Arc::new(RwLock::new(HashMap::new())), + } + } + + /// Add a peer's address to the registry + pub async fn add_peer(&self, peer_id: FlareNodeId, addr: String) { + let mut addrs = self.node_addrs.write().await; + addrs.insert(peer_id, addr); + } + + /// Remove a peer from the registry + pub async fn remove_peer(&self, peer_id: FlareNodeId) { + let mut addrs = self.node_addrs.write().await; + addrs.remove(&peer_id); + + let mut clients = self.clients.write().await; + clients.remove(&peer_id); + } + + /// Get all known peer addresses + pub async fn get_peers(&self) -> HashMap { + self.node_addrs.read().await.clone() + } +} + +impl Clone for FlareNetworkFactory { + fn clone(&self) -> Self { + Self { + local_id: self.local_id, + region_id: self.region_id, + clients: 
self.clients.clone(), + node_addrs: self.node_addrs.clone(), + } + } +} + +impl RaftNetworkFactory for FlareNetworkFactory { + type Network = FlareNetwork; + + async fn new_client(&mut self, target: FlareNodeId, node: &FlareNode) -> Self::Network { + // Register the node address + if !node.addr.is_empty() { + let mut addrs = self.node_addrs.write().await; + addrs.insert(target, node.addr.clone()); + } + + FlareNetwork { + target, + region_id: self.region_id, + clients: self.clients.clone(), + node_addrs: self.node_addrs.clone(), + } + } +} + +/// Network client for a specific remote node +pub struct FlareNetwork { + target: FlareNodeId, + region_id: u64, + clients: Arc>>>, + node_addrs: Arc>>, +} + +impl FlareNetwork { + /// Get or create a gRPC client for the target node + async fn get_client(&self) -> Result, String> { + // Check cached client + { + let clients = self.clients.read().await; + if let Some(client) = clients.get(&self.target) { + return Ok(client.clone()); + } + } + + // Get target address + let addr = { + let addrs = self.node_addrs.read().await; + addrs.get(&self.target).cloned() + }; + + let addr = addr.ok_or_else(|| format!("address not known for node {}", self.target))?; + + // Create new connection + let endpoint = format!("http://{}", addr); + let channel = Channel::from_shared(endpoint) + .map_err(|e| e.to_string())? 
+ .connect() + .await + .map_err(|e| e.to_string())?; + + let client = RaftServiceClient::new(channel); + + // Cache the client + let mut clients = self.clients.write().await; + clients.insert(self.target, client.clone()); + + Ok(client) + } + + fn network_error(msg: impl ToString) -> NetworkError { + NetworkError::new(&std::io::Error::new( + std::io::ErrorKind::ConnectionRefused, + msg.to_string(), + )) + } +} + +impl RaftNetwork for FlareNetwork { + async fn append_entries( + &mut self, + rpc: AppendEntriesRequest, + _option: RPCOption, + ) -> Result, FlareRPCError> { + debug!( + target = self.target, + region = self.region_id, + entries = rpc.entries.len(), + "sending append_entries" + ); + + let mut client = self + .get_client() + .await + .map_err(|e| RPCError::Network(Self::network_error(e)))?; + + // Serialize the request + let serialized = serde_json::to_vec(&rpc) + .map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?; + + // Send via gRPC using OpenRaft message type + let grpc_req = flaredb_proto::raft_server::OpenRaftAppendEntriesRequest { + region_id: self.region_id, + data: serialized, + }; + + let response = client.append_entries_v2(grpc_req).await.map_err(|e| { + error!(error = %e, "append_entries RPC failed"); + RPCError::Network(Self::network_error(e.to_string())) + })?; + + let resp: AppendEntriesResponse = + serde_json::from_slice(&response.into_inner().data) + .map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?; + + Ok(resp) + } + + async fn install_snapshot( + &mut self, + rpc: InstallSnapshotRequest, + _option: RPCOption, + ) -> Result, FlareInstallSnapshotError> { + debug!( + target = self.target, + region = self.region_id, + "sending install_snapshot" + ); + + let mut client = self + .get_client() + .await + .map_err(|e| RPCError::Network(Self::network_error(e)))?; + + // Serialize the request + let serialized = serde_json::to_vec(&rpc) + .map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?; + 
+ // Send via gRPC + let grpc_req = flaredb_proto::raft_server::OpenRaftSnapshotRequest { + region_id: self.region_id, + vote: vec![], + meta: vec![], + data: serialized, + }; + + let response = client.install_snapshot_v2(grpc_req).await.map_err(|e| { + error!(error = %e, "install_snapshot RPC failed"); + RPCError::Network(Self::network_error(e.to_string())) + })?; + + let resp: InstallSnapshotResponse = + serde_json::from_slice(&response.into_inner().data) + .map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?; + + Ok(resp) + } + + async fn full_snapshot( + &mut self, + vote: Vote, + snapshot: Snapshot, + _cancel: impl Future + OptionalSend + 'static, + _option: RPCOption, + ) -> Result, FlareSnapshotError> { + debug!( + target = self.target, + region = self.region_id, + snapshot_id = %snapshot.meta.snapshot_id, + "sending full_snapshot" + ); + + let mut client = self + .get_client() + .await + .map_err(|e| StreamingError::Network(Self::network_error(e)))?; + + // Read snapshot data + let data = snapshot.snapshot.into_inner(); + + // Serialize metadata + let meta_bytes = serde_json::to_vec(&snapshot.meta) + .map_err(|e| StreamingError::Network(Self::network_error(e.to_string())))?; + + let vote_bytes = serde_json::to_vec(&vote) + .map_err(|e| StreamingError::Network(Self::network_error(e.to_string())))?; + + // Send via gRPC + let grpc_req = flaredb_proto::raft_server::OpenRaftSnapshotRequest { + region_id: self.region_id, + vote: vote_bytes, + meta: meta_bytes, + data, + }; + + let response = client.install_snapshot_v2(grpc_req).await.map_err(|e| { + error!(error = %e, "install_snapshot RPC failed"); + StreamingError::Network(Self::network_error(e.to_string())) + })?; + + let resp: SnapshotResponse = + serde_json::from_slice(&response.into_inner().data) + .map_err(|e| StreamingError::Network(Self::network_error(e.to_string())))?; + + Ok(resp) + } + + async fn vote( + &mut self, + rpc: VoteRequest, + _option: RPCOption, + ) -> Result, 
FlareRPCError> { + debug!( + target = self.target, + region = self.region_id, + "sending vote request" + ); + + let mut client = self + .get_client() + .await + .map_err(|e| RPCError::Network(Self::network_error(e)))?; + + // Serialize the request + let serialized = serde_json::to_vec(&rpc) + .map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?; + + // Send via gRPC + let grpc_req = flaredb_proto::raft_server::OpenRaftVoteRequest { + region_id: self.region_id, + data: serialized, + }; + + let response = client.vote_v2(grpc_req).await.map_err(|e| { + error!(error = %e, "vote RPC failed"); + RPCError::Network(Self::network_error(e.to_string())) + })?; + + let resp: VoteResponse = serde_json::from_slice(&response.into_inner().data) + .map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?; + + Ok(resp) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_network_factory_add_peer() { + let factory = FlareNetworkFactory::new(1, 1); + factory.add_peer(2, "127.0.0.1:50052".to_string()).await; + + let peers = factory.get_peers().await; + assert_eq!(peers.get(&2), Some(&"127.0.0.1:50052".to_string())); + } + + #[tokio::test] + async fn test_network_factory_remove_peer() { + let factory = FlareNetworkFactory::new(1, 1); + factory.add_peer(2, "127.0.0.1:50052".to_string()).await; + factory.remove_peer(2).await; + + let peers = factory.get_peers().await; + assert!(peers.get(&2).is_none()); + } +} diff --git a/flaredb/crates/flaredb-raft/src/persistent_storage.rs b/flaredb/crates/flaredb-raft/src/persistent_storage.rs new file mode 100644 index 0000000..0b2ae2f --- /dev/null +++ b/flaredb/crates/flaredb-raft/src/persistent_storage.rs @@ -0,0 +1,611 @@ +//! Persistent Raft storage for FlareDB +//! +//! This module provides a RocksDB-backed implementation of OpenRaft's RaftStorage trait. 
+ +use crate::storage::{FlareSnapshot, FlareStateMachine}; +use crate::types::{ + FlareEntry, FlareLogId, FlareNode, FlareNodeId, FlareResponse, FlareTypeConfig, +}; +use flaredb_storage::rocks_engine::RocksEngine; +use openraft::storage::{LogState, RaftLogReader, RaftSnapshotBuilder, Snapshot}; +use openraft::{Entry, EntryPayload, OptionalSend, RaftLogId, RaftStorage, SnapshotMeta}; +use openraft::{StorageError, StorageIOError, StoredMembership, Vote}; +use std::fmt::Debug; +use std::io::Cursor; +use std::ops::RangeBounds; +use std::sync::Arc; +use tokio::sync::RwLock; +use tracing::{debug, trace}; + +// Raft state keys +const KEY_VOTE: &str = "vote"; +const KEY_COMMITTED: &str = "committed"; +const KEY_LAST_PURGED: &str = "last_purged"; +const KEY_SNAPSHOT_DATA: &str = "snapshot_data"; +const KEY_SNAPSHOT_META: &str = "snapshot_meta"; +const KEY_SNAPSHOT_IDX: &str = "snapshot_idx"; + +/// Persistent storage implementing RaftStorage for FlareDB +pub struct PersistentFlareStore { + /// RocksDB engine + engine: Arc, + /// State machine (still in-memory for now, but log is persistent) + sm: RwLock, + /// Snapshot counter + snapshot_idx: RwLock, + /// Current snapshot + current_snapshot: RwLock>, +} + +impl PersistentFlareStore { + /// Create a new PersistentFlareStore with recovery from disk + /// + /// On startup, this will: + /// 1. Load the last snapshot from RocksDB (if any) + /// 2. Restore the state machine from the snapshot + /// 3. 
OpenRaft will replay logs since the snapshot automatically + pub fn new(engine: Arc) -> Self { + // Try to recover snapshot from RocksDB + let (sm, snapshot, snapshot_idx) = Self::recover_from_disk(&engine); + + debug!( + has_snapshot = snapshot.is_some(), + snapshot_idx, + "PersistentFlareStore initialized" + ); + + Self { + engine, + sm: RwLock::new(sm), + snapshot_idx: RwLock::new(snapshot_idx), + current_snapshot: RwLock::new(snapshot), + } + } + + /// Recover state from RocksDB on startup + fn recover_from_disk( + engine: &RocksEngine, + ) -> (FlareStateMachine, Option, u64) { + // Load snapshot index + let snapshot_idx = engine + .get_raft_state(KEY_SNAPSHOT_IDX) + .ok() + .flatten() + .and_then(|data| { + if data.len() == 8 { + let mut arr = [0u8; 8]; + arr.copy_from_slice(&data); + Some(u64::from_be_bytes(arr)) + } else { + None + } + }) + .unwrap_or(0); + + // Load snapshot metadata + let snapshot_meta: Option> = engine + .get_raft_state(KEY_SNAPSHOT_META) + .ok() + .flatten() + .and_then(|data| serde_json::from_slice(&data).ok()); + + // Load snapshot data + let snapshot_data: Option> = engine + .get_raft_state(KEY_SNAPSHOT_DATA) + .ok() + .flatten(); + + // If we have both meta and data, reconstruct the snapshot and state machine + match (snapshot_meta, snapshot_data) { + (Some(meta), Some(data)) => { + // Try to deserialize state machine from snapshot (using bincode) + match bincode::deserialize::(&data) { + Ok(sm) => { + debug!( + snapshot_id = %meta.snapshot_id, + "Recovered state machine from snapshot" + ); + let snapshot = FlareSnapshot { + meta, + data, + }; + (sm, Some(snapshot), snapshot_idx) + } + Err(e) => { + tracing::warn!( + error = %e, + "Failed to deserialize state machine from snapshot, starting fresh" + ); + (FlareStateMachine::default(), None, 0) + } + } + } + _ => { + // No snapshot, start with empty state machine + (FlareStateMachine::default(), None, snapshot_idx) + } + } + } + + /// Persist snapshot to RocksDB + fn 
persist_snapshot(&self, snapshot: &FlareSnapshot, idx: u64) -> Result<(), StorageError> { + // Persist snapshot metadata + let meta_data = serde_json::to_vec(&snapshot.meta) + .map_err(|e| StorageIOError::write(&e))?; + self.engine + .put_raft_state(KEY_SNAPSHOT_META, &meta_data) + .map_err(|e| StorageIOError::write(&e))?; + + // Persist snapshot data + self.engine + .put_raft_state(KEY_SNAPSHOT_DATA, &snapshot.data) + .map_err(|e| StorageIOError::write(&e))?; + + // Persist snapshot index + self.engine + .put_raft_state(KEY_SNAPSHOT_IDX, &idx.to_be_bytes()) + .map_err(|e| StorageIOError::write(&e))?; + + debug!( + snapshot_id = %snapshot.meta.snapshot_id, + idx, + "Persisted snapshot to RocksDB" + ); + + Ok(()) + } + + /// Create a new PersistentFlareStore wrapped in Arc + pub fn new_arc(engine: Arc) -> Arc { + Arc::new(Self::new(engine)) + } + + /// Get the current state machine + pub async fn get_state_machine(&self) -> FlareStateMachine { + self.sm.read().await.clone() + } + + /// Read a value from the state machine (eventual consistency) + pub async fn read_kv(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64)> { + let sm = self.sm.read().await; + sm.kv_data.get(&(namespace_id, key.to_vec())).cloned() + } + + /// Read a CAS value from the state machine (strong consistency) + /// Returns (value, version, timestamp) if found + pub async fn read_cas(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64, u64)> { + let sm = self.sm.read().await; + sm.cas_data.get(&(namespace_id, key.to_vec())).cloned() + } + + fn serialize_log_id(log_id: &FlareLogId) -> Vec { + serde_json::to_vec(log_id).unwrap_or_default() + } + + fn deserialize_log_id(data: &[u8]) -> Option { + serde_json::from_slice(data).ok() + } + + fn serialize_vote(vote: &Vote) -> Vec { + serde_json::to_vec(vote).unwrap_or_default() + } + + fn deserialize_vote(data: &[u8]) -> Option> { + serde_json::from_slice(data).ok() + } +} + +impl RaftLogReader for Arc { + async fn try_get_log_entries + 
Clone + Debug + OptionalSend>( + &mut self, + range: RB, + ) -> Result, StorageError> { + let start = match range.start_bound() { + std::ops::Bound::Included(&n) => n, + std::ops::Bound::Excluded(&n) => n + 1, + std::ops::Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + std::ops::Bound::Included(&n) => Some(n + 1), + std::ops::Bound::Excluded(&n) => Some(n), + std::ops::Bound::Unbounded => None, + }; + + let raw_entries = self + .engine + .get_raft_log_range(start, end) + .map_err(|e| StorageIOError::read_logs(&e))?; + + let mut entries = Vec::with_capacity(raw_entries.len()); + for (_, data) in raw_entries { + let entry: FlareEntry = + serde_json::from_slice(&data).map_err(|e| StorageIOError::read_logs(&e))?; + entries.push(entry); + } + + Ok(entries) + } +} + +impl RaftSnapshotBuilder for Arc { + async fn build_snapshot( + &mut self, + ) -> Result, StorageError> { + let data; + let last_applied_log; + let last_membership; + + { + let sm = self.sm.read().await; + // Use bincode for state machine (handles tuple map keys) + data = bincode::serialize(&*sm).map_err(|e| StorageIOError::read_state_machine(&e))?; + last_applied_log = sm.last_applied_log; + last_membership = sm.last_membership.clone(); + } + + let snapshot_idx = { + let mut idx = self.snapshot_idx.write().await; + *idx += 1; + *idx + }; + + let snapshot_id = if let Some(last) = last_applied_log { + format!("{}-{}-{}", last.leader_id, last.index, snapshot_idx) + } else { + format!("--{}", snapshot_idx) + }; + + let meta = SnapshotMeta { + last_log_id: last_applied_log, + last_membership, + snapshot_id, + }; + + let snapshot = FlareSnapshot { + meta: meta.clone(), + data: data.clone(), + }; + + // Persist snapshot to RocksDB for recovery + self.persist_snapshot(&snapshot, snapshot_idx)?; + + { + let mut current = self.current_snapshot.write().await; + *current = Some(snapshot); + } + + Ok(Snapshot { + meta, + snapshot: Box::new(Cursor::new(data)), + }) + } +} + +impl RaftStorage for Arc { 
+ async fn get_log_state(&mut self) -> Result, StorageError> { + // Get last purged log ID from state + let last_purged = self + .engine + .get_raft_state(KEY_LAST_PURGED) + .map_err(|e| StorageIOError::read_logs(&e))? + .and_then(|data| PersistentFlareStore::deserialize_log_id(&data)); + + // Get last log entry from log + let (_, last_idx) = self + .engine + .get_raft_log_bounds() + .map_err(|e| StorageIOError::read_logs(&e))?; + + let last = if let Some(idx) = last_idx { + let entry_data = self + .engine + .get_raft_log(idx) + .map_err(|e| StorageIOError::read_logs(&e))?; + + if let Some(data) = entry_data { + let entry: FlareEntry = + serde_json::from_slice(&data).map_err(|e| StorageIOError::read_logs(&e))?; + Some(*entry.get_log_id()) + } else { + last_purged + } + } else { + last_purged + }; + + Ok(LogState { + last_purged_log_id: last_purged, + last_log_id: last, + }) + } + + async fn save_vote( + &mut self, + vote: &Vote, + ) -> Result<(), StorageError> { + let data = PersistentFlareStore::serialize_vote(vote); + self.engine + .put_raft_state(KEY_VOTE, &data) + .map_err(|e| StorageIOError::write_vote(&e))?; + debug!(?vote, "Saved vote to persistent storage"); + Ok(()) + } + + async fn read_vote(&mut self) -> Result>, StorageError> { + let vote = self + .engine + .get_raft_state(KEY_VOTE) + .map_err(|e| StorageIOError::read_vote(&e))? + .and_then(|data| PersistentFlareStore::deserialize_vote(&data)); + Ok(vote) + } + + async fn save_committed( + &mut self, + committed: Option, + ) -> Result<(), StorageError> { + if let Some(log_id) = committed { + let data = PersistentFlareStore::serialize_log_id(&log_id); + self.engine + .put_raft_state(KEY_COMMITTED, &data) + .map_err(|e| StorageIOError::write(&e))?; + } + Ok(()) + } + + async fn read_committed( + &mut self, + ) -> Result, StorageError> { + let committed = self + .engine + .get_raft_state(KEY_COMMITTED) + .map_err(|e| StorageIOError::read(&e))? 
+ .and_then(|data| PersistentFlareStore::deserialize_log_id(&data)); + Ok(committed) + } + + async fn last_applied_state( + &mut self, + ) -> Result<(Option, StoredMembership), StorageError> { + let sm = self.sm.read().await; + Ok((sm.last_applied_log, sm.last_membership.clone())) + } + + async fn delete_conflict_logs_since( + &mut self, + log_id: FlareLogId, + ) -> Result<(), StorageError> { + let (_, last_idx) = self + .engine + .get_raft_log_bounds() + .map_err(|e| StorageIOError::write_logs(&e))?; + + if let Some(last) = last_idx { + if log_id.index <= last { + self.engine + .delete_raft_log_range(log_id.index, last) + .map_err(|e| StorageIOError::write_logs(&e))?; + } + } + Ok(()) + } + + async fn purge_logs_upto( + &mut self, + log_id: FlareLogId, + ) -> Result<(), StorageError> { + // Save last purged log ID + let data = PersistentFlareStore::serialize_log_id(&log_id); + self.engine + .put_raft_state(KEY_LAST_PURGED, &data) + .map_err(|e| StorageIOError::write_logs(&e))?; + + // Delete logs up to and including log_id.index + let (first_idx, _) = self + .engine + .get_raft_log_bounds() + .map_err(|e| StorageIOError::write_logs(&e))?; + + if let Some(first) = first_idx { + if first <= log_id.index { + self.engine + .delete_raft_log_range(first, log_id.index) + .map_err(|e| StorageIOError::write_logs(&e))?; + } + } + + Ok(()) + } + + async fn append_to_log(&mut self, entries: I) -> Result<(), StorageError> + where + I: IntoIterator + OptionalSend, + { + for entry in entries { + let index = entry.log_id.index; + let data = serde_json::to_vec(&entry) + .map_err(|e| StorageIOError::write_log_entry(*entry.get_log_id(), &e))?; + self.engine + .put_raft_log(index, &data) + .map_err(|e| StorageIOError::write_log_entry(*entry.get_log_id(), &e))?; + trace!(index, "Appended log entry to persistent storage"); + } + Ok(()) + } + + async fn apply_to_state_machine( + &mut self, + entries: &[Entry], + ) -> Result, StorageError> { + let mut res = 
Vec::with_capacity(entries.len()); + let mut sm = self.sm.write().await; + + for entry in entries { + sm.last_applied_log = Some(entry.log_id); + + match &entry.payload { + EntryPayload::Blank => { + res.push(FlareResponse::Ok); + } + EntryPayload::Normal(req) => { + let response = crate::storage::apply_request(&mut sm, req, entry.log_id.index); + res.push(response); + } + EntryPayload::Membership(mem) => { + sm.last_membership = StoredMembership::new(Some(entry.log_id), mem.clone()); + res.push(FlareResponse::Ok); + } + } + } + + Ok(res) + } + + async fn begin_receiving_snapshot( + &mut self, + ) -> Result>>, StorageError> { + Ok(Box::new(Cursor::new(Vec::new()))) + } + + async fn install_snapshot( + &mut self, + meta: &SnapshotMeta, + snapshot: Box>>, + ) -> Result<(), StorageError> { + let new_snapshot = FlareSnapshot { + meta: meta.clone(), + data: snapshot.into_inner(), + }; + + // Update state machine (using bincode to match build_snapshot) + { + let new_sm: FlareStateMachine = bincode::deserialize(&new_snapshot.data) + .map_err(|e| StorageIOError::read_snapshot(Some(new_snapshot.meta.signature()), &e))?; + let mut sm = self.sm.write().await; + *sm = new_sm; + } + + // Persist snapshot to RocksDB for recovery + let snapshot_idx = { + let mut idx = self.snapshot_idx.write().await; + *idx += 1; + *idx + }; + self.persist_snapshot(&new_snapshot, snapshot_idx)?; + + // Update current snapshot in memory + let mut current = self.current_snapshot.write().await; + *current = Some(new_snapshot); + + Ok(()) + } + + async fn get_current_snapshot( + &mut self, + ) -> Result>, StorageError> { + let current = self.current_snapshot.read().await; + match &*current { + Some(snapshot) => Ok(Some(Snapshot { + meta: snapshot.meta.clone(), + snapshot: Box::new(Cursor::new(snapshot.data.clone())), + })), + None => Ok(None), + } + } + + type LogReader = Self; + type SnapshotBuilder = Self; + + async fn get_log_reader(&mut self) -> Self::LogReader { + self.clone() + } + + async fn 
get_snapshot_builder(&mut self) -> Self::SnapshotBuilder { + self.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn create_test_store() -> Arc { + let dir = tempdir().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + PersistentFlareStore::new_arc(engine) + } + + #[tokio::test] + async fn test_vote_persistence() { + let mut store = create_test_store(); + + let vote = Vote::new(5, 1); + store.save_vote(&vote).await.unwrap(); + + let loaded = store.read_vote().await.unwrap().unwrap(); + assert_eq!(loaded.leader_id().term, 5); + assert_eq!(loaded.leader_id().node_id, 1); + } + + #[tokio::test] + async fn test_log_state_initial() { + let mut store = create_test_store(); + + let state = store.get_log_state().await.unwrap(); + assert!(state.last_log_id.is_none()); + assert!(state.last_purged_log_id.is_none()); + } + + #[tokio::test] + async fn test_snapshot_persistence_and_recovery() { + let dir = tempdir().unwrap(); + let path = dir.path().to_str().unwrap(); + + // Create a store and populate state machine + { + let engine = Arc::new(RocksEngine::new(path).unwrap()); + let store = PersistentFlareStore::new_arc(engine); + + // Manually add data to state machine + { + let mut sm = store.sm.write().await; + sm.kv_data + .insert((1, b"key1".to_vec()), (b"value1".to_vec(), 100)); + sm.cas_data + .insert((1, b"cas_key".to_vec()), (b"cas_value".to_vec(), 5, 200)); + sm.last_applied_log = Some(FlareLogId { + leader_id: openraft::LeaderId::new(1, 1), + index: 10, + }); + } + + // Build and persist snapshot + let mut store_mut = store.clone(); + let snapshot = store_mut.build_snapshot().await.unwrap(); + assert!(snapshot.meta.last_log_id.is_some()); + } + + // Reopen store and verify recovery + { + let engine = Arc::new(RocksEngine::new(path).unwrap()); + let store = PersistentFlareStore::new_arc(engine); + + // Verify state machine was recovered + let kv = store.read_kv(1, b"key1").await; + 
assert_eq!(kv, Some((b"value1".to_vec(), 100))); + + let cas = store.read_cas(1, b"cas_key").await; + assert_eq!(cas, Some((b"cas_value".to_vec(), 5, 200))); + + // Verify snapshot is present + let mut store_mut = store.clone(); + let current = store_mut.get_current_snapshot().await.unwrap(); + assert!(current.is_some()); + + let sm = store.get_state_machine().await; + assert!(sm.last_applied_log.is_some()); + assert_eq!(sm.last_applied_log.unwrap().index, 10); + } + } +} diff --git a/flaredb/crates/flaredb-raft/src/raft_node.rs b/flaredb/crates/flaredb-raft/src/raft_node.rs new file mode 100644 index 0000000..3007023 --- /dev/null +++ b/flaredb/crates/flaredb-raft/src/raft_node.rs @@ -0,0 +1,517 @@ +//! FlareRaftNode - High-level wrapper for OpenRaft +//! +//! This module provides a convenient wrapper around OpenRaft's Raft +//! that mirrors the interface of the old Peer struct, making migration easier. + +use crate::network::FlareNetworkFactory; +use crate::persistent_storage::PersistentFlareStore; +use crate::storage::FlareStore; +use crate::types::{ + FlareNode, FlareNodeId, FlareRaft, FlareRequest, FlareResponse, +}; +use flaredb_storage::rocks_engine::RocksEngine; +use openraft::error::{ClientWriteError, InitializeError, RaftError}; +use openraft::storage::Adaptor; +use openraft::{Config, Raft}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::sync::Arc; +use tracing::{info, warn}; + +/// Type alias for ClientWriteError with our types +pub type FlareClientWriteError = RaftError>; + +/// Storage backend for FlareRaftNode +pub enum FlareStorage { + /// In-memory storage (for testing or non-persistent deployments) + InMemory(Arc), + /// Persistent storage backed by RocksDB + Persistent(Arc), +} + +impl FlareStorage { + /// Read a value (eventual consistency) + pub async fn read_kv(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64)> { + match self { + FlareStorage::InMemory(s) => s.read_kv(namespace_id, key).await, + 
FlareStorage::Persistent(s) => s.read_kv(namespace_id, key).await, + } + } + + /// Read a CAS value (strong consistency) + pub async fn read_cas(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64, u64)> { + match self { + FlareStorage::InMemory(s) => s.read_cas(namespace_id, key).await, + FlareStorage::Persistent(s) => s.read_cas(namespace_id, key).await, + } + } +} + +/// Wrapper around OpenRaft for easier migration from raft-rs +pub struct FlareRaftNode { + /// The underlying OpenRaft instance + pub raft: FlareRaft, + + /// Node ID (same as store_id) + store_id: FlareNodeId, + + /// Region ID this node belongs to + region_id: u64, + + /// Network factory for peer communication + network_factory: FlareNetworkFactory, + + /// Storage instance for direct reads + storage: FlareStorage, +} + +impl FlareRaftNode { + /// Create OpenRaft config with sensible defaults + fn create_raft_config(region_id: u64) -> Result, Box> { + let raft_config = Config { + cluster_name: format!("flare-region-{}", region_id), + heartbeat_interval: 100, + election_timeout_min: 300, + election_timeout_max: 600, + install_snapshot_timeout: 10000, + replication_lag_threshold: 1000, + snapshot_policy: openraft::SnapshotPolicy::LogsSinceLast(1000), + max_in_snapshot_log_to_keep: 100, + ..Default::default() + }; + Ok(Arc::new(raft_config.validate()?)) + } + + /// Create a new Raft node with in-memory storage (for testing) + pub async fn new( + store_id: FlareNodeId, + region_id: u64, + network_factory: FlareNetworkFactory, + voters: Vec, + peer_addrs: &HashMap, + ) -> Result> { + let raft_config = Self::create_raft_config(region_id)?; + + // Create in-memory storage + let storage = FlareStore::new_arc(); + + // Create adaptor that provides both RaftLogStorage and RaftStateMachine + let (log_store, state_machine) = Adaptor::new(storage.clone()); + + // Create the Raft instance + let raft = Raft::new( + store_id, + raft_config, + network_factory.clone(), + log_store, + state_machine, + ) + 
.await?; + + let node = Self { + raft, + store_id, + region_id, + network_factory, + storage: FlareStorage::InMemory(storage), + }; + + // Initialize cluster if voters are provided and cluster is not already initialized + if !voters.is_empty() { + node.try_initialize(voters, peer_addrs).await; + } + + Ok(node) + } + + /// Create a new Raft node with persistent RocksDB storage + /// + /// This constructor persists Raft logs, votes, and state to RocksDB, + /// ensuring data survives node restarts. + pub async fn new_persistent( + store_id: FlareNodeId, + region_id: u64, + network_factory: FlareNetworkFactory, + voters: Vec, + peer_addrs: &HashMap, + engine: Arc, + ) -> Result> { + let raft_config = Self::create_raft_config(region_id)?; + + // Create persistent storage backed by RocksDB + let storage = PersistentFlareStore::new_arc(engine); + + // Create adaptor that provides both RaftLogStorage and RaftStateMachine + let (log_store, state_machine) = Adaptor::new(storage.clone()); + + // Create the Raft instance + let raft = Raft::new( + store_id, + raft_config, + network_factory.clone(), + log_store, + state_machine, + ) + .await?; + + let node = Self { + raft, + store_id, + region_id, + network_factory, + storage: FlareStorage::Persistent(storage), + }; + + // Initialize cluster if voters are provided and cluster is not already initialized + if !voters.is_empty() { + node.try_initialize(voters, peer_addrs).await; + } + + info!( + store_id, + region_id, + "Created persistent Raft node with RocksDB storage" + ); + + Ok(node) + } + + /// Try to initialize the cluster with the given voters + async fn try_initialize(&self, voters: Vec, peer_addrs: &HashMap) { + // Build membership from voters with their addresses + let membership: BTreeMap = voters + .into_iter() + .map(|id| { + let addr = peer_addrs.get(&id).cloned().unwrap_or_default(); + ( + id, + FlareNode { + addr, + region_id: self.region_id, + }, + ) + }) + .collect(); + + match 
self.raft.initialize(membership).await { + Ok(_) => { + info!( + store_id = self.store_id, + region_id = self.region_id, + "cluster initialized" + ); + } + Err(RaftError::APIError(InitializeError::NotAllowed(_))) => { + // Already initialized, this is fine + } + Err(e) => { + warn!( + error = %e, + store_id = self.store_id, + region_id = self.region_id, + "failed to initialize cluster" + ); + } + } + } + + /// Get the store ID + pub fn store_id(&self) -> FlareNodeId { + self.store_id + } + + /// Get the region ID + pub fn region_id(&self) -> u64 { + self.region_id + } + + /// Check if this node is the current leader + pub async fn is_leader(&self) -> bool { + self.raft + .current_leader() + .await + .map(|id| id == self.store_id) + .unwrap_or(false) + } + + /// Get the current leader ID + pub async fn leader_id(&self) -> Option { + self.raft.current_leader().await + } + + /// Get current membership + pub async fn get_membership(&self) -> BTreeSet { + self.raft + .metrics() + .borrow() + .membership_config + .membership() + .voter_ids() + .collect() + } + + /// Propose a KV write operation (replaces peer.propose_kv) + /// Uses Last-Write-Wins for eventual consistency + pub async fn write_kv( + &self, + namespace_id: u32, + key: Vec, + value: Vec, + ts: u64, + ) -> Result { + let req = FlareRequest::KvWrite { + namespace_id, + key, + value, + ts, + }; + + let result = self.raft.client_write(req).await?; + Ok(result.data) + } + + /// Propose a CAS (Compare-And-Swap) write operation + /// CAS is executed atomically within the Raft state machine, + /// ensuring linearizable semantics across all replicas. 
+ pub async fn cas_write( + &self, + namespace_id: u32, + key: Vec, + value: Vec, + expected_version: u64, + ts: u64, + ) -> Result { + let req = FlareRequest::CasWrite { + namespace_id, + key, + value, + expected_version, + ts, + }; + + let result = self.raft.client_write(req).await?; + Ok(result.data) + } + + /// Propose a KV delete operation (eventual consistency) + /// Uses Last-Write-Wins semantics, similar to write_kv + pub async fn delete_kv( + &self, + namespace_id: u32, + key: Vec, + ts: u64, + ) -> Result { + let req = FlareRequest::KvDelete { + namespace_id, + key, + ts, + }; + + let result = self.raft.client_write(req).await?; + Ok(result.data) + } + + /// Propose a CAS delete operation (strong consistency) + /// Deletes the key if the version matches expected_version. + /// If expected_version is 0, deletes if key exists (no version check). + pub async fn cas_delete( + &self, + namespace_id: u32, + key: Vec, + expected_version: u64, + ts: u64, + ) -> Result { + let req = FlareRequest::CasDelete { + namespace_id, + key, + expected_version, + ts, + }; + + let result = self.raft.client_write(req).await?; + Ok(result.data) + } + + /// Read a value directly from the state machine (local read, eventual consistency) + pub async fn read_kv(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64)> { + self.storage.read_kv(namespace_id, key).await + } + + /// Read a CAS value directly from the state machine (local read) + /// Returns (value, version, timestamp) if found + pub async fn read_cas(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64, u64)> { + self.storage.read_cas(namespace_id, key).await + } + + /// Perform a linearizable read (ensures we read from committed state) + /// + /// This method first calls `ensure_linearizable()` which confirms this node + /// has the most recent committed data, then reads from local state machine. + /// This provides strong consistency guarantees. 
+ pub async fn linearizable_read_kv( + &self, + namespace_id: u32, + key: &[u8], + ) -> Result, u64)>, String> { + // Ensure we have the latest committed state + self.raft + .ensure_linearizable() + .await + .map_err(|e| format!("Linearizable read failed: {}", e))?; + + // Now safe to read from local state machine + Ok(self.storage.read_kv(namespace_id, key).await) + } + + /// Perform a linearizable CAS read (ensures we read from committed state) + /// Returns (value, version, timestamp) if found + pub async fn linearizable_read_cas( + &self, + namespace_id: u32, + key: &[u8], + ) -> Result, u64, u64)>, String> { + // Ensure we have the latest committed state + self.raft + .ensure_linearizable() + .await + .map_err(|e| format!("Linearizable read failed: {}", e))?; + + // Now safe to read from local state machine + Ok(self.storage.read_cas(namespace_id, key).await) + } + + /// Propose a split operation + pub async fn propose_split( + &self, + region_id: u64, + split_key: Vec, + new_region_id: u64, + ) -> Result { + let req = FlareRequest::Split { + region_id, + split_key, + new_region_id, + }; + + let result = self.raft.client_write(req).await?; + Ok(result.data) + } + + /// Trigger a leader election (replaces peer.campaign) + pub async fn trigger_election(&self) -> Result<(), Box> { + self.raft.trigger().elect().await?; + Ok(()) + } + + /// Add a learner node (first step in membership change) + pub async fn add_learner( + &self, + node_id: FlareNodeId, + node: FlareNode, + ) -> Result<(), FlareClientWriteError> { + self.raft.add_learner(node_id, node, true).await?; + Ok(()) + } + + /// Change the cluster membership + pub async fn change_membership( + &self, + members: BTreeSet, + ) -> Result<(), FlareClientWriteError> { + self.raft.change_membership(members, false).await?; + Ok(()) + } + + /// Add a voter to the cluster (convenience method) + pub async fn add_voter(&self, node_id: FlareNodeId, addr: String) -> Result<(), String> { + let node = FlareNode { + addr: 
addr.clone(), + region_id: self.region_id, + }; + + // First add as learner + self.add_learner(node_id, node) + .await + .map_err(|e| e.to_string())?; + + // Then promote to voter + let mut members = self.get_membership().await; + members.insert(node_id); + + self.change_membership(members) + .await + .map_err(|e| e.to_string())?; + + // Register with network factory + self.network_factory.add_peer(node_id, addr).await; + + Ok(()) + } + + /// Remove a voter from the cluster + pub async fn remove_voter(&self, node_id: FlareNodeId) -> Result<(), String> { + let mut members = self.get_membership().await; + members.remove(&node_id); + + self.change_membership(members) + .await + .map_err(|e| e.to_string())?; + + // Remove from network factory + self.network_factory.remove_peer(node_id).await; + + Ok(()) + } + + /// Get the network factory + pub fn network_factory(&self) -> &FlareNetworkFactory { + &self.network_factory + } + + /// Get the storage instance + pub fn storage(&self) -> &FlareStorage { + &self.storage + } + + /// Shutdown the Raft node gracefully + pub async fn shutdown(&self) -> Result<(), Box> { + self.raft.shutdown().await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + async fn create_test_node() -> FlareRaftNode { + let network_factory = FlareNetworkFactory::new(1, 1); + let mut peer_addrs = HashMap::new(); + peer_addrs.insert(1, "127.0.0.1:50051".to_string()); + + FlareRaftNode::new(1, 1, network_factory, vec![1], &peer_addrs) + .await + .unwrap() + } + + #[tokio::test] + async fn test_create_node() { + let node = create_test_node().await; + assert_eq!(node.store_id(), 1); + assert_eq!(node.region_id(), 1); + } + + #[tokio::test] + async fn test_leader_election() { + let node = create_test_node().await; + + // In a single-node cluster, the node should become leader after triggering election + node.trigger_election().await.unwrap(); + + // Give some time for election + 
tokio::time::sleep(Duration::from_millis(500)).await; + + // Should be leader in a single-node cluster + assert!(node.is_leader().await); + } +} diff --git a/flaredb/crates/flaredb-raft/src/storage.rs b/flaredb/crates/flaredb-raft/src/storage.rs new file mode 100644 index 0000000..2402fb0 --- /dev/null +++ b/flaredb/crates/flaredb-raft/src/storage.rs @@ -0,0 +1,809 @@ +//! In-memory Raft storage for FlareDB +//! +//! This module provides an in-memory implementation of OpenRaft's RaftStorage trait, +//! similar to openraft-memstore but using FlareDB's type configuration. + +use crate::types::{ + FlareEntry, FlareLogId, FlareNode, FlareNodeId, FlareRequest, FlareResponse, + FlareStoredMembership, FlareTypeConfig, +}; +use openraft::storage::{LogState, RaftLogReader, RaftSnapshotBuilder, Snapshot}; +use openraft::{Entry, EntryPayload, OptionalSend, RaftLogId, RaftStorage, SnapshotMeta}; +use openraft::{StorageError, StorageIOError, StoredMembership, Vote}; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::fmt::Debug; +use std::io::Cursor; +use std::ops::RangeBounds; +use std::sync::Arc; +use tokio::sync::RwLock; + +/// Snapshot data for FlareDB +#[derive(Debug)] +pub struct FlareSnapshot { + pub meta: SnapshotMeta, + pub data: Vec, +} + +/// State machine for FlareDB +#[derive(Serialize, Deserialize, Debug, Default, Clone)] +pub struct FlareStateMachine { + pub last_applied_log: Option, + pub last_membership: FlareStoredMembership, + /// Key-value data: (namespace_id, key) -> (value, timestamp) + /// Used for eventual consistency (Last-Write-Wins) + pub kv_data: BTreeMap<(u32, Vec), (Vec, u64)>, + /// CAS data: (namespace_id, key) -> (value, version, timestamp) + /// Used for strong consistency (Compare-And-Swap) + pub cas_data: BTreeMap<(u32, Vec), (Vec, u64, u64)>, +} + +/// In-memory storage implementing RaftStorage for FlareDB +pub struct FlareStore { + last_purged_log_id: RwLock>, + committed: RwLock>, + /// Raft log entries stored 
as serialized JSON + log: RwLock>, + /// State machine + sm: RwLock, + /// Current vote + vote: RwLock>>, + /// Snapshot counter + snapshot_idx: RwLock, + /// Current snapshot + current_snapshot: RwLock>, +} + +impl FlareStore { + /// Create a new FlareStore + pub fn new() -> Self { + Self { + last_purged_log_id: RwLock::new(None), + committed: RwLock::new(None), + log: RwLock::new(BTreeMap::new()), + sm: RwLock::new(FlareStateMachine::default()), + vote: RwLock::new(None), + snapshot_idx: RwLock::new(0), + current_snapshot: RwLock::new(None), + } + } + + /// Create a new FlareStore wrapped in Arc + pub fn new_arc() -> Arc { + Arc::new(Self::new()) + } + + /// Get the current state machine (for testing) + pub async fn get_state_machine(&self) -> FlareStateMachine { + self.sm.read().await.clone() + } + + /// Read a value from the state machine (eventual consistency) + pub async fn read_kv(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64)> { + let sm = self.sm.read().await; + sm.kv_data.get(&(namespace_id, key.to_vec())).cloned() + } + + /// Read a CAS value from the state machine (strong consistency) + /// Returns (value, version, timestamp) if found + pub async fn read_cas(&self, namespace_id: u32, key: &[u8]) -> Option<(Vec, u64, u64)> { + let sm = self.sm.read().await; + sm.cas_data.get(&(namespace_id, key.to_vec())).cloned() + } +} + +impl Default for FlareStore { + fn default() -> Self { + Self::new() + } +} + +impl RaftLogReader for Arc { + async fn try_get_log_entries + Clone + Debug + OptionalSend>( + &mut self, + range: RB, + ) -> Result, StorageError> { + let mut entries = vec![]; + let log = self.log.read().await; + + for (_, serialized) in log.range(range) { + let ent: FlareEntry = + serde_json::from_str(serialized).map_err(|e| StorageIOError::read_logs(&e))?; + entries.push(ent); + } + + Ok(entries) + } +} + +impl RaftSnapshotBuilder for Arc { + async fn build_snapshot(&mut self) -> Result, StorageError> { + let data; + let last_applied_log; + 
let last_membership; + + { + let sm = self.sm.read().await; + // Use bincode for state machine (handles tuple map keys) + data = bincode::serialize(&*sm).map_err(|e| StorageIOError::read_state_machine(&e))?; + last_applied_log = sm.last_applied_log; + last_membership = sm.last_membership.clone(); + } + + let snapshot_idx = { + let mut idx = self.snapshot_idx.write().await; + *idx += 1; + *idx + }; + + let snapshot_id = if let Some(last) = last_applied_log { + format!("{}-{}-{}", last.leader_id, last.index, snapshot_idx) + } else { + format!("--{}", snapshot_idx) + }; + + let meta = SnapshotMeta { + last_log_id: last_applied_log, + last_membership, + snapshot_id, + }; + + let snapshot = FlareSnapshot { + meta: meta.clone(), + data: data.clone(), + }; + + { + let mut current = self.current_snapshot.write().await; + *current = Some(snapshot); + } + + Ok(Snapshot { + meta, + snapshot: Box::new(Cursor::new(data)), + }) + } +} + +impl RaftStorage for Arc { + async fn get_log_state(&mut self) -> Result, StorageError> { + let log = self.log.read().await; + let last_serialized = log.iter().next_back().map(|(_, ent)| ent); + + let last = match last_serialized { + None => None, + Some(serialized) => { + let ent: FlareEntry = + serde_json::from_str(serialized).map_err(|e| StorageIOError::read_logs(&e))?; + Some(*ent.get_log_id()) + } + }; + + let last_purged = *self.last_purged_log_id.read().await; + + let last = match last { + None => last_purged, + Some(x) => Some(x), + }; + + Ok(LogState { + last_purged_log_id: last_purged, + last_log_id: last, + }) + } + + async fn save_vote(&mut self, vote: &Vote) -> Result<(), StorageError> { + let mut v = self.vote.write().await; + *v = Some(*vote); + Ok(()) + } + + async fn read_vote(&mut self) -> Result>, StorageError> { + Ok(*self.vote.read().await) + } + + async fn save_committed( + &mut self, + committed: Option, + ) -> Result<(), StorageError> { + let mut c = self.committed.write().await; + *c = committed; + Ok(()) + } + + async 
fn read_committed(&mut self) -> Result, StorageError> { + Ok(*self.committed.read().await) + } + + async fn last_applied_state( + &mut self, + ) -> Result<(Option, StoredMembership), StorageError> + { + let sm = self.sm.read().await; + Ok((sm.last_applied_log, sm.last_membership.clone())) + } + + async fn delete_conflict_logs_since( + &mut self, + log_id: FlareLogId, + ) -> Result<(), StorageError> { + let mut log = self.log.write().await; + let keys: Vec<_> = log.range(log_id.index..).map(|(k, _)| *k).collect(); + for key in keys { + log.remove(&key); + } + Ok(()) + } + + async fn purge_logs_upto(&mut self, log_id: FlareLogId) -> Result<(), StorageError> { + { + let mut ld = self.last_purged_log_id.write().await; + assert!(*ld <= Some(log_id)); + *ld = Some(log_id); + } + + { + let mut log = self.log.write().await; + let keys: Vec<_> = log.range(..=log_id.index).map(|(k, _)| *k).collect(); + for key in keys { + log.remove(&key); + } + } + + Ok(()) + } + + async fn append_to_log(&mut self, entries: I) -> Result<(), StorageError> + where + I: IntoIterator + OptionalSend, + { + let mut log = self.log.write().await; + for entry in entries { + let s = serde_json::to_string(&entry) + .map_err(|e| StorageIOError::write_log_entry(*entry.get_log_id(), &e))?; + log.insert(entry.log_id.index, s); + } + Ok(()) + } + + async fn apply_to_state_machine( + &mut self, + entries: &[Entry], + ) -> Result, StorageError> { + let mut res = Vec::with_capacity(entries.len()); + let mut sm = self.sm.write().await; + + for entry in entries { + sm.last_applied_log = Some(entry.log_id); + + match &entry.payload { + EntryPayload::Blank => { + res.push(FlareResponse::Ok); + } + EntryPayload::Normal(req) => { + let response = apply_request(&mut sm, req, entry.log_id.index); + res.push(response); + } + EntryPayload::Membership(mem) => { + sm.last_membership = StoredMembership::new(Some(entry.log_id), mem.clone()); + res.push(FlareResponse::Ok); + } + } + } + + Ok(res) + } + + async fn 
begin_receiving_snapshot( + &mut self, + ) -> Result>>, StorageError> { + Ok(Box::new(Cursor::new(Vec::new()))) + } + + async fn install_snapshot( + &mut self, + meta: &SnapshotMeta, + snapshot: Box>>, + ) -> Result<(), StorageError> { + let new_snapshot = FlareSnapshot { + meta: meta.clone(), + data: snapshot.into_inner(), + }; + + // Update state machine (using bincode to match build_snapshot) + { + let new_sm: FlareStateMachine = bincode::deserialize(&new_snapshot.data) + .map_err(|e| StorageIOError::read_snapshot(Some(new_snapshot.meta.signature()), &e))?; + let mut sm = self.sm.write().await; + *sm = new_sm; + } + + // Update current snapshot + let mut current = self.current_snapshot.write().await; + *current = Some(new_snapshot); + + Ok(()) + } + + async fn get_current_snapshot( + &mut self, + ) -> Result>, StorageError> { + let current = self.current_snapshot.read().await; + match &*current { + Some(snapshot) => Ok(Some(Snapshot { + meta: snapshot.meta.clone(), + snapshot: Box::new(Cursor::new(snapshot.data.clone())), + })), + None => Ok(None), + } + } + + type LogReader = Self; + type SnapshotBuilder = Self; + + async fn get_log_reader(&mut self) -> Self::LogReader { + self.clone() + } + + async fn get_snapshot_builder(&mut self) -> Self::SnapshotBuilder { + self.clone() + } +} + +/// Apply a request to the state machine +pub fn apply_request(sm: &mut FlareStateMachine, req: &FlareRequest, index: u64) -> FlareResponse { + match req { + FlareRequest::KvWrite { + namespace_id, + key, + value, + ts, + } => { + let key_tuple = (*namespace_id, key.clone()); + + // Last-Write-Wins: only apply if timestamp is newer + let should_apply = match sm.kv_data.get(&key_tuple) { + Some((_, existing_ts)) => *ts >= *existing_ts, + None => true, + }; + + if should_apply { + sm.kv_data.insert(key_tuple, (value.clone(), *ts)); + } + + FlareResponse::Applied { index } + } + FlareRequest::KvDelete { + namespace_id, + key, + ts, + } => { + let key_tuple = (*namespace_id, 
key.clone()); + + // Last-Write-Wins: only apply if timestamp is newer + let (should_apply, existed) = match sm.kv_data.get(&key_tuple) { + Some((_, existing_ts)) => (*ts >= *existing_ts, true), + None => (true, false), + }; + + if should_apply { + sm.kv_data.remove(&key_tuple); + } + + FlareResponse::DeleteResult { existed } + } + FlareRequest::CasWrite { + namespace_id, + key, + value, + expected_version, + ts, + } => { + let key_tuple = (*namespace_id, key.clone()); + + // Get current version (0 if key doesn't exist) + let current_version = sm + .cas_data + .get(&key_tuple) + .map(|(_, v, _)| *v) + .unwrap_or(0); + + if current_version != *expected_version { + // Version mismatch - CAS fails + FlareResponse::CasResult { + success: false, + current_version, + new_version: 0, + } + } else { + // Version matches - apply CAS atomically + let new_version = current_version + 1; + sm.cas_data + .insert(key_tuple, (value.clone(), new_version, *ts)); + FlareResponse::CasResult { + success: true, + current_version, + new_version, + } + } + } + FlareRequest::CasDelete { + namespace_id, + key, + expected_version, + ts: _, + } => { + let key_tuple = (*namespace_id, key.clone()); + + // Get current version (0 if key doesn't exist) + let current_version = sm + .cas_data + .get(&key_tuple) + .map(|(_, v, _)| *v) + .unwrap_or(0); + + // If expected_version is 0, delete if exists (no version check) + // Otherwise, only delete if version matches + let should_delete = if *expected_version == 0 { + current_version > 0 // Key exists + } else { + current_version == *expected_version + }; + + if should_delete { + sm.cas_data.remove(&key_tuple); + FlareResponse::CasDeleteResult { + success: true, + current_version, + existed: true, + } + } else { + FlareResponse::CasDeleteResult { + success: false, + current_version, + existed: current_version > 0, + } + } + } + FlareRequest::Split { + region_id: _, + split_key: _, + new_region_id, + } => { + // Split handling would be done at a higher 
level + FlareResponse::SplitCreated { + new_region_id: *new_region_id, + } + } + FlareRequest::Noop => FlareResponse::Ok, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_create_store() { + let _store = FlareStore::new_arc(); + } + + #[tokio::test] + async fn test_apply_kv_write() { + let mut sm = FlareStateMachine::default(); + let req = FlareRequest::KvWrite { + namespace_id: 1, + key: b"test_key".to_vec(), + value: b"test_value".to_vec(), + ts: 100, + }; + + let response = apply_request(&mut sm, &req, 1); + assert!(matches!(response, FlareResponse::Applied { index: 1 })); + + let data = sm.kv_data.get(&(1, b"test_key".to_vec())); + assert_eq!(data, Some(&(b"test_value".to_vec(), 100))); + } + + #[tokio::test] + async fn test_lww_conflict_resolution() { + let mut sm = FlareStateMachine::default(); + + // First write with ts=100 + let req1 = FlareRequest::KvWrite { + namespace_id: 1, + key: b"key".to_vec(), + value: b"value1".to_vec(), + ts: 100, + }; + apply_request(&mut sm, &req1, 1); + + // Second write with ts=50 (older, should not apply) + let req2 = FlareRequest::KvWrite { + namespace_id: 1, + key: b"key".to_vec(), + value: b"value2".to_vec(), + ts: 50, + }; + apply_request(&mut sm, &req2, 2); + + // Value should still be "value1" + let data = sm.kv_data.get(&(1, b"key".to_vec())); + assert_eq!(data, Some(&(b"value1".to_vec(), 100))); + + // Third write with ts=200 (newer, should apply) + let req3 = FlareRequest::KvWrite { + namespace_id: 1, + key: b"key".to_vec(), + value: b"value3".to_vec(), + ts: 200, + }; + apply_request(&mut sm, &req3, 3); + + // Value should now be "value3" + let data = sm.kv_data.get(&(1, b"key".to_vec())); + assert_eq!(data, Some(&(b"value3".to_vec(), 200))); + } + + #[tokio::test] + async fn test_apply_cas_write_success() { + let mut sm = FlareStateMachine::default(); + + // Create new key (expected version 0) + let req = FlareRequest::CasWrite { + namespace_id: 1, + key: b"cas_key".to_vec(), + 
value: b"value1".to_vec(), + expected_version: 0, + ts: 100, + }; + + let response = apply_request(&mut sm, &req, 1); + assert!(matches!( + response, + FlareResponse::CasResult { + success: true, + current_version: 0, + new_version: 1, + } + )); + + let data = sm.cas_data.get(&(1, b"cas_key".to_vec())); + assert_eq!(data, Some(&(b"value1".to_vec(), 1, 100))); + + // Update with expected version 1 + let req2 = FlareRequest::CasWrite { + namespace_id: 1, + key: b"cas_key".to_vec(), + value: b"value2".to_vec(), + expected_version: 1, + ts: 200, + }; + + let response2 = apply_request(&mut sm, &req2, 2); + assert!(matches!( + response2, + FlareResponse::CasResult { + success: true, + current_version: 1, + new_version: 2, + } + )); + + let data2 = sm.cas_data.get(&(1, b"cas_key".to_vec())); + assert_eq!(data2, Some(&(b"value2".to_vec(), 2, 200))); + } + + #[tokio::test] + async fn test_apply_cas_write_conflict() { + let mut sm = FlareStateMachine::default(); + + // Create key + let req1 = FlareRequest::CasWrite { + namespace_id: 1, + key: b"cas_key".to_vec(), + value: b"value1".to_vec(), + expected_version: 0, + ts: 100, + }; + apply_request(&mut sm, &req1, 1); + + // Try to update with wrong version (expected 0, but actual is 1) + let req2 = FlareRequest::CasWrite { + namespace_id: 1, + key: b"cas_key".to_vec(), + value: b"value2".to_vec(), + expected_version: 0, // Wrong! 
Should be 1 + ts: 200, + }; + + let response = apply_request(&mut sm, &req2, 2); + assert!(matches!( + response, + FlareResponse::CasResult { + success: false, + current_version: 1, + new_version: 0, + } + )); + + // Value should remain unchanged + let data = sm.cas_data.get(&(1, b"cas_key".to_vec())); + assert_eq!(data, Some(&(b"value1".to_vec(), 1, 100))); + } + + #[tokio::test] + async fn test_apply_kv_delete() { + let mut sm = FlareStateMachine::default(); + + // First write a value + let req_write = FlareRequest::KvWrite { + namespace_id: 1, + key: b"key".to_vec(), + value: b"value".to_vec(), + ts: 100, + }; + apply_request(&mut sm, &req_write, 1); + + // Delete the key + let req_delete = FlareRequest::KvDelete { + namespace_id: 1, + key: b"key".to_vec(), + ts: 200, + }; + let response = apply_request(&mut sm, &req_delete, 2); + assert!(matches!(response, FlareResponse::DeleteResult { existed: true })); + + // Key should be deleted + let data = sm.kv_data.get(&(1, b"key".to_vec())); + assert_eq!(data, None); + } + + #[tokio::test] + async fn test_apply_kv_delete_nonexistent() { + let mut sm = FlareStateMachine::default(); + + // Delete a non-existent key + let req_delete = FlareRequest::KvDelete { + namespace_id: 1, + key: b"nonexistent".to_vec(), + ts: 100, + }; + let response = apply_request(&mut sm, &req_delete, 1); + assert!(matches!(response, FlareResponse::DeleteResult { existed: false })); + } + + #[tokio::test] + async fn test_apply_kv_delete_lww() { + let mut sm = FlareStateMachine::default(); + + // Write with ts=200 + let req_write = FlareRequest::KvWrite { + namespace_id: 1, + key: b"key".to_vec(), + value: b"value".to_vec(), + ts: 200, + }; + apply_request(&mut sm, &req_write, 1); + + // Try to delete with older timestamp (ts=100) - should not delete + let req_delete = FlareRequest::KvDelete { + namespace_id: 1, + key: b"key".to_vec(), + ts: 100, + }; + apply_request(&mut sm, &req_delete, 2); + + // Key should still exist (delete was ignored due 
to older timestamp) + let data = sm.kv_data.get(&(1, b"key".to_vec())); + assert_eq!(data, Some(&(b"value".to_vec(), 200))); + } + + #[tokio::test] + async fn test_apply_cas_delete_success() { + let mut sm = FlareStateMachine::default(); + + // Create key with CAS + let req_write = FlareRequest::CasWrite { + namespace_id: 1, + key: b"cas_key".to_vec(), + value: b"value".to_vec(), + expected_version: 0, + ts: 100, + }; + apply_request(&mut sm, &req_write, 1); + + // Delete with correct version + let req_delete = FlareRequest::CasDelete { + namespace_id: 1, + key: b"cas_key".to_vec(), + expected_version: 1, + ts: 200, + }; + let response = apply_request(&mut sm, &req_delete, 2); + assert!(matches!( + response, + FlareResponse::CasDeleteResult { + success: true, + current_version: 1, + existed: true, + } + )); + + // Key should be deleted + let data = sm.cas_data.get(&(1, b"cas_key".to_vec())); + assert_eq!(data, None); + } + + #[tokio::test] + async fn test_apply_cas_delete_version_mismatch() { + let mut sm = FlareStateMachine::default(); + + // Create key + let req_write = FlareRequest::CasWrite { + namespace_id: 1, + key: b"cas_key".to_vec(), + value: b"value".to_vec(), + expected_version: 0, + ts: 100, + }; + apply_request(&mut sm, &req_write, 1); + + // Try to delete with wrong version + let req_delete = FlareRequest::CasDelete { + namespace_id: 1, + key: b"cas_key".to_vec(), + expected_version: 99, // Wrong version + ts: 200, + }; + let response = apply_request(&mut sm, &req_delete, 2); + assert!(matches!( + response, + FlareResponse::CasDeleteResult { + success: false, + current_version: 1, + existed: true, + } + )); + + // Key should still exist + let data = sm.cas_data.get(&(1, b"cas_key".to_vec())); + assert_eq!(data, Some(&(b"value".to_vec(), 1, 100))); + } + + #[tokio::test] + async fn test_apply_cas_delete_if_exists() { + let mut sm = FlareStateMachine::default(); + + // Create key + let req_write = FlareRequest::CasWrite { + namespace_id: 1, + key: 
b"cas_key".to_vec(), + value: b"value".to_vec(), + expected_version: 0, + ts: 100, + }; + apply_request(&mut sm, &req_write, 1); + + // Delete with expected_version=0 (delete if exists) + let req_delete = FlareRequest::CasDelete { + namespace_id: 1, + key: b"cas_key".to_vec(), + expected_version: 0, + ts: 200, + }; + let response = apply_request(&mut sm, &req_delete, 2); + assert!(matches!( + response, + FlareResponse::CasDeleteResult { + success: true, + current_version: 1, + existed: true, + } + )); + + // Key should be deleted + let data = sm.cas_data.get(&(1, b"cas_key".to_vec())); + assert_eq!(data, None); + } +} diff --git a/flaredb/crates/flaredb-raft/src/types.rs b/flaredb/crates/flaredb-raft/src/types.rs new file mode 100644 index 0000000..416b87a --- /dev/null +++ b/flaredb/crates/flaredb-raft/src/types.rs @@ -0,0 +1,155 @@ +//! Type definitions for OpenRaft integration +//! +//! This module defines the core types used throughout the OpenRaft implementation, +//! including the RaftTypeConfig, request/response types, and node metadata. + +use serde::{Deserialize, Serialize}; +use std::io::Cursor; + +/// Node ID type - matches existing store_id +pub type FlareNodeId = u64; + +/// Request types for the state machine +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum FlareRequest { + /// Key-value write operation (Last-Write-Wins, eventual consistency) + KvWrite { + namespace_id: u32, + key: Vec, + value: Vec, + ts: u64, + }, + /// Key-value delete operation (Last-Write-Wins, eventual consistency) + KvDelete { + namespace_id: u32, + key: Vec, + ts: u64, + }, + /// Compare-and-swap write operation (strong consistency) + /// CAS is executed atomically within the state machine to ensure + /// linearizable semantics across all replicas. 
+ CasWrite { + namespace_id: u32, + key: Vec, + value: Vec, + expected_version: u64, + ts: u64, + }, + /// Compare-and-swap delete operation (strong consistency) + /// Deletes the key if the version matches expected_version. + /// If expected_version is 0, deletes if key exists (no version check). + CasDelete { + namespace_id: u32, + key: Vec, + expected_version: u64, + ts: u64, + }, + /// Region split operation + Split { + region_id: u64, + split_key: Vec, + new_region_id: u64, + }, + /// No-op for leader election + Noop, +} + +/// Response types from the state machine +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum FlareResponse { + /// Operation completed successfully + Ok, + /// Write applied with index + Applied { index: u64 }, + /// Delete operation result + DeleteResult { + existed: bool, + }, + /// Compare-and-swap result + CasResult { + success: bool, + current_version: u64, + new_version: u64, + }, + /// CAS delete result + CasDeleteResult { + success: bool, + current_version: u64, + existed: bool, + }, + /// Split operation created new region + SplitCreated { new_region_id: u64 }, +} + +/// Node metadata - contains address and region information +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default)] +pub struct FlareNode { + /// Network address (host:port) + pub addr: String, + /// Region ID this node belongs to + pub region_id: u64, +} + +impl std::fmt::Display for FlareNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "region-{}@{}", self.region_id, self.addr) + } +} + +/// Snapshot data type - uses in-memory cursor for simplicity +pub type FlareSnapshotData = Cursor>; + +// Declare the RaftTypeConfig using OpenRaft's macro +openraft::declare_raft_types!( + /// FlareDB's OpenRaft type configuration + pub FlareTypeConfig: + D = FlareRequest, + R = FlareResponse, + Node = FlareNode, + NodeId = FlareNodeId, +); + +/// Type alias for OpenRaft's Entry with our config +pub 
type FlareEntry = openraft::Entry; + +/// Type alias for OpenRaft's LogId with our config +pub type FlareLogId = openraft::LogId; + +/// Type alias for OpenRaft's Vote with our config +pub type FlareVote = openraft::Vote; + +/// Type alias for OpenRaft's Membership with our config +pub type FlareMembership = openraft::Membership; + +/// Type alias for StoredMembership with our config +pub type FlareStoredMembership = openraft::StoredMembership; + +/// Type alias for the Raft instance +pub type FlareRaft = openraft::Raft; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_flare_request_serialization() { + let req = FlareRequest::KvWrite { + namespace_id: 1, + key: b"test_key".to_vec(), + value: b"test_value".to_vec(), + ts: 12345, + }; + let json = serde_json::to_string(&req).unwrap(); + let decoded: FlareRequest = serde_json::from_str(&json).unwrap(); + assert_eq!(req, decoded); + } + + #[test] + fn test_flare_node_display() { + let node = FlareNode { + addr: "127.0.0.1:50051".to_string(), + region_id: 1, + }; + assert_eq!(format!("{}", node), "region-1@127.0.0.1:50051"); + } +} diff --git a/flaredb/crates/flaredb-server/Cargo.toml b/flaredb/crates/flaredb-server/Cargo.toml new file mode 100644 index 0000000..0674b4d --- /dev/null +++ b/flaredb/crates/flaredb-server/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "flaredb-server" +version.workspace = true +edition.workspace = true + +[dependencies] +flaredb-types = { path = "../flaredb-types" } +flaredb-proto = { path = "../flaredb-proto" } +flaredb-storage = { path = "../flaredb-storage" } +flaredb-raft = { path = "../flaredb-raft" } +tokio.workspace = true +tonic.workspace = true +tonic-health.workspace = true +prost.workspace = true +clap.workspace = true +openraft.workspace = true +serde.workspace = true +serde_json.workspace = true +async-trait.workspace = true +tracing.workspace = true +tracing-subscriber.workspace = true +rocksdb.workspace = true +futures.workspace = true +sha2.workspace = 
true +tokio-stream.workspace = true + +[dev-dependencies] +tempfile.workspace = true + +[lib] +name = "flaredb_server" +path = "src/lib.rs" diff --git a/flaredb/crates/flaredb-server/examples/test_cluster.rs b/flaredb/crates/flaredb-server/examples/test_cluster.rs new file mode 100644 index 0000000..fc55c80 --- /dev/null +++ b/flaredb/crates/flaredb-server/examples/test_cluster.rs @@ -0,0 +1,69 @@ +use flaredb_proto::kvrpc::kv_cas_client::KvCasClient; +use flaredb_proto::kvrpc::{CasRequest, GetRequest}; +use tonic::transport::Channel; + +#[tokio::main] +async fn main() -> Result<(), Box> { + println!("Testing CAS operations on node 1..."); + + // Connect to node 1 + let channel = Channel::from_static("http://127.0.0.1:50051") + .connect() + .await?; + let mut client = KvCasClient::new(channel); + + // Write key + println!("Writing key 'test-key' = 'hello-world'..."); + let req = CasRequest { + key: b"test-key".to_vec(), + value: b"hello-world".to_vec(), + expected_version: 0, + namespace: "default".to_string(), + }; + let resp = client.compare_and_swap(req).await?.into_inner(); + println!( + "CAS response: success={}, version={}", + resp.success, resp.new_version + ); + + // Read back + println!("Reading key 'test-key'..."); + let req = GetRequest { + key: b"test-key".to_vec(), + namespace: "default".to_string(), + }; + let resp = client.get(req).await?.into_inner(); + println!( + "Get response: found={}, value={:?}, version={}", + resp.found, + String::from_utf8_lossy(&resp.value), + resp.version + ); + + // Connect to node 2 and check if data is there + println!("\nConnecting to node 2..."); + let channel2 = Channel::from_static("http://127.0.0.1:50052") + .connect() + .await?; + let mut client2 = KvCasClient::new(channel2); + + let req = GetRequest { + key: b"test-key".to_vec(), + namespace: "default".to_string(), + }; + match client2.get(req).await { + Ok(resp) => { + let resp = resp.into_inner(); + println!( + "Node 2 Get response: found={}, value={:?}", + 
resp.found, + String::from_utf8_lossy(&resp.value) + ); + } + Err(e) => { + println!("Node 2 error (expected if not leader): {}", e); + } + } + + Ok(()) +} diff --git a/flaredb/crates/flaredb-server/src/config/mod.rs b/flaredb/crates/flaredb-server/src/config/mod.rs new file mode 100644 index 0000000..a809a65 --- /dev/null +++ b/flaredb/crates/flaredb-server/src/config/mod.rs @@ -0,0 +1,279 @@ +use std::collections::HashMap; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::RwLock; + +/// Reserved namespaces that cannot have their mode changed. +/// These are automatically created with Strong consistency on server startup. +pub const RESERVED_NAMESPACES: &[&str] = &["iam", "metrics", "_system"]; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ConsistencyMode { + Strong, + Eventual, +} + +#[derive(Debug, Clone)] +pub struct NamespaceConfig { + pub id: u32, + pub name: String, + pub mode: ConsistencyMode, + pub explicit: bool, +} + +#[allow(dead_code)] +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct NamespaceModeDiff { + pub namespace: String, + pub self_id: u32, + pub other_id: u32, + pub self_mode: ConsistencyMode, + pub other_mode: ConsistencyMode, +} + +#[derive(Debug)] +pub struct ServerConfig { + namespaces: RwLock>, + default_mode: ConsistencyMode, + next_id: AtomicU32, + reserved_namespaces: Vec, +} + +impl ServerConfig { + pub fn new( + default_mode: ConsistencyMode, + mut namespaces: HashMap, + ) -> Self { + // Ensure default namespace exists with id=1 + if !namespaces.contains_key("default") { + namespaces.insert( + "default".to_string(), + NamespaceConfig { + id: 1, + name: "default".to_string(), + mode: default_mode.clone(), + explicit: true, + }, + ); + } + + // Auto-create reserved namespaces with Strong consistency + let mut next_id = namespaces.values().map(|n| n.id).max().unwrap_or(1) + 1; + for &reserved in RESERVED_NAMESPACES { + if !namespaces.contains_key(reserved) { + namespaces.insert( + reserved.to_string(), + 
NamespaceConfig { + id: next_id, + name: reserved.to_string(), + mode: ConsistencyMode::Strong, // Always strong for reserved + explicit: true, + }, + ); + next_id += 1; + } + } + + Self { + namespaces: RwLock::new(namespaces), + default_mode, + next_id: AtomicU32::new(next_id), + reserved_namespaces: RESERVED_NAMESPACES.iter().map(|s| s.to_string()).collect(), + } + } + + /// Check if a namespace is reserved (mode cannot be changed). + pub fn is_reserved(&self, namespace: &str) -> bool { + self.reserved_namespaces.contains(&namespace.to_string()) + } + + #[allow(dead_code)] + pub fn default_with_namespace() -> Self { + Self::new(ConsistencyMode::Strong, HashMap::new()) + } + + /// Fetch or create a namespace configuration. + pub fn get_namespace(&self, namespace: &str) -> NamespaceConfig { + { + let guard = self.namespaces.read().expect("namespace read"); + if let Some(ns) = guard.get(namespace) { + return ns.clone(); + } + } + + // Insert lazily with a unique id. + let mut guard = self.namespaces.write().expect("namespace write"); + let ns_id = self.next_id.fetch_add(1, Ordering::SeqCst); + let cfg = NamespaceConfig { + id: ns_id, + name: namespace.to_string(), + mode: self.default_mode.clone(), + explicit: false, + }; + guard.insert(namespace.to_string(), cfg.clone()); + cfg + } + + /// Update or insert a namespace and return the resulting config. + /// Returns Err if attempting to change mode of a reserved namespace. 
+ pub fn set_namespace_mode( + &self, + namespace: &str, + mode: ConsistencyMode, + ) -> Result { + // Block mode changes for reserved namespaces + if self.is_reserved(namespace) { + let guard = self.namespaces.read().expect("namespace read"); + if let Some(existing) = guard.get(namespace) { + if existing.mode != mode { + return Err(format!( + "cannot change mode of reserved namespace '{}' (current: {:?}, requested: {:?})", + namespace, existing.mode, mode + )); + } + return Ok(existing.clone()); + } + } + + let mut guard = self.namespaces.write().expect("namespace write"); + let entry = guard.entry(namespace.to_string()); + let cfg = entry + .and_modify(|c| { + c.mode = mode.clone(); + c.explicit = true; + }) + .or_insert_with(|| NamespaceConfig { + id: self.next_id.fetch_add(1, Ordering::SeqCst), + name: namespace.to_string(), + mode: mode.clone(), + explicit: true, + }) + .clone(); + Ok(cfg) + } + + /// Merge a new set of namespace=mode pairs, keeping existing IDs where possible. + #[allow(dead_code)] + pub fn reload_from_pairs(&self, pairs: &[String]) -> Result, String> { + let mut updated = Vec::new(); + for p in pairs { + if let Some((name, mode_str)) = p.split_once('=') { + let mode = match mode_str.to_lowercase().as_str() { + "strong" => ConsistencyMode::Strong, + "eventual" => ConsistencyMode::Eventual, + _ => { + return Err(format!( + "invalid mode '{}' for namespace '{}'", + mode_str, name + )) + } + }; + let cfg = self.set_namespace_mode(name, mode)?; + updated.push(cfg); + } else { + return Err(format!( + "invalid namespace mode format '{}', expected name=mode", + p + )); + } + } + Ok(updated) + } + + pub fn list_namespaces(&self) -> Vec { + self.namespaces + .read() + .expect("namespace read") + .values() + .cloned() + .collect() + } + + pub fn mode_as_str(mode: &ConsistencyMode) -> &'static str { + match mode { + ConsistencyMode::Strong => "strong", + ConsistencyMode::Eventual => "eventual", + } + } + + /// Detect differences between this config and 
another (by namespace name). + #[allow(dead_code)] + pub fn diff_modes(&self, other: &ServerConfig) -> Vec { + let mine = self.list_namespaces(); + let mut diffs = Vec::new(); + for ns in mine { + let o = other.get_namespace(&ns.name); + if ns.mode != o.mode || ns.id != o.id { + diffs.push(NamespaceModeDiff { + namespace: ns.name.clone(), + self_id: ns.id, + other_id: o.id, + self_mode: ns.mode, + other_mode: o.mode, + }); + } + } + diffs + } +} + +pub fn parse_namespace_modes(pairs: &[String]) -> Result, String> { + let mut namespaces = HashMap::new(); + let mut next_id: u32 = 2; // default namespace uses id=1 + for p in pairs { + if let Some((name, mode_str)) = p.split_once('=') { + let mode = match mode_str.to_lowercase().as_str() { + "strong" => ConsistencyMode::Strong, + "eventual" => ConsistencyMode::Eventual, + _ => { + return Err(format!( + "invalid mode '{}' for namespace '{}'", + mode_str, name + )) + } + }; + namespaces.insert( + name.to_string(), + NamespaceConfig { + id: next_id, + name: name.to_string(), + mode, + explicit: true, + }, + ); + next_id += 1; + } else { + return Err(format!( + "invalid namespace mode format '{}', expected name=mode", + p + )); + } + } + Ok(namespaces) +} + +/// Encode a key with the namespace id prefix (big endian) for RocksDB ordering. +pub fn encode_namespaced_key(ns_id: u32, key: &[u8]) -> Vec { + let mut out = Vec::with_capacity(4 + key.len()); + out.extend_from_slice(&ns_id.to_be_bytes()); + out.extend_from_slice(key); + out +} + +/// Encode value with a timestamp prefix (big endian u64) for LWW resolution. +pub fn encode_value_with_ts(ts: u64, value: &[u8]) -> Vec { + let mut out = Vec::with_capacity(8 + value.len()); + out.extend_from_slice(&ts.to_be_bytes()); + out.extend_from_slice(value); + out +} + +/// Decode value with timestamp prefix; returns (ts, value_slice). 
+pub fn decode_value_with_ts(data: &[u8]) -> (u64, Vec) { + if data.len() < 8 { + return (0, data.to_vec()); + } + let ts = u64::from_be_bytes([ + data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], + ]); + (ts, data[8..].to_vec()) +} diff --git a/flaredb/crates/flaredb-server/src/heartbeat.rs b/flaredb/crates/flaredb-server/src/heartbeat.rs new file mode 100644 index 0000000..d18e289 --- /dev/null +++ b/flaredb/crates/flaredb-server/src/heartbeat.rs @@ -0,0 +1,47 @@ +use crate::store::Store; +use flaredb_proto::pdpb::pd_client::PdClient; +use flaredb_proto::pdpb::ListRegionsRequest; +use flaredb_types::RegionMeta; +use std::sync::Arc; +use tokio::time::{sleep, Duration}; + +/// Periodically send region/store heartbeat to PD. +pub async fn start_heartbeat(pd_addr: String, store: Arc) { + tokio::spawn(async move { + let endpoint = format!("http://{}", pd_addr); + loop { + if let Ok(mut client) = PdClient::connect(endpoint.clone()).await { + // list regions to keep routing fresh + if let Ok(resp) = client.list_regions(ListRegionsRequest {}).await { + let resp = resp.into_inner(); + let mut metas = Vec::new(); + for r in resp.regions { + let voters = if r.peers.is_empty() { + Vec::new() + } else { + r.peers.clone() + }; + metas.push(( + RegionMeta { + id: r.id, + start_key: r.start_key, + end_key: r.end_key, + }, + if voters.is_empty() { + vec![store.store_id()] + } else { + voters + }, + )); + } + if !metas.is_empty() { + let _ = store.refresh_regions(metas).await; + } + } + // send basic heartbeat info (store id + regions held) + // PD heartbeat RPC is not defined yet; placeholder refresh via list. 
+ } + sleep(Duration::from_secs(30)).await; + } + }); +} diff --git a/flaredb/crates/flaredb-server/src/lib.rs b/flaredb/crates/flaredb-server/src/lib.rs new file mode 100644 index 0000000..74e88ef --- /dev/null +++ b/flaredb/crates/flaredb-server/src/lib.rs @@ -0,0 +1,7 @@ +pub mod config; +pub mod heartbeat; +pub mod merkle; +pub mod pd_client; +pub mod raft_service; +pub mod service; +pub mod store; diff --git a/flaredb/crates/flaredb-server/src/main.rs b/flaredb/crates/flaredb-server/src/main.rs new file mode 100644 index 0000000..72846f2 --- /dev/null +++ b/flaredb/crates/flaredb-server/src/main.rs @@ -0,0 +1,320 @@ +use clap::Parser; +use flaredb_proto::kvrpc::kv_cas_server::KvCasServer; +use flaredb_proto::kvrpc::kv_raw_server::KvRawServer; +use flaredb_proto::raft_server::raft_service_server::RaftServiceServer; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_types::RegionMeta; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::Mutex; +use tokio::time::{sleep, Duration}; +use tonic::transport::Server; +use tonic_health::server::health_reporter; +use tracing::info; +use tracing_subscriber::EnvFilter; + +mod config; +mod heartbeat; +mod merkle; +mod pd_client; +mod raft_service; +mod service; + +use pd_client::PdEvent; +mod store; + +use pd_client::PdClient; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + #[arg(long, default_value = "1")] + store_id: u64, + + #[arg(long, default_value = "127.0.0.1:50051")] + addr: String, + + #[arg(long, default_value = "data")] + data_dir: String, + + #[arg(long, default_value = "127.0.0.1:2379")] + pd_addr: String, + + /// Peers in format id=host:port (repeatable) + #[arg(long = "peer")] + peers: Vec, + + /// Namespace modes in format namespace=strong|eventual (repeatable) + #[arg(long = "namespace-mode")] + namespace_modes: Vec, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + // Initialize tracing + tracing_subscriber::fmt() + 
.with_env_filter(EnvFilter::from_default_env().add_directive("flaredb_server=info".parse()?)) + .init(); + + let args = Args::parse(); + let addr = args.addr.parse()?; + + info!("rdb-server listening on {}", addr); + + // Build namespace consistency config + let namespace_map = config::parse_namespace_modes(&args.namespace_modes).unwrap_or_else(|e| { + eprintln!("Failed to parse namespace modes: {}", e); + std::process::exit(1); + }); + let server_config = Arc::new(config::ServerConfig::new( + config::ConsistencyMode::Strong, + namespace_map, + )); + + // Parse peer addresses for cluster membership + let mut voters = vec![args.store_id]; + let mut peer_addrs: HashMap = HashMap::new(); + // Add self address + peer_addrs.insert(args.store_id, args.addr.clone()); + for p in &args.peers { + if let Some((id_str, addr)) = p.split_once('=') { + if let Ok(id) = id_str.parse::() { + if id != args.store_id { + voters.push(id); + peer_addrs.insert(id, addr.to_string()); + } + } + } + } + let peer_addrs = Arc::new(peer_addrs); + + let engine = Arc::new(RocksEngine::new(&args.data_dir)?); + let store = Arc::new(store::Store::new( + args.store_id, + engine.clone(), + server_config.clone(), + peer_addrs.clone(), + )); + + let service = service::KvServiceImpl::new(engine.clone(), server_config.clone(), store.clone()); + let raft_service = raft_service::RaftServiceImpl::new(store.clone(), args.store_id); + + println!("Connecting to ChainFire PD at {}...", args.pd_addr); + let pd_client_res = PdClient::connect(args.pd_addr.clone()).await; + + if let Ok(mut pd_client) = pd_client_res { + println!( + "Connected to ChainFire. 
Cluster ID: {}", + pd_client.cluster_id() + ); + + // Register this store with the PD + if let Err(e) = pd_client + .register_store(args.store_id, args.addr.clone()) + .await + { + eprintln!("Failed to register store: {}", e); + } else { + println!("Store {} registered with PD", args.store_id); + } + + // Start watching for metadata changes from ChainFire + let mut event_rx = pd_client.subscribe(); + if let Err(e) = pd_client.start_watch().await { + eprintln!("Failed to start PD watch: {}", e); + } else { + println!("Started watching PD for metadata changes"); + + // Spawn a background task to handle PD events + let store_clone_for_events = store.clone(); + tokio::spawn(async move { + while let Ok(event) = event_rx.recv().await { + match event { + PdEvent::RegionUpdated(region) => { + println!("Region {} updated via watch", region.id); + // Could trigger immediate region refresh here + let metas = vec![( + RegionMeta { + id: region.id, + start_key: region.start_key, + end_key: region.end_key, + }, + if region.peers.is_empty() { + vec![store_clone_for_events.store_id()] + } else { + region.peers + }, + )]; + if let Err(e) = store_clone_for_events.refresh_regions(metas).await { + eprintln!("Failed to refresh region from event: {}", e); + } + } + PdEvent::RegionRemoved(id) => { + println!("Region {} removed via watch", id); + } + PdEvent::StoreUpdated(store_info) => { + println!("Store {} updated via watch", store_info.id); + } + PdEvent::StoreRemoved(id) => { + println!("Store {} removed via watch", id); + } + } + } + }); + } + + // Initialize default region if this is the first node + if let Err(e) = pd_client.init_default_region(voters.clone()).await { + eprintln!("Failed to init default region: {}", e); + } + + // Fetch initial region metadata from PD (from cache) + let regions = pd_client.list_regions().await; + let mut region_metas = Vec::new(); + for r in regions { + let region_voters = if r.peers.is_empty() { + voters.clone() + } else { + r.peers.clone() + }; 
+ region_metas.push(( + RegionMeta { + id: r.id, + start_key: r.start_key, + end_key: r.end_key, + }, + region_voters, + )); + } + if region_metas.is_empty() { + region_metas.push(( + RegionMeta { + id: 1, + start_key: Vec::new(), + end_key: Vec::new(), + }, + voters.clone(), + )); + } + if let Err(e) = store.bootstrap_regions(region_metas.clone()).await { + eprintln!("failed to bootstrap regions: {}", e); + } + + // Background task: heartbeat and refresh regions from PD + let store_clone = store.clone(); + let pd_addr_clone = args.pd_addr.clone(); + let store_id = args.store_id; + let server_addr = args.addr.clone(); + tokio::spawn(async move { + let client = Arc::new(Mutex::new( + PdClient::connect(pd_addr_clone.clone()).await.ok(), + )); + + loop { + sleep(Duration::from_secs(10)).await; + + let mut guard = client.lock().await; + if let Some(ref mut c) = *guard { + // Send heartbeat + let heartbeat_ok = + match c.heartbeat(store_id, server_addr.clone()).await { + Ok(_) => true, + Err(e) => { + eprintln!("Heartbeat failed: {}", e); + false + } + }; + + // If heartbeat failed, try to reconnect on next cycle + if !heartbeat_ok { + *guard = None; + continue; + } + + // Report leader status for regions we lead + for region_id in [1u64] { + // TODO: get actual regions + if let Some(node) = store_clone.get_raft_node(region_id).await { + if node.is_leader().await { + if let Err(e) = c.report_leader(region_id, store_id).await { + eprintln!("Report leader failed: {}", e); + } + } + } + } + + // Refresh regions from PD (from cache, updated via watch) + let regions = c.list_regions().await; + let metas: Vec<_> = regions + .into_iter() + .map(|r| { + let region_voters = if r.peers.is_empty() { + vec![store_clone.store_id()] + } else { + r.peers.clone() + }; + ( + RegionMeta { + id: r.id, + start_key: r.start_key, + end_key: r.end_key, + }, + region_voters, + ) + }) + .collect(); + if !metas.is_empty() { + if let Err(e) = store_clone.refresh_regions(metas).await { + 
eprintln!("refresh regions failed: {}", e); + } + } + } else { + // Try to reconnect + if let Some(new_client) = + PdClient::connect(pd_addr_clone.clone()).await.ok() + { + println!("Reconnected to PD"); + *guard = Some(new_client); + } + } + } + }); + } else { + eprintln!( + "Failed to connect to ChainFire PD: {:?}", + pd_client_res.err() + ); + eprintln!("Starting in standalone mode with default region..."); + let _ = store + .bootstrap_regions(vec![( + RegionMeta { + id: 1, + start_key: Vec::new(), + end_key: Vec::new(), + }, + voters.clone(), + )]) + .await; + } + + // Health check service for K8s liveness/readiness probes + let (mut health_reporter, health_service) = health_reporter(); + health_reporter + .set_serving::>() + .await; + health_reporter + .set_serving::>() + .await; + + info!("FlareDB server starting with health checks enabled"); + + Server::builder() + .add_service(health_service) + .add_service(KvRawServer::new(service.clone())) + .add_service(KvCasServer::new(service)) + .add_service(RaftServiceServer::new(raft_service)) + .serve(addr) + .await?; + + Ok(()) +} diff --git a/flaredb/crates/flaredb-server/src/merkle.rs b/flaredb/crates/flaredb-server/src/merkle.rs new file mode 100644 index 0000000..ade03a7 --- /dev/null +++ b/flaredb/crates/flaredb-server/src/merkle.rs @@ -0,0 +1,113 @@ +use crate::config::encode_namespaced_key; +use flaredb_storage::rocks_engine::RocksEngine; +use sha2::{Digest, Sha256}; +use std::sync::Arc; + +#[derive(Debug, Clone)] +pub struct MerkleLeaf { + pub start: Vec, + pub end: Vec, + pub hash: Vec, +} + +fn sha256(data: &[u8]) -> Vec { + let mut hasher = Sha256::new(); + hasher.update(data); + hasher.finalize().to_vec() +} + +/// Build merkle tree for a namespace; returns (root, leaves). 
+pub fn build_merkle( + engine: Arc, + namespace_id: u32, + chunk_size: usize, +) -> Result<(Vec, Vec), flaredb_storage::StorageError> { + let prefix = namespace_id.to_be_bytes(); + let db = engine.db(); + let mut iter = db.iterator(rocksdb::IteratorMode::From( + prefix.as_slice(), + rocksdb::Direction::Forward, + )); + let mut leaves = Vec::new(); + let mut current_keys = Vec::new(); + let mut start_key = None; + + while let Some(Ok((k, v))) = iter.next() { + if k.len() < 4 || &k[..4] != prefix { + if !k.starts_with(&prefix) { + break; + } + } + if start_key.is_none() { + start_key = Some(k.to_vec()); + } + // hash ts-decoded value to keep user value stable + current_keys.push(sha256(&v)); + if current_keys.len() >= chunk_size { + let leaf_hash = sha256(¤t_keys.concat()); + let end = k.to_vec(); + leaves.push(MerkleLeaf { + start: start_key.take().unwrap(), + end: end.clone(), + hash: leaf_hash, + }); + current_keys.clear(); + } + } + + if !current_keys.is_empty() { + let leaf_hash = sha256(¤t_keys.concat()); + let end = encode_namespaced_key(namespace_id, b"~end"); + leaves.push(MerkleLeaf { + start: start_key.unwrap_or_else(|| encode_namespaced_key(namespace_id, b"")), + end, + hash: leaf_hash, + }); + } + + if leaves.is_empty() { + return Ok((sha256(b""), leaves)); + } + + let mut level: Vec> = leaves.iter().map(|l| l.hash.clone()).collect(); + while level.len() > 1 { + let mut next = Vec::new(); + for chunk in level.chunks(2) { + if chunk.len() == 2 { + let mut c = Vec::new(); + c.extend_from_slice(&chunk[0]); + c.extend_from_slice(&chunk[1]); + next.push(sha256(&c)); + } else { + next.push(chunk[0].clone()); + } + } + level = next; + } + + Ok((level[0].clone(), leaves)) +} + +/// Fetch key/value pairs for a namespaced range. 
+pub async fn fetch_range(
+    engine: Arc<RocksEngine>,
+    namespace_id: u32,
+    start: &[u8],
+    end: &[u8],
+) -> Result<(Vec<Vec<u8>>, Vec<Vec<u8>>), flaredb_storage::StorageError> {
+    // Collect every (key, value) pair from `start` (inclusive) up to `end`
+    // (exclusive; empty `end` means "to the end of the namespace"), staying
+    // within the namespace's 4-byte big-endian key prefix.
+    let prefix = namespace_id.to_be_bytes();
+    let db = engine.db();
+    let iter = db.iterator(rocksdb::IteratorMode::From(
+        start,
+        rocksdb::Direction::Forward,
+    ));
+    let mut keys = Vec::new();
+    let mut values = Vec::new();
+    for entry in iter {
+        let (key, value) = match entry {
+            Ok(kv) => kv,
+            // Iterator errors terminate the scan, same as the while-let form.
+            Err(_) => break,
+        };
+        let left_namespace = !key.starts_with(&prefix);
+        let past_end = !end.is_empty() && key.as_ref() >= end;
+        if left_namespace || past_end {
+            break;
+        }
+        keys.push(key.to_vec());
+        values.push(value.to_vec());
+    }
+    Ok((keys, values))
+}
diff --git a/flaredb/crates/flaredb-server/src/pd_client.rs b/flaredb/crates/flaredb-server/src/pd_client.rs
new file mode 100644
index 0000000..27c8ab3
--- /dev/null
+++ b/flaredb/crates/flaredb-server/src/pd_client.rs
@@ -0,0 +1,503 @@
+//! ChainFire-based Placement Driver client
+//!
+//! This module provides a client for interacting with ChainFire as the
+//! Placement Driver (PD) for FlareDB cluster management.
+//!
+//! ## Watch Integration
+//!
+//! The client supports real-time notifications of metadata changes via
+//! ChainFire's Watch API, enabling event-driven updates instead of polling.
+ +use flaredb_proto::chainfire::kv_client::KvClient; +use flaredb_proto::chainfire::watch_client::WatchClient; +use flaredb_proto::chainfire::{ + Event, PutRequest, RangeRequest, WatchCreateRequest, WatchRequest, +}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::{broadcast, mpsc, RwLock}; +use tonic::transport::Channel; + +/// Key prefixes for cluster metadata in ChainFire +const PREFIX_STORES: &str = "/flaredb/stores/"; +const PREFIX_REGIONS: &str = "/flaredb/regions/"; +const KEY_CLUSTER_ID: &str = "/flaredb/cluster/id"; + +/// Store information stored in ChainFire +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StoreInfo { + pub id: u64, + pub addr: String, + pub last_heartbeat: u64, +} + +/// Region information stored in ChainFire +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RegionInfo { + pub id: u64, + pub start_key: Vec, + pub end_key: Vec, + pub peers: Vec, + pub leader_id: u64, +} + +/// Events emitted by the PD client when metadata changes +#[derive(Debug, Clone)] +pub enum PdEvent { + /// A store was added or updated + StoreUpdated(StoreInfo), + /// A store was removed + StoreRemoved(u64), + /// A region was added or updated + RegionUpdated(RegionInfo), + /// A region was removed + RegionRemoved(u64), +} + +/// Cached metadata for faster lookups +struct MetadataCache { + stores: HashMap, + regions: HashMap, +} + +impl MetadataCache { + fn new() -> Self { + Self { + stores: HashMap::new(), + regions: HashMap::new(), + } + } +} + +/// Client for interacting with ChainFire as a Placement Driver +pub struct PdClient { + kv_client: KvClient, + watch_client: WatchClient, + cluster_id: u64, + /// Cached metadata for fast access + cache: Arc>, + /// Channel to receive metadata change events + event_tx: broadcast::Sender, +} + +impl PdClient { + /// Connect to a ChainFire cluster + pub async fn connect(addr: String) -> Result> { + let endpoint = if 
addr.starts_with("http") { + addr + } else { + format!("http://{}", addr) + }; + let channel = Channel::from_shared(endpoint)?.connect().await?; + let kv_client = KvClient::new(channel.clone()); + let watch_client = WatchClient::new(channel); + let (event_tx, _) = broadcast::channel(256); + + let mut client = Self { + kv_client, + watch_client, + cluster_id: 0, + cache: Arc::new(RwLock::new(MetadataCache::new())), + event_tx, + }; + + // Try to get cluster ID, or generate a new one + client.cluster_id = client.get_or_init_cluster_id().await?; + + // Load initial metadata into cache + client.refresh_cache().await?; + + Ok(client) + } + + /// Subscribe to metadata change events + pub fn subscribe(&self) -> broadcast::Receiver { + self.event_tx.subscribe() + } + + /// Refresh the local cache from ChainFire + async fn refresh_cache(&mut self) -> Result<(), Box> { + let stores = self.list_stores_remote().await?; + let regions = self.list_regions_remote().await?; + + let mut cache = self.cache.write().await; + cache.stores.clear(); + for store in stores { + cache.stores.insert(store.id, store); + } + cache.regions.clear(); + for region in regions { + cache.regions.insert(region.id, region); + } + Ok(()) + } + + /// Start watching for metadata changes in the background + pub async fn start_watch( + &mut self, + ) -> Result<(), Box> { + let (tx, rx) = mpsc::channel::(32); + + // Create watch requests for stores and regions prefixes + let stores_watch = WatchRequest { + request_union: Some( + flaredb_proto::chainfire::watch_request::RequestUnion::CreateRequest( + WatchCreateRequest { + key: PREFIX_STORES.as_bytes().to_vec(), + range_end: prefix_range_end(PREFIX_STORES), + start_revision: 0, + progress_notify: false, + prev_kv: true, + watch_id: 1, + }, + ), + ), + }; + + let regions_watch = WatchRequest { + request_union: Some( + flaredb_proto::chainfire::watch_request::RequestUnion::CreateRequest( + WatchCreateRequest { + key: PREFIX_REGIONS.as_bytes().to_vec(), + 
range_end: prefix_range_end(PREFIX_REGIONS), + start_revision: 0, + progress_notify: false, + prev_kv: true, + watch_id: 2, + }, + ), + ), + }; + + // Send initial watch requests + tx.send(stores_watch).await.ok(); + tx.send(regions_watch).await.ok(); + + // Convert mpsc receiver to stream + let request_stream = tokio_stream::wrappers::ReceiverStream::new(rx); + + // Start watching + let response = self.watch_client.watch(request_stream).await?; + let mut stream = response.into_inner(); + + let cache = self.cache.clone(); + let event_tx = self.event_tx.clone(); + + // Spawn background task to process watch events + tokio::spawn(async move { + while let Ok(Some(resp)) = stream.message().await { + for event in resp.events { + if let Err(e) = process_watch_event(&event, &cache, &event_tx).await { + tracing::warn!("Failed to process watch event: {}", e); + } + } + } + tracing::info!("PD watch stream ended"); + }); + + Ok(()) + } + + /// Get or initialize the cluster ID + async fn get_or_init_cluster_id( + &mut self, + ) -> Result> { + let req = RangeRequest { + key: KEY_CLUSTER_ID.as_bytes().to_vec(), + range_end: Vec::new(), + limit: 1, + revision: 0, + keys_only: false, + count_only: false, + }; + + let resp = self.kv_client.range(req).await?.into_inner(); + if let Some(kv) = resp.kvs.first() { + let id_str = String::from_utf8_lossy(&kv.value); + Ok(id_str.parse().unwrap_or(1)) + } else { + // Initialize with cluster ID 1 + let put_req = PutRequest { + key: KEY_CLUSTER_ID.as_bytes().to_vec(), + value: b"1".to_vec(), + lease: 0, + prev_kv: false, + }; + self.kv_client.put(put_req).await?; + Ok(1) + } + } + + /// Get the cluster ID + pub fn cluster_id(&self) -> u64 { + self.cluster_id + } + + /// Register a store with the cluster + pub async fn register_store( + &mut self, + store_id: u64, + addr: String, + ) -> Result<(), Box> { + let info = StoreInfo { + id: store_id, + addr, + last_heartbeat: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + 
.unwrap() + .as_secs(), + }; + let key = format!("{}{}", PREFIX_STORES, store_id); + let value = serde_json::to_vec(&info)?; + + let req = PutRequest { + key: key.into_bytes(), + value, + lease: 0, + prev_kv: false, + }; + self.kv_client.put(req).await?; + Ok(()) + } + + /// Send a heartbeat for a store + pub async fn heartbeat( + &mut self, + store_id: u64, + addr: String, + ) -> Result<(), Box> { + // Re-register with updated timestamp + self.register_store(store_id, addr).await + } + + /// List all registered stores from cache + pub async fn list_stores(&self) -> Vec { + let cache = self.cache.read().await; + cache.stores.values().cloned().collect() + } + + /// List all registered stores from remote (ChainFire) + async fn list_stores_remote( + &mut self, + ) -> Result, Box> { + let key = PREFIX_STORES.as_bytes().to_vec(); + let range_end = prefix_range_end(PREFIX_STORES); + + let req = RangeRequest { + key, + range_end, + limit: 0, + revision: 0, + keys_only: false, + count_only: false, + }; + + let resp = self.kv_client.range(req).await?.into_inner(); + let mut stores = Vec::new(); + for kv in resp.kvs { + if let Ok(info) = serde_json::from_slice::(&kv.value) { + stores.push(info); + } + } + Ok(stores) + } + + /// Register or update a region + pub async fn put_region( + &mut self, + region: RegionInfo, + ) -> Result<(), Box> { + let key = format!("{}{}", PREFIX_REGIONS, region.id); + let value = serde_json::to_vec(®ion)?; + + let req = PutRequest { + key: key.into_bytes(), + value, + lease: 0, + prev_kv: false, + }; + self.kv_client.put(req).await?; + Ok(()) + } + + /// List all regions from cache + pub async fn list_regions(&self) -> Vec { + let cache = self.cache.read().await; + cache.regions.values().cloned().collect() + } + + /// List all regions from remote (ChainFire) + async fn list_regions_remote( + &mut self, + ) -> Result, Box> { + let key = PREFIX_REGIONS.as_bytes().to_vec(); + let range_end = prefix_range_end(PREFIX_REGIONS); + + let req = 
RangeRequest { + key, + range_end, + limit: 0, + revision: 0, + keys_only: false, + count_only: false, + }; + + let resp = self.kv_client.range(req).await?.into_inner(); + let mut regions = Vec::new(); + for kv in resp.kvs { + if let Ok(info) = serde_json::from_slice::(&kv.value) { + regions.push(info); + } + } + Ok(regions) + } + + /// Get a region by ID + pub async fn get_region( + &mut self, + region_id: u64, + ) -> Result, Box> { + let key = format!("{}{}", PREFIX_REGIONS, region_id); + + let req = RangeRequest { + key: key.into_bytes(), + range_end: Vec::new(), + limit: 1, + revision: 0, + keys_only: false, + count_only: false, + }; + + let resp = self.kv_client.range(req).await?.into_inner(); + if let Some(kv) = resp.kvs.first() { + Ok(serde_json::from_slice(&kv.value).ok()) + } else { + Ok(None) + } + } + + /// Find the region containing a key (from cache) + pub async fn get_region_for_key(&self, key: &[u8]) -> Option { + let regions = self.list_regions().await; + for region in regions { + let start_ok = key >= region.start_key.as_slice(); + let end_ok = region.end_key.is_empty() || key < region.end_key.as_slice(); + if start_ok && end_ok { + return Some(region); + } + } + None + } + + /// Get the leader address for a region (from cache) + pub async fn get_leader_addr(&self, region_id: u64) -> Option { + let cache = self.cache.read().await; + if let Some(region) = cache.regions.get(®ion_id) { + if let Some(store) = cache.stores.get(®ion.leader_id) { + return Some(store.addr.clone()); + } + } + None + } + + /// Report that this store is the leader for a region + pub async fn report_leader( + &mut self, + region_id: u64, + leader_id: u64, + ) -> Result<(), Box> { + if let Some(mut region) = self.get_region(region_id).await? 
{
+            region.leader_id = leader_id;
+            self.put_region(region).await?;
+        }
+        Ok(())
+    }
+
+    /// Initialize default region if none exist
+    pub async fn init_default_region(
+        &mut self,
+        peers: Vec<u64>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        let regions = self.list_regions().await;
+        if regions.is_empty() {
+            let region = RegionInfo {
+                id: 1,
+                start_key: Vec::new(),
+                end_key: Vec::new(),
+                peers,
+                // 0 = leader not yet known; filled in by report_leader().
+                leader_id: 0,
+            };
+            self.put_region(region).await?;
+        }
+        Ok(())
+    }
+
+    /// Get a region from cache by ID
+    pub async fn get_region_cached(&self, region_id: u64) -> Option<RegionInfo> {
+        let cache = self.cache.read().await;
+        cache.regions.get(&region_id).cloned()
+    }
+
+    /// Get a store from cache by ID
+    pub async fn get_store_cached(&self, store_id: u64) -> Option<StoreInfo> {
+        let cache = self.cache.read().await;
+        cache.stores.get(&store_id).cloned()
+    }
+}
+
+// ============================================================================
+// Helper functions
+// ============================================================================
+
+/// Compute the exclusive range end for a prefix scan (etcd-style prefix
+/// matching): increment the last byte that is below 0xff and drop everything
+/// after it; trailing 0xff bytes must be carried, not incremented.
+///
+/// The previous version did `*last += 1` unconditionally, which panics in
+/// debug builds (and wraps in release) for a prefix ending in 0xff. Both
+/// current call sites end in '/', so their results are unchanged.
+fn prefix_range_end(prefix: &str) -> Vec<u8> {
+    let mut end = prefix.as_bytes().to_vec();
+    while let Some(last) = end.last_mut() {
+        if *last < 0xff {
+            *last += 1;
+            return end;
+        }
+        // Carry over the trailing 0xff byte and retry on the previous one.
+        end.pop();
+    }
+    // Empty prefix or all bytes 0xff: no finite upper bound exists.
+    end
+}
+
+/// Process a watch event and update cache
+async fn process_watch_event(
+    event: &Event,
+    cache: &Arc<RwLock<MetadataCache>>,
+    event_tx: &broadcast::Sender<PdEvent>,
+) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let kv = match &event.kv {
+        Some(kv) => kv,
+        None => return Ok(()),
+    };
+
+    let key_str = String::from_utf8_lossy(&kv.key);
+    // NOTE(review): assumes the DELETE event type encodes as 1 in the
+    // generated proto — confirm against the chainfire Event enum.
+    let is_delete = event.r#type == 1; // DELETE = 1
+
+    if key_str.starts_with(PREFIX_STORES) {
+        let store_id_str = key_str.strip_prefix(PREFIX_STORES).unwrap_or("");
+        if let Ok(store_id) = store_id_str.parse::<u64>() {
+            let mut cache = cache.write().await;
+            if is_delete {
+                cache.stores.remove(&store_id);
+                let _ = event_tx.send(PdEvent::StoreRemoved(store_id));
+            } else if let Ok(info) = serde_json::from_slice::<StoreInfo>(&kv.value) {
+                cache.stores.insert(store_id, info.clone());
+                let _ = event_tx.send(PdEvent::StoreUpdated(info));
+            }
+        }
+    } else if key_str.starts_with(PREFIX_REGIONS) {
+        let region_id_str = key_str.strip_prefix(PREFIX_REGIONS).unwrap_or("");
+        if let Ok(region_id) = region_id_str.parse::<u64>() {
+            let mut cache = cache.write().await;
+            if is_delete {
+                cache.regions.remove(&region_id);
+                let _ = event_tx.send(PdEvent::RegionRemoved(region_id));
+            } else if let Ok(info) = serde_json::from_slice::<RegionInfo>(&kv.value) {
+                cache.regions.insert(region_id, info.clone());
+                let _ = event_tx.send(PdEvent::RegionUpdated(info));
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/flaredb/crates/flaredb-server/src/raft_service.rs b/flaredb/crates/flaredb-server/src/raft_service.rs
new file mode 100644
index 0000000..fddebde
--- /dev/null
+++ b/flaredb/crates/flaredb-server/src/raft_service.rs
@@ -0,0 +1,257 @@
+use crate::store::Store;
+use flaredb_raft::{FlareNode, FlareNodeId, FlareTypeConfig};
+use flaredb_proto::raft_server::raft_service_server::RaftService;
+use openraft::raft::{AppendEntriesRequest, VoteRequest};
+use flaredb_proto::raft_server::{
+    FetchRangeRequest, FetchRangeResponse, ForwardEventualRequest, GetMerkleRequest,
+    GetMerkleResponse, GetModeRequest, GetModeResponse, ListNamespaceModesRequest,
+    ListNamespaceModesResponse, MerkleRange, NamespaceMode, OpenRaftAppendEntriesRequest,
+    OpenRaftAppendEntriesResponse, OpenRaftSnapshotRequest, OpenRaftSnapshotResponse,
+    OpenRaftVoteRequest, OpenRaftVoteResponse, RaftMessage, RaftResponse,
+    UpdateNamespaceModeRequest, UpdateNamespaceModeResponse,
+};
+use std::sync::Arc;
+use tonic::{Request, Response, Status};
+
+pub struct RaftServiceImpl {
+    store: Arc<Store>,
+    #[allow(dead_code)]
+    local_id: u64,
+}
+
+impl RaftServiceImpl {
+    pub fn new(store: Arc<Store>, local_id: u64) -> Self {
+        Self { store, local_id }
+    }
+}
+
+#[tonic::async_trait]
+impl RaftService for RaftServiceImpl {
+    async fn send(&self, _request: Request<RaftMessage>) -> Result<Response<RaftResponse>, Status> {
+        // Legacy raft-rs RPC - no longer supported
Err(Status::unimplemented( + "Legacy raft-rs RPC not supported. Use OpenRaft RPCs (vote_v2, append_entries_v2).", + )) + } + + async fn get_mode( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let ns = if req.namespace.is_empty() { + "default".to_string() + } else { + req.namespace + }; + let mode = self.store.config().get_namespace(&ns).mode; + let mode_str = crate::config::ServerConfig::mode_as_str(&mode).to_string(); + Ok(Response::new(GetModeResponse { mode: mode_str })) + } + + async fn update_namespace_mode( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + if req.namespace.is_empty() { + return Err(Status::invalid_argument("namespace is required")); + } + let mode = match req.mode.to_lowercase().as_str() { + "strong" => crate::config::ConsistencyMode::Strong, + "eventual" => crate::config::ConsistencyMode::Eventual, + _ => { + return Err(Status::invalid_argument( + "mode must be 'strong' or 'eventual'", + )) + } + }; + let cfg = self + .store + .config() + .set_namespace_mode(&req.namespace, mode) + .map_err(|e| Status::failed_precondition(e))?; + let mode_str = crate::config::ServerConfig::mode_as_str(&cfg.mode).to_string(); + let ns_mode = NamespaceMode { + namespace: cfg.name.clone(), + id: cfg.id, + mode: mode_str, + from_default: !cfg.explicit, + }; + Ok(Response::new(UpdateNamespaceModeResponse { + mode: Some(ns_mode), + })) + } + + async fn forward_eventual( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + if let Some(node) = self.store.get_raft_node(req.region_id).await { + if !node.is_leader().await { + return Err(Status::failed_precondition("not leader")); + } + node.write_kv(req.namespace_id, req.key, req.value, req.ts) + .await + .map_err(|e| Status::internal(format!("raft write failed: {e}")))?; + return Ok(Response::new(RaftResponse {})); + } + Err(Status::failed_precondition("region not found")) + } + + async fn 
list_namespace_modes( + &self, + _request: Request, + ) -> Result, Status> { + let list = self + .store + .config() + .list_namespaces() + .into_iter() + .map(|cfg| NamespaceMode { + namespace: cfg.name, + id: cfg.id, + mode: crate::config::ServerConfig::mode_as_str(&cfg.mode).to_string(), + from_default: !cfg.explicit, + }) + .collect(); + Ok(Response::new(ListNamespaceModesResponse { + namespaces: list, + })) + } + + async fn get_merkle( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let chunk = if req.chunk_size == 0 { + 256 + } else { + req.chunk_size as usize + }; + let (root, leaves) = + crate::merkle::build_merkle(self.store.engine(), req.namespace_id, chunk) + .map_err(|e| Status::internal(format!("merkle: {e}")))?; + let ranges = leaves + .into_iter() + .map(|l| MerkleRange { + start_key: l.start, + end_key: l.end, + hash: l.hash, + }) + .collect(); + Ok(Response::new(GetMerkleResponse { + root, + leaves: ranges, + })) + } + + async fn fetch_range( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (keys, values) = crate::merkle::fetch_range( + self.store.engine(), + req.namespace_id, + &req.start_key, + &req.end_key, + ) + .await + .map_err(|e| Status::internal(format!("fetch_range: {e}")))?; + Ok(Response::new(FetchRangeResponse { keys, values })) + } + + // OpenRaft RPCs + + async fn vote_v2( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let node = self + .store + .get_raft_node(req.region_id) + .await + .ok_or_else(|| Status::failed_precondition("region not found"))?; + + let vote_req: VoteRequest = serde_json::from_slice(&req.data) + .map_err(|e| Status::invalid_argument(format!("invalid vote request: {}", e)))?; + + let resp = node + .raft + .vote(vote_req) + .await + .map_err(|e| Status::internal(format!("vote failed: {}", e)))?; + + let data = serde_json::to_vec(&resp) + .map_err(|e| Status::internal(format!("serialize 
response: {}", e)))?; + + Ok(Response::new(OpenRaftVoteResponse { data })) + } + + async fn append_entries_v2( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let node = self + .store + .get_raft_node(req.region_id) + .await + .ok_or_else(|| Status::failed_precondition("region not found"))?; + + let append_req: AppendEntriesRequest = + serde_json::from_slice(&req.data).map_err(|e| { + Status::invalid_argument(format!("invalid append_entries request: {}", e)) + })?; + + let resp = node + .raft + .append_entries(append_req) + .await + .map_err(|e| Status::internal(format!("append_entries failed: {}", e)))?; + + let data = serde_json::to_vec(&resp) + .map_err(|e| Status::internal(format!("serialize response: {}", e)))?; + + Ok(Response::new(OpenRaftAppendEntriesResponse { data })) + } + + async fn install_snapshot_v2( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let node = self + .store + .get_raft_node(req.region_id) + .await + .ok_or_else(|| Status::failed_precondition("region not found"))?; + + let vote: openraft::Vote = serde_json::from_slice(&req.vote) + .map_err(|e| Status::invalid_argument(format!("invalid vote: {}", e)))?; + + let meta: openraft::SnapshotMeta = + serde_json::from_slice(&req.meta) + .map_err(|e| Status::invalid_argument(format!("invalid meta: {}", e)))?; + + let snapshot = openraft::Snapshot { + meta, + snapshot: Box::new(std::io::Cursor::new(req.data)), + }; + + let resp = node + .raft + .install_full_snapshot(vote, snapshot) + .await + .map_err(|e| Status::internal(format!("install_snapshot failed: {}", e)))?; + + let data = serde_json::to_vec(&resp) + .map_err(|e| Status::internal(format!("serialize response: {}", e)))?; + + Ok(Response::new(OpenRaftSnapshotResponse { data })) + } +} diff --git a/flaredb/crates/flaredb-server/src/service.rs b/flaredb/crates/flaredb-server/src/service.rs new file mode 100644 index 0000000..4667ff1 --- /dev/null +++ 
b/flaredb/crates/flaredb-server/src/service.rs @@ -0,0 +1,520 @@ +use crate::config::{ + decode_value_with_ts, encode_namespaced_key, encode_value_with_ts, ConsistencyMode, + ServerConfig, +}; +use crate::store::Store; +use flaredb_raft::FlareRaftNode; +use flaredb_proto::kvrpc::kv_cas_server::KvCas; +use flaredb_proto::kvrpc::kv_raw_server::KvRaw; +use flaredb_proto::kvrpc::{ + CasRequest, CasResponse, DeleteRequest, DeleteResponse, GetRequest, GetResponse, RawDeleteRequest, + RawDeleteResponse, RawGetRequest, RawGetResponse, RawPutRequest, RawPutResponse, RawScanRequest, + RawScanResponse, ScanRequest, ScanResponse, VersionedKv, +}; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_storage::StorageEngine; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; +use tonic::{Request, Response, Status}; + +/// Default limit for scan operations +const DEFAULT_SCAN_LIMIT: usize = 100; +/// Maximum limit for scan operations +const MAX_SCAN_LIMIT: usize = 10000; + +#[derive(Clone)] +pub struct KvServiceImpl { + engine: Arc, + config: Arc, + store: Arc, +} + +impl KvServiceImpl { + pub fn new(engine: Arc, config: Arc, store: Arc) -> Self { + Self { + engine, + config, + store, + } + } + + fn resolve_namespace<'a>(&self, ns: &'a str) -> (&'a str, ConsistencyMode, u32) { + let name = if ns.is_empty() { "default" } else { ns }; + let cfg = self.config.get_namespace(name); + (name, cfg.mode, cfg.id) + } + + fn now_millis() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64 + } + + async fn route_raft_node(&self, key: &[u8]) -> Result>, Status> { + let region_id = self + .store + .route_key(key) + .await + .ok_or_else(|| Status::failed_precondition("no region found for key"))?; + Ok(self.store.get_raft_node(region_id).await) + } +} + +#[tonic::async_trait] +impl KvRaw for KvServiceImpl { + async fn raw_put( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (ns, 
mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Eventual) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not eventual (mode={:?})", + ns, mode + ))); + } + let encoded = encode_namespaced_key(ns_id, &req.key); + let ts = Self::now_millis(); + let encoded_val = encode_value_with_ts(ts, &req.value); + // LWW guard: skip if existing value is newer. + if let Ok(existing) = self.engine.get_raw(&encoded).await { + if let Some(val) = existing { + let (old_ts, _) = decode_value_with_ts(&val); + if old_ts > ts { + return Ok(Response::new(RawPutResponse { success: true })); + } + } + } + self.engine + .put_raw(&encoded, &encoded_val) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + // Replicate via Raft + if let Some(node) = self.route_raft_node(&encoded).await? { + let _ = node.write_kv(ns_id, encoded, req.value, ts).await; + } + Ok(Response::new(RawPutResponse { success: true })) + } + + async fn raw_get( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (ns, mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Eventual) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not eventual (mode={:?})", + ns, mode + ))); + } + let encoded = encode_namespaced_key(ns_id, &req.key); + let val = self + .engine + .get_raw(&encoded) + .await + .map_err(|e| Status::internal(e.to_string()))?; + Ok(Response::new(RawGetResponse { + found: val.is_some(), + value: val.map(|v| decode_value_with_ts(&v).1).unwrap_or_default(), + })) + } + + async fn raw_scan( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (ns, mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Eventual) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not eventual (mode={:?})", + ns, mode + ))); + } + + let limit = if req.limit 
== 0 || req.limit as usize > MAX_SCAN_LIMIT { + DEFAULT_SCAN_LIMIT + } else { + req.limit as usize + }; + + // Encode keys with namespace prefix + let start = encode_namespaced_key(ns_id, &req.start_key); + let end = if req.end_key.is_empty() { + // Scan to end of namespace (next namespace prefix) + encode_namespaced_key(ns_id + 1, &[]) + } else { + encode_namespaced_key(ns_id, &req.end_key) + }; + + // Fetch one extra to detect has_more + let entries = self + .engine + .scan_raw(&start, &end, limit + 1) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + let has_more = entries.len() > limit; + let actual_entries = if has_more { + &entries[..limit] + } else { + &entries[..] + }; + + let (keys, values): (Vec<_>, Vec<_>) = actual_entries + .iter() + .map(|(k, v)| { + // Strip namespace prefix (4 bytes) and decode value + let user_key = k[4..].to_vec(); + let (_, decoded_value) = decode_value_with_ts(v); + (user_key, decoded_value) + }) + .unzip(); + + let next_key = if has_more { + entries[limit].0[4..].to_vec() // Strip namespace prefix + } else { + vec![] + }; + + Ok(Response::new(RawScanResponse { + keys, + values, + has_more, + next_key, + })) + } + + async fn raw_delete( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (ns, mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Eventual) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not eventual (mode={:?})", + ns, mode + ))); + } + let encoded = encode_namespaced_key(ns_id, &req.key); + let ts = Self::now_millis(); + + // Check if key exists before deletion + let existed = self + .engine + .get_raw(&encoded) + .await + .map_err(|e| Status::internal(e.to_string()))? + .is_some(); + + // Delete from local storage + self.engine + .delete_raw(&encoded) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + // Replicate via Raft + if let Some(node) = self.route_raft_node(&encoded).await? 
{ + let _ = node.delete_kv(ns_id, encoded, ts).await; + } + + Ok(Response::new(RawDeleteResponse { + success: true, + existed, + })) + } +} + +#[tonic::async_trait] +impl KvCas for KvServiceImpl { + async fn compare_and_swap( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (ns, mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Strong) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not strong (mode={:?})", + ns, mode + ))); + } + let encoded = encode_namespaced_key(ns_id, &req.key); + let ts = Self::now_millis(); + + // CAS is executed atomically within the Raft state machine. + // This ensures linearizable semantics - no race between read and write. + let node = self + .route_raft_node(&encoded) + .await? + .ok_or_else(|| Status::failed_precondition("region not found for key"))?; + + if !node.is_leader().await { + return Err(Status::failed_precondition( + "not leader for strong namespace; redirect required", + )); + } + + // Single atomic CAS operation through Raft + let response = node + .cas_write(ns_id, encoded, req.value, req.expected_version, ts) + .await + .map_err(|e| Status::internal(format!("raft cas_write failed: {}", e)))?; + + // Extract CAS result from Raft response + match response { + flaredb_raft::FlareResponse::CasResult { + success, + current_version, + new_version, + } => Ok(Response::new(CasResponse { + success, + current_version, + new_version, + })), + _ => Err(Status::internal("unexpected raft response for CAS")), + } + } + + async fn get(&self, request: Request) -> Result, Status> { + let req = request.into_inner(); + let (ns, mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Strong) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not strong (mode={:?})", + ns, mode + ))); + } + let encoded = encode_namespaced_key(ns_id, &req.key); + + // For strong consistency, 
use linearizable read through Raft state machine + let node = self + .route_raft_node(&encoded) + .await? + .ok_or_else(|| Status::failed_precondition("region not found for key"))?; + + // Linearizable read from Raft state machine (CAS data) + let val_opt = node + .linearizable_read_cas(ns_id, &encoded) + .await + .map_err(|e| Status::failed_precondition(e))?; + + if let Some((value, version, _ts)) = val_opt { + Ok(Response::new(GetResponse { + found: true, + value, + version, + })) + } else { + Ok(Response::new(GetResponse { + found: false, + value: Vec::new(), + version: 0, + })) + } + } + + async fn scan(&self, request: Request) -> Result, Status> { + let req = request.into_inner(); + let (ns, mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Strong) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not strong (mode={:?})", + ns, mode + ))); + } + + let limit = if req.limit == 0 || req.limit as usize > MAX_SCAN_LIMIT { + DEFAULT_SCAN_LIMIT + } else { + req.limit as usize + }; + + // Encode keys with namespace prefix + let start = encode_namespaced_key(ns_id, &req.start_key); + let end = if req.end_key.is_empty() { + encode_namespaced_key(ns_id + 1, &[]) + } else { + encode_namespaced_key(ns_id, &req.end_key) + }; + + // For strong consistency, verify linearizable read on the first key + // This ensures we're the leader and have the latest data + if let Some(node) = self.route_raft_node(&start).await? { + node.linearizable_read_kv(ns_id, &req.start_key) + .await + .map_err(|e| Status::failed_precondition(e))?; + } + + // Fetch one extra to detect has_more + let raw_entries = self + .engine + .scan_cas(&start, &end, limit + 1) + .await + .map_err(|e| Status::internal(e.to_string()))?; + + let has_more = raw_entries.len() > limit; + let actual_entries = if has_more { + &raw_entries[..limit] + } else { + &raw_entries[..] 
+ }; + + let entries: Vec = actual_entries + .iter() + .map(|(k, v, version)| { + // Strip namespace prefix (4 bytes) and decode value + let user_key = k[4..].to_vec(); + let (_, decoded_value) = decode_value_with_ts(v); + VersionedKv { + key: user_key, + value: decoded_value, + version: *version, + } + }) + .collect(); + + let next_key = if has_more { + raw_entries[limit].0[4..].to_vec() + } else { + vec![] + }; + + Ok(Response::new(ScanResponse { + entries, + has_more, + next_key, + })) + } + + async fn delete( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (ns, mode, ns_id) = self.resolve_namespace(&req.namespace); + if !matches!(mode, ConsistencyMode::Strong) { + return Err(Status::failed_precondition(format!( + "namespace '{}' is not strong (mode={:?})", + ns, mode + ))); + } + let encoded = encode_namespaced_key(ns_id, &req.key); + let ts = Self::now_millis(); + + // CAS delete is executed atomically within the Raft state machine + let node = self + .route_raft_node(&encoded) + .await? 
+ .ok_or_else(|| Status::failed_precondition("region not found for key"))?; + + if !node.is_leader().await { + return Err(Status::failed_precondition( + "not leader for strong namespace; redirect required", + )); + } + + // Single atomic CAS delete operation through Raft + let response = node + .cas_delete(ns_id, encoded, req.expected_version, ts) + .await + .map_err(|e| Status::internal(format!("raft cas_delete failed: {}", e)))?; + + // Extract CAS delete result from Raft response + match response { + flaredb_raft::FlareResponse::CasDeleteResult { + success, + current_version, + existed, + } => Ok(Response::new(DeleteResponse { + success, + current_version, + existed, + })), + _ => Err(Status::internal("unexpected raft response for CAS delete")), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::ServerConfig; + use flaredb_proto::kvrpc::CasRequest; + use flaredb_proto::kvrpc::GetRequest; + use flaredb_types::RegionMeta; + use std::collections::HashMap; + use std::sync::Arc; + use tempfile::TempDir; + + fn test_peer_addrs() -> Arc> { + let mut addrs = HashMap::new(); + addrs.insert(1, "127.0.0.1:50051".to_string()); + Arc::new(addrs) + } + + #[tokio::test] + async fn get_returns_value_and_version() { + let dir = TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let store = Arc::new(crate::store::Store::new( + 1, + engine.clone(), + Arc::new(ServerConfig::default_with_namespace()), + test_peer_addrs(), + )); + store + .bootstrap_regions(vec![( + RegionMeta { + id: 1, + start_key: Vec::new(), + end_key: Vec::new(), + }, + vec![1], + )]) + .await + .unwrap(); + + // Wait for leader election in single-node cluster + if let Some(node) = store.get_raft_node(1).await { + node.trigger_election().await.unwrap(); + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + } + + let service = KvServiceImpl::new( + engine, + Arc::new(ServerConfig::default_with_namespace()), + 
store, + ); + + // Write via CAS + let req = CasRequest { + key: b"k1".to_vec(), + value: b"v1".to_vec(), + expected_version: 0, + namespace: "default".to_string(), + }; + service + .compare_and_swap(Request::new(req)) + .await + .expect("cas"); + + let resp = service + .get(Request::new(GetRequest { + key: b"k1".to_vec(), + namespace: "default".to_string(), + })) + .await + .expect("get") + .into_inner(); + + assert!(resp.found); + assert_eq!(resp.version, 1); + assert_eq!(resp.value, b"v1"); + } +} diff --git a/flaredb/crates/flaredb-server/src/store.rs b/flaredb/crates/flaredb-server/src/store.rs new file mode 100644 index 0000000..a6538fd --- /dev/null +++ b/flaredb/crates/flaredb-server/src/store.rs @@ -0,0 +1,254 @@ +use crate::config::ServerConfig; +use flaredb_raft::{FlareNetworkFactory, FlareRaftNode}; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_types::RegionMeta; +use std::collections::HashMap; +use std::sync::Arc; + +pub struct Store { + store_id: u64, + engine: Arc, + raft_nodes: tokio::sync::RwLock>>, + regions: tokio::sync::RwLock>, + config: Arc, + peer_addrs: Arc>, +} + +impl Store { + const META_KEY: &'static [u8] = b"_region_meta"; + + pub fn new( + store_id: u64, + engine: Arc, + config: Arc, + peer_addrs: Arc>, + ) -> Self { + Self { + store_id, + engine, + raft_nodes: tokio::sync::RwLock::new(HashMap::new()), + regions: tokio::sync::RwLock::new(Vec::new()), + config, + peer_addrs, + } + } + + fn validate_regions(regions: &[(RegionMeta, Vec)]) -> Result<(), String> { + let mut metas: Vec<&RegionMeta> = regions.iter().map(|(m, _)| m).collect(); + metas.sort_by(|a, b| a.start_key.cmp(&b.start_key)); + for win in metas.windows(2) { + let a = &win[0]; + let b = &win[1]; + let a_end = if a.end_key.is_empty() { + None + } else { + Some(a.end_key.as_slice()) + }; + if let Some(ae) = a_end { + if ae > b.start_key.as_slice() { + return Err(format!( + "region overlap: {} [{:?},{:?}) vs {} [{:?},{:?})", + a.id, a.start_key, a.end_key, 
b.id, b.start_key, b.end_key + )); + } + } else { + return Err(format!( + "region {} has infinite end but not last (next id {})", + a.id, b.id + )); + } + } + Ok(()) + } + + pub async fn bootstrap_regions( + &self, + regions_with_voters: Vec<(RegionMeta, Vec)>, + ) -> Result<(), String> { + Self::validate_regions(®ions_with_voters)?; + let mut raft_nodes = self.raft_nodes.write().await; + let mut regions = self.regions.write().await; + + for (meta, voters) in regions_with_voters { + if raft_nodes.contains_key(&meta.id) { + continue; + } + + let network_factory = FlareNetworkFactory::new(self.store_id, meta.id); + // Register peer addresses with the network factory + for (&peer_id, addr) in self.peer_addrs.iter() { + network_factory.add_peer(peer_id, addr.clone()).await; + } + // Use persistent storage backed by RocksDB + let raft_node = FlareRaftNode::new_persistent( + self.store_id, + meta.id, + network_factory, + voters, + &self.peer_addrs, + self.engine.clone(), + ) + .await + .map_err(|e| format!("Failed to create raft node: {}", e))?; + + raft_nodes.insert(meta.id, Arc::new(raft_node)); + regions.push(meta); + } + + regions.sort_by(|a, b| a.start_key.cmp(&b.start_key)); + self.persist_regions(®ions).map_err(|e| e.to_string())?; + Ok(()) + } + + pub async fn get_raft_node(&self, region_id: u64) -> Option> { + self.raft_nodes.read().await.get(®ion_id).cloned() + } + + pub async fn any_raft_node(&self) -> Option> { + self.raft_nodes.read().await.values().next().cloned() + } + + pub async fn route_key(&self, key: &[u8]) -> Option { + let regions = self.regions.read().await; + for r in regions.iter() { + let start_ok = key >= r.start_key.as_slice(); + let end_ok = r.end_key.is_empty() || key < r.end_key.as_slice(); + if start_ok && end_ok { + return Some(r.id); + } + } + None + } + + pub async fn refresh_regions(&self, metas: Vec<(RegionMeta, Vec)>) -> Result<(), String> { + Self::validate_regions(&metas)?; + self.bootstrap_regions(metas.clone()).await?; + let mut 
regions = self.regions.write().await; + regions.clear(); + for (meta, _) in metas { + regions.push(meta); + } + regions.sort_by(|a, b| a.start_key.cmp(&b.start_key)); + self.persist_regions(®ions).map_err(|e| e.to_string())?; + Ok(()) + } + + pub fn config(&self) -> Arc { + self.config.clone() + } + + pub fn store_id(&self) -> u64 { + self.store_id + } + + #[allow(dead_code)] + pub fn engine(&self) -> Arc { + self.engine.clone() + } + + pub async fn split_region( + &self, + region_id: u64, + split_key: Vec, + new_region_id: u64, + voters: Vec, + ) -> Result<(), String> { + let mut regions = self.regions.write().await; + let pos = regions + .iter() + .position(|r| r.id == region_id) + .ok_or_else(|| { + format!( + "region {} not found for split (split_key {:?})", + region_id, split_key + ) + })?; + let target = regions[pos].clone(); + if split_key.is_empty() + || (!target.end_key.is_empty() && split_key >= target.end_key) + || split_key <= target.start_key + { + return Err("invalid split key".to_string()); + } + + let left = RegionMeta { + id: target.id, + start_key: target.start_key.clone(), + end_key: split_key.clone(), + }; + let right = RegionMeta { + id: new_region_id, + start_key: split_key, + end_key: target.end_key.clone(), + }; + + let mut new_layout: Vec<(RegionMeta, Vec)> = regions + .iter() + .enumerate() + .filter_map(|(i, r)| { + if i == pos { + None + } else { + Some((r.clone(), voters.clone())) + } + }) + .collect(); + new_layout.push((left.clone(), voters.clone())); + new_layout.push((right.clone(), voters.clone())); + Self::validate_regions(&new_layout)?; + + regions.remove(pos); + regions.push(left); + regions.push(right); + regions.sort_by(|a, b| a.start_key.cmp(&b.start_key)); + drop(regions); + + // Create OpenRaft node for new region + let mut raft_nodes = self.raft_nodes.write().await; + if !raft_nodes.contains_key(&new_region_id) { + let network_factory = FlareNetworkFactory::new(self.store_id, new_region_id); + // Register peer addresses 
with the network factory + for (&peer_id, addr) in self.peer_addrs.iter() { + network_factory.add_peer(peer_id, addr.clone()).await; + } + // Use persistent storage backed by RocksDB + let raft_node = FlareRaftNode::new_persistent( + self.store_id, + new_region_id, + network_factory, + voters, + &self.peer_addrs, + self.engine.clone(), + ) + .await + .map_err(|e| format!("Failed to create raft node: {}", e))?; + raft_nodes.insert(new_region_id, Arc::new(raft_node)); + } + drop(raft_nodes); + + let updated_regions = self.regions.read().await; + self.persist_regions(&updated_regions) + .map_err(|e| e.to_string())?; + Ok(()) + } + + fn persist_regions(&self, regions: &[RegionMeta]) -> Result<(), rocksdb::Error> { + let mut buf = Vec::new(); + let len = regions.len() as u32; + buf.extend_from_slice(&len.to_be_bytes()); + for r in regions { + buf.extend_from_slice(&r.id.to_be_bytes()); + buf.extend_from_slice(&(r.start_key.len() as u32).to_be_bytes()); + buf.extend_from_slice(&(r.end_key.len() as u32).to_be_bytes()); + buf.extend_from_slice(&r.start_key); + buf.extend_from_slice(&r.end_key); + } + self.engine.db().put(Self::META_KEY, buf) + } + + #[allow(dead_code)] + pub async fn allocate_region_id(&self) -> u64 { + let regions = self.regions.read().await; + regions.iter().map(|r| r.id).max().unwrap_or(0) + 1 + } +} diff --git a/flaredb/crates/flaredb-server/tests/test_consistency.rs b/flaredb/crates/flaredb-server/tests/test_consistency.rs new file mode 100644 index 0000000..87f6449 --- /dev/null +++ b/flaredb/crates/flaredb-server/tests/test_consistency.rs @@ -0,0 +1,198 @@ +use flaredb_proto::kvrpc::kv_raw_server::KvRaw; +use flaredb_proto::kvrpc::{RawGetRequest, RawPutRequest}; +use flaredb_server::config::{parse_namespace_modes, ConsistencyMode, ServerConfig}; +use flaredb_server::service::KvServiceImpl; +use flaredb_server::store::Store; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_types::RegionMeta; +use std::collections::HashMap; +use 
std::sync::Arc; +use tonic::Request; + +fn test_peer_addrs() -> Arc> { + let mut addrs = HashMap::new(); + addrs.insert(1, "127.0.0.1:50051".to_string()); + Arc::new(addrs) +} + +#[test] +fn default_config_is_strong() { + let cfg = ServerConfig::default_with_namespace(); + assert_eq!(cfg.get_namespace("default").mode, ConsistencyMode::Strong); + // unknown namespace falls back to default + assert_eq!(cfg.get_namespace("unknown").mode, ConsistencyMode::Strong); +} + +#[test] +fn parses_eventual_namespace_mode() { + let parsed = parse_namespace_modes(&vec!["tenantA=eventual".to_string()]).unwrap(); + let cfg = ServerConfig::new(ConsistencyMode::Strong, parsed); + assert_eq!(cfg.get_namespace("tenantA").mode, ConsistencyMode::Eventual); +} + +#[test] +fn encodes_and_decodes_ts_value() { + let val = b"hello".to_vec(); + let ts = 123u64; + let encoded = flaredb_server::config::encode_value_with_ts(ts, &val); + let (decoded_ts, decoded_val) = flaredb_server::config::decode_value_with_ts(&encoded); + assert_eq!(ts, decoded_ts); + assert_eq!(val, decoded_val); +} + +#[tokio::test] +async fn eventual_mode_overwrites_with_newer_ts() { + let cfg_map = parse_namespace_modes(&vec!["logs=eventual".to_string()]).unwrap(); + let server_cfg = Arc::new(ServerConfig::new(ConsistencyMode::Strong, cfg_map)); + let dir = tempfile::TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let store = Arc::new(Store::new( + 1, + engine.clone(), + server_cfg.clone(), + test_peer_addrs(), + )); + store + .bootstrap_regions(vec![( + RegionMeta { + id: 1, + start_key: Vec::new(), + end_key: Vec::new(), + }, + vec![1], + )]) + .await + .unwrap(); + let service = KvServiceImpl::new(engine.clone(), server_cfg, store.clone()); + + // First write + service + .raw_put(Request::new(RawPutRequest { + namespace: "logs".into(), + key: b"k".to_vec(), + value: b"v1".to_vec(), + })) + .await + .unwrap(); + // Second write (newer ts) + 
tokio::time::sleep(std::time::Duration::from_millis(1)).await; + service + .raw_put(Request::new(RawPutRequest { + namespace: "logs".into(), + key: b"k".to_vec(), + value: b"v2".to_vec(), + })) + .await + .unwrap(); + + let resp = service + .raw_get(Request::new(RawGetRequest { + namespace: "logs".into(), + key: b"k".to_vec(), + })) + .await + .unwrap() + .into_inner(); + assert!(resp.found); + assert_eq!(resp.value, b"v2"); +} + +#[tokio::test] +async fn lww_resolves_partition_on_replay() { + // Simulate two replicas reconciling: existing older value vs incoming newer value. + let existing = flaredb_server::config::encode_value_with_ts(1, b"old"); + let incoming = flaredb_server::config::encode_value_with_ts(5, b"new"); + let (existing_ts, _) = flaredb_server::config::decode_value_with_ts(&existing); + let (incoming_ts, _) = flaredb_server::config::decode_value_with_ts(&incoming); + assert!(incoming_ts > existing_ts); + + // Apply LWW resolution the same way on_ready does. + let result = if incoming_ts >= existing_ts { + incoming + } else { + existing + }; + assert_eq!(flaredb_server::config::decode_value_with_ts(&result).1, b"new"); +} + +#[test] +fn mode_switch_preserves_id_and_applies() { + let cfg = ServerConfig::default_with_namespace(); + let default = cfg.get_namespace("default"); + assert_eq!(default.mode, ConsistencyMode::Strong); + let id = default.id; + + // Switch to eventual then back to strong; id should remain stable and explicit should be true. 
+ let eventual = cfg + .set_namespace_mode("default", ConsistencyMode::Eventual) + .unwrap(); + assert_eq!(eventual.id, id); + assert_eq!(eventual.mode, ConsistencyMode::Eventual); + assert!(eventual.explicit); + + let strong = cfg + .set_namespace_mode("default", ConsistencyMode::Strong) + .unwrap(); + assert_eq!(strong.id, id); + assert_eq!(strong.mode, ConsistencyMode::Strong); + assert!(strong.explicit); +} + +#[test] +fn detects_mode_mismatch_between_nodes() { + let cfg_a = ServerConfig::default_with_namespace(); + let cfg_b = ServerConfig::default_with_namespace(); + + // Diverge the mode for a namespace + cfg_a + .set_namespace_mode("logs", ConsistencyMode::Eventual) + .unwrap(); + cfg_b + .set_namespace_mode("logs", ConsistencyMode::Strong) + .unwrap(); + + let diffs = cfg_a.diff_modes(&cfg_b); + assert_eq!(diffs.len(), 1); + let diff = &diffs[0]; + assert_eq!(diff.namespace, "logs"); + assert_eq!(diff.self_mode, ConsistencyMode::Eventual); + assert_eq!(diff.other_mode, ConsistencyMode::Strong); +} + +#[test] +fn reserved_namespace_auto_creation() { + use flaredb_server::config::RESERVED_NAMESPACES; + + let cfg = ServerConfig::default_with_namespace(); + + // Reserved namespaces should be automatically created with Strong mode + for &reserved in RESERVED_NAMESPACES { + let ns = cfg.get_namespace(reserved); + assert_eq!( + ns.mode, + ConsistencyMode::Strong, + "reserved namespace '{}' should be Strong", + reserved + ); + assert!(cfg.is_reserved(reserved)); + } +} + +#[test] +fn reserved_namespace_mode_change_blocked() { + let cfg = ServerConfig::default_with_namespace(); + + // Attempting to change mode of reserved namespace should fail + let result = cfg.set_namespace_mode("iam", ConsistencyMode::Eventual); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.contains("reserved namespace"), + "error should mention reserved namespace: {}", + err + ); + + // Setting the same mode (Strong) should succeed + let result = 
cfg.set_namespace_mode("iam", ConsistencyMode::Strong); + assert!(result.is_ok()); +} diff --git a/flaredb/crates/flaredb-server/tests/test_multi_region.rs b/flaredb/crates/flaredb-server/tests/test_multi_region.rs new file mode 100644 index 0000000..966b223 --- /dev/null +++ b/flaredb/crates/flaredb-server/tests/test_multi_region.rs @@ -0,0 +1,122 @@ +use flaredb_server::config::{encode_namespaced_key, ServerConfig}; +use flaredb_server::store::Store; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_types::RegionMeta; +use std::collections::HashMap; +use std::sync::Arc; + +fn test_peer_addrs() -> Arc> { + let mut addrs = HashMap::new(); + addrs.insert(1, "127.0.0.1:50051".to_string()); + Arc::new(addrs) +} + +#[tokio::test] +async fn kv_routing_maps_keys_by_range() { + let dir = tempfile::TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let cfg = Arc::new(ServerConfig::default_with_namespace()); + let ns = cfg.get_namespace("default"); + let store = Arc::new(Store::new(1, engine, cfg.clone(), test_peer_addrs())); + store + .bootstrap_regions(vec![ + ( + RegionMeta { + id: 1, + start_key: encode_namespaced_key(ns.id, b""), + end_key: encode_namespaced_key(ns.id, b"m"), + }, + vec![1], + ), + ( + RegionMeta { + id: 2, + start_key: encode_namespaced_key(ns.id, b"m"), + end_key: Vec::new(), + }, + vec![1], + ), + ]) + .await + .unwrap(); + + let k1 = encode_namespaced_key(ns.id, b"a"); + let k2 = encode_namespaced_key(ns.id, b"z"); + assert_eq!(store.route_key(&k1).await, Some(1)); + assert_eq!(store.route_key(&k2).await, Some(2)); +} + +#[tokio::test] +async fn rejects_overlapping_regions_and_can_refresh_from_pd_meta() { + let dir = tempfile::TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let store = Arc::new(Store::new( + 1, + engine.clone(), + Arc::new(ServerConfig::default_with_namespace()), + test_peer_addrs(), + )); + + // 
Overlap detection + let err = store + .bootstrap_regions(vec![ + ( + RegionMeta { + id: 1, + start_key: encode_namespaced_key(1, b""), + end_key: encode_namespaced_key(1, b"z"), + }, + vec![1], + ), + ( + RegionMeta { + id: 2, + start_key: encode_namespaced_key(1, b"y"), + end_key: Vec::new(), + }, + vec![1], + ), + ]) + .await + .unwrap_err(); + assert!(err.contains("overlap")); + + // PD supplies two disjoint regions; refresh replaces layout + store + .bootstrap_regions(vec![( + RegionMeta { + id: 10, + start_key: encode_namespaced_key(1, b""), + end_key: Vec::new(), + }, + vec![1], + )]) + .await + .unwrap(); + store + .refresh_regions(vec![ + ( + RegionMeta { + id: 1, + start_key: encode_namespaced_key(1, b""), + end_key: encode_namespaced_key(1, b"m"), + }, + vec![1], + ), + ( + RegionMeta { + id: 2, + start_key: encode_namespaced_key(1, b"m"), + end_key: Vec::new(), + }, + vec![1], + ), + ]) + .await + .unwrap(); + + let k1 = encode_namespaced_key(1, b"a"); + let k2 = encode_namespaced_key(1, b"x"); + assert_eq!(store.route_key(&k1).await, Some(1)); + assert_eq!(store.route_key(&k2).await, Some(2)); +} diff --git a/flaredb/crates/flaredb-server/tests/test_split.rs b/flaredb/crates/flaredb-server/tests/test_split.rs new file mode 100644 index 0000000..6e2ae87 --- /dev/null +++ b/flaredb/crates/flaredb-server/tests/test_split.rs @@ -0,0 +1,71 @@ +use flaredb_server::config::{encode_namespaced_key, ServerConfig}; +use flaredb_server::store::Store; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_types::RegionMeta; +use std::collections::HashMap; +use std::sync::Arc; + +fn test_peer_addrs() -> Arc> { + let mut addrs = HashMap::new(); + addrs.insert(1, "127.0.0.1:50051".to_string()); + Arc::new(addrs) +} + +#[tokio::test] +async fn split_creates_two_regions_and_routes_keys() { + let dir = tempfile::TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let cfg = 
Arc::new(ServerConfig::default_with_namespace()); + let ns = cfg.get_namespace("default"); + let store = Arc::new(Store::new(1, engine, cfg.clone(), test_peer_addrs())); + + store + .bootstrap_regions(vec![( + RegionMeta { + id: 1, + start_key: encode_namespaced_key(ns.id, b""), + end_key: Vec::new(), + }, + vec![1], + )]) + .await + .unwrap(); + + store + .split_region(1, encode_namespaced_key(ns.id, b"m"), 2, vec![1]) + .await + .unwrap(); + + let k1 = encode_namespaced_key(ns.id, b"a"); + let k2 = encode_namespaced_key(ns.id, b"z"); + assert_eq!(store.route_key(&k1).await, Some(1)); + assert_eq!(store.route_key(&k2).await, Some(2)); +} + +#[tokio::test] +async fn split_rejects_invalid_keys() { + let dir = tempfile::TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let cfg = Arc::new(ServerConfig::default_with_namespace()); + let ns = cfg.get_namespace("default"); + let store = Arc::new(Store::new(1, engine, cfg.clone(), test_peer_addrs())); + + store + .bootstrap_regions(vec![( + RegionMeta { + id: 1, + start_key: encode_namespaced_key(ns.id, b""), + end_key: Vec::new(), + }, + vec![1], + )]) + .await + .unwrap(); + + // split key before start + let err = store + .split_region(1, encode_namespaced_key(ns.id, b""), 2, vec![1]) + .await + .unwrap_err(); + assert!(err.contains("invalid")); +} diff --git a/flaredb/crates/flaredb-server/tests/test_split_trigger.rs b/flaredb/crates/flaredb-server/tests/test_split_trigger.rs new file mode 100644 index 0000000..917e961 --- /dev/null +++ b/flaredb/crates/flaredb-server/tests/test_split_trigger.rs @@ -0,0 +1,46 @@ +use flaredb_server::config::{encode_namespaced_key, ServerConfig}; +use flaredb_server::store::Store; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_types::RegionMeta; +use std::collections::HashMap; +use std::sync::Arc; + +fn test_peer_addrs() -> Arc> { + let mut addrs = HashMap::new(); + addrs.insert(1, "127.0.0.1:50051".to_string()); + 
Arc::new(addrs) +} + +#[tokio::test] +async fn split_via_store_api_routes_keys_correctly() { + let dir = tempfile::TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let cfg = Arc::new(ServerConfig::default_with_namespace()); + let ns = cfg.get_namespace("default"); + let store = Arc::new(Store::new(1, engine.clone(), cfg.clone(), test_peer_addrs())); + store + .bootstrap_regions(vec![( + RegionMeta { + id: 1, + start_key: encode_namespaced_key(ns.id, b""), + end_key: Vec::new(), + }, + vec![1], + )]) + .await + .unwrap(); + + // Manual split at key "m" + let new_region_id = store.allocate_region_id().await; + store + .split_region(1, encode_namespaced_key(ns.id, b"m"), new_region_id, vec![1]) + .await + .unwrap(); + + // after split, routing should differentiate + let k_low = encode_namespaced_key(ns.id, b"a"); + let k_high = encode_namespaced_key(ns.id, b"z"); + let r1 = store.route_key(&k_low).await; + let r2 = store.route_key(&k_high).await; + assert_ne!(r1, r2, "split must route keys differently"); +} diff --git a/flaredb/crates/flaredb-server/tests/test_strong_mode.rs b/flaredb/crates/flaredb-server/tests/test_strong_mode.rs new file mode 100644 index 0000000..0e01a61 --- /dev/null +++ b/flaredb/crates/flaredb-server/tests/test_strong_mode.rs @@ -0,0 +1,69 @@ +use flaredb_proto::kvrpc::kv_cas_server::KvCas; +use flaredb_proto::kvrpc::{CasRequest, GetRequest}; +use flaredb_server::config::ServerConfig; +use flaredb_server::service::KvServiceImpl; +use flaredb_server::store::Store; +use flaredb_storage::rocks_engine::RocksEngine; +use flaredb_types::RegionMeta; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::time::Duration; +use tonic::Request; + +fn test_peer_addrs() -> Arc> { + let mut addrs = HashMap::new(); + addrs.insert(1, "127.0.0.1:50051".to_string()); + Arc::new(addrs) +} + +#[tokio::test] +async fn strong_write_succeeds_after_leader_election() { + let dir = 
tempfile::TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(dir.path().to_str().unwrap()).unwrap()); + let config = Arc::new(ServerConfig::default_with_namespace()); + let store = Arc::new(Store::new(1, engine.clone(), config.clone(), test_peer_addrs())); + store + .bootstrap_regions(vec![( + RegionMeta { + id: 1, + start_key: Vec::new(), + end_key: Vec::new(), + }, + vec![1], + )]) + .await + .unwrap(); + let service = KvServiceImpl::new(engine.clone(), config, store.clone()); + + // Ensure leader using OpenRaft (single-node cluster elects itself quickly) + if let Some(node) = store.get_raft_node(1).await { + node.trigger_election().await.expect("trigger election"); + // Wait for leader election + tokio::time::sleep(Duration::from_millis(500)).await; + } + + // CAS succeeds once leader + let resp = service + .compare_and_swap(Request::new(CasRequest { + namespace: "default".into(), + key: b"k".to_vec(), + value: b"v".to_vec(), + expected_version: 0, + })) + .await + .unwrap() + .into_inner(); + assert!(resp.success); + + // Read requires leader too + let get = service + .get(Request::new(GetRequest { + namespace: "default".into(), + key: b"k".to_vec(), + })) + .await + .unwrap() + .into_inner(); + assert!(get.found); + assert_eq!(get.value, b"v"); +} diff --git a/flaredb/crates/flaredb-storage/Cargo.toml b/flaredb/crates/flaredb-storage/Cargo.toml new file mode 100644 index 0000000..b6943e5 --- /dev/null +++ b/flaredb/crates/flaredb-storage/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "flaredb-storage" +version.workspace = true +edition.workspace = true + +[dependencies] +rocksdb.workspace = true +thiserror.workspace = true +async-trait.workspace = true + +[dev-dependencies] +tempfile.workspace = true +tokio.workspace = true diff --git a/flaredb/crates/flaredb-storage/src/engine.rs b/flaredb/crates/flaredb-storage/src/engine.rs new file mode 100644 index 0000000..cbfccf8 --- /dev/null +++ b/flaredb/crates/flaredb-storage/src/engine.rs @@ -0,0 
+1,66 @@ +#[cfg(test)] +mod tests { + use crate::rocks_engine::RocksEngine; + use crate::StorageEngine; // Import trait + use std::sync::Arc; + use tempfile::TempDir; + + fn create_test_db() -> (Arc, TempDir) { + let temp_dir = TempDir::new().unwrap(); + let engine = Arc::new(RocksEngine::new(temp_dir.path().to_str().unwrap()).unwrap()); + (engine, temp_dir) + } + + #[tokio::test] + async fn test_raw_put_get() { + let (engine, _dir) = create_test_db(); + + let key = b"k1"; + let val = b"v1"; + + engine.put_raw(key, val).await.unwrap(); + let res = engine.get_raw(key).await.unwrap(); + assert_eq!(res, Some(val.to_vec())); + } + + #[tokio::test] + async fn test_cas_success() { + let (engine, _dir) = create_test_db(); + + let key = b"cas_key"; + let val = b"v1"; + + // Create (expected 0) + let res = engine.compare_and_swap(key, 0, val).await.unwrap(); + assert!(res.success); + assert_eq!(res.new_version, 1); // Assuming starts at 1 + let entry = engine.get_cas(key).await.unwrap().unwrap(); + assert_eq!(entry.0, 1); + assert_eq!(entry.1, val.to_vec()); + + // Update (expected 1) + let val2 = b"v2"; + let res2 = engine.compare_and_swap(key, 1, val2).await.unwrap(); + assert!(res2.success); + assert_eq!(res2.new_version, 2); + let entry2 = engine.get_cas(key).await.unwrap().unwrap(); + assert_eq!(entry2.0, 2); + assert_eq!(entry2.1, val2.to_vec()); + } + + #[tokio::test] + async fn test_cas_conflict() { + let (engine, _dir) = create_test_db(); + + let key = b"cas_conflict"; + let val = b"v1"; + + // Create + engine.compare_and_swap(key, 0, val).await.unwrap(); + + // Conflict (expected 0, but is 1) + let res = engine.compare_and_swap(key, 0, val).await.unwrap(); + assert!(!res.success); + assert_eq!(res.current_version, 1); + } +} diff --git a/flaredb/crates/flaredb-storage/src/lib.rs b/flaredb/crates/flaredb-storage/src/lib.rs new file mode 100644 index 0000000..30c2eaa --- /dev/null +++ b/flaredb/crates/flaredb-storage/src/lib.rs @@ -0,0 +1,51 @@ +pub mod engine; +pub 
mod rocks_engine; + +use async_trait::async_trait; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum StorageError { + #[error("RocksDB error: {0}")] + RocksDb(#[from] rocksdb::Error), + #[error("Serialization error: {0}")] + Serialization(String), +} + +#[derive(Debug)] +pub struct CasResult { + pub success: bool, + pub current_version: u64, + pub new_version: u64, +} + +#[async_trait] +pub trait StorageEngine: Send + Sync { + async fn put_raw(&self, key: &[u8], value: &[u8]) -> Result<(), StorageError>; + async fn get_raw(&self, key: &[u8]) -> Result>, StorageError>; + async fn delete_raw(&self, key: &[u8]) -> Result<(), StorageError>; + async fn get_cas(&self, key: &[u8]) -> Result)>, StorageError>; + async fn compare_and_swap( + &self, + key: &[u8], + expected_version: u64, + value: &[u8], + ) -> Result; + async fn delete_cas(&self, key: &[u8]) -> Result<(), StorageError>; + + /// Scan raw key-value pairs in range [start, end) + async fn scan_raw( + &self, + start: &[u8], + end: &[u8], + limit: usize, + ) -> Result, Vec)>, StorageError>; + + /// Scan CAS key-value pairs with versions in range [start, end) + async fn scan_cas( + &self, + start: &[u8], + end: &[u8], + limit: usize, + ) -> Result, Vec, u64)>, StorageError>; +} diff --git a/flaredb/crates/flaredb-storage/src/rocks_engine.rs b/flaredb/crates/flaredb-storage/src/rocks_engine.rs new file mode 100644 index 0000000..8d7c084 --- /dev/null +++ b/flaredb/crates/flaredb-storage/src/rocks_engine.rs @@ -0,0 +1,389 @@ +use crate::{CasResult, StorageEngine, StorageError}; +use async_trait::async_trait; +use rocksdb::{ColumnFamilyDescriptor, Options, DB}; +use std::sync::Arc; + +pub struct RocksEngine { + db: Arc, +} + +impl RocksEngine { + pub fn new(path: &str) -> Result { + let mut opts = Options::default(); + opts.create_if_missing(true); + opts.create_missing_column_families(true); + + let cf_opts = Options::default(); + let cfs = vec![ + ColumnFamilyDescriptor::new("default", cf_opts.clone()), + 
ColumnFamilyDescriptor::new("cas", cf_opts.clone()), + ColumnFamilyDescriptor::new("raft_log", cf_opts.clone()), + ColumnFamilyDescriptor::new("raft_state", cf_opts), + ]; + + let db = DB::open_cf_descriptors(&opts, path, cfs)?; + Ok(Self { db: Arc::new(db) }) + } + + pub fn db(&self) -> Arc { + Arc::clone(&self.db) + } + + /// Approximate bytes between [start, end). Used for split threshold decisions. + pub fn approximate_range_size(&self, start: &[u8], end: &[u8]) -> Result { + let cf = self + .db + .cf_handle("default") + .ok_or_else(|| StorageError::Serialization("CF default not found".to_string()))?; + let mut iter = self.db.iterator_cf( + &cf, + rocksdb::IteratorMode::From(start, rocksdb::Direction::Forward), + ); + let mut total: u64 = 0; + while let Some(Ok((k, v))) = iter.next() { + if !end.is_empty() && k.as_ref() >= end { + break; + } + total = total.saturating_add(k.len() as u64 + v.len() as u64); + } + Ok(total) + } + + fn get_version_from_val(val: &[u8]) -> Result { + if val.len() < 8 { + return Err(StorageError::Serialization( + "Value too short for version".to_string(), + )); + } + let mut bytes = [0u8; 8]; + bytes.copy_from_slice(&val[0..8]); + Ok(u64::from_be_bytes(bytes)) + } + + fn split_cas_value(val: &[u8]) -> Result<(u64, Vec), StorageError> { + let version = Self::get_version_from_val(val)?; + Ok((version, val[8..].to_vec())) + } + + // ==================== Raft Log Storage ==================== + + /// Get a Raft log entry by index + pub fn get_raft_log(&self, index: u64) -> Result>, StorageError> { + let cf = self + .db + .cf_handle("raft_log") + .ok_or_else(|| StorageError::Serialization("CF raft_log not found".to_string()))?; + let key = index.to_be_bytes(); + self.db.get_cf(&cf, key).map_err(StorageError::RocksDb) + } + + /// Put a Raft log entry + pub fn put_raft_log(&self, index: u64, entry: &[u8]) -> Result<(), StorageError> { + let cf = self + .db + .cf_handle("raft_log") + .ok_or_else(|| StorageError::Serialization("CF raft_log not 
found".to_string()))?; + let key = index.to_be_bytes(); + self.db + .put_cf(&cf, key, entry) + .map_err(StorageError::RocksDb) + } + + /// Delete Raft log entries from start_index to end_index (inclusive) + pub fn delete_raft_log_range( + &self, + start_index: u64, + end_index: u64, + ) -> Result<(), StorageError> { + let cf = self + .db + .cf_handle("raft_log") + .ok_or_else(|| StorageError::Serialization("CF raft_log not found".to_string()))?; + for index in start_index..=end_index { + let key = index.to_be_bytes(); + self.db.delete_cf(&cf, key).map_err(StorageError::RocksDb)?; + } + Ok(()) + } + + /// Get Raft log entries in range [start, end) + pub fn get_raft_log_range( + &self, + start: u64, + end: Option, + ) -> Result)>, StorageError> { + let cf = self + .db + .cf_handle("raft_log") + .ok_or_else(|| StorageError::Serialization("CF raft_log not found".to_string()))?; + + let start_key = start.to_be_bytes(); + let iter = self.db.iterator_cf( + &cf, + rocksdb::IteratorMode::From(&start_key, rocksdb::Direction::Forward), + ); + + let mut entries = Vec::new(); + for item in iter { + let (key_bytes, value) = item.map_err(StorageError::RocksDb)?; + if key_bytes.len() != 8 { + continue; + } + let mut key_arr = [0u8; 8]; + key_arr.copy_from_slice(&key_bytes); + let index = u64::from_be_bytes(key_arr); + + if let Some(end_idx) = end { + if index >= end_idx { + break; + } + } + entries.push((index, value.to_vec())); + } + Ok(entries) + } + + /// Get first and last log index + pub fn get_raft_log_bounds(&self) -> Result<(Option, Option), StorageError> { + let cf = self + .db + .cf_handle("raft_log") + .ok_or_else(|| StorageError::Serialization("CF raft_log not found".to_string()))?; + + // Get first + let first = { + let mut iter = self + .db + .iterator_cf(&cf, rocksdb::IteratorMode::Start); + iter.next() + .and_then(|r| r.ok()) + .and_then(|(k, _)| { + if k.len() == 8 { + let mut arr = [0u8; 8]; + arr.copy_from_slice(&k); + Some(u64::from_be_bytes(arr)) + } else { + 
None + } + }) + }; + + // Get last + let last = { + let mut iter = self.db.iterator_cf(&cf, rocksdb::IteratorMode::End); + iter.next() + .and_then(|r| r.ok()) + .and_then(|(k, _)| { + if k.len() == 8 { + let mut arr = [0u8; 8]; + arr.copy_from_slice(&k); + Some(u64::from_be_bytes(arr)) + } else { + None + } + }) + }; + + Ok((first, last)) + } + + // ==================== Raft State Storage ==================== + + /// Get Raft state value by key + pub fn get_raft_state(&self, key: &str) -> Result>, StorageError> { + let cf = self + .db + .cf_handle("raft_state") + .ok_or_else(|| StorageError::Serialization("CF raft_state not found".to_string()))?; + self.db + .get_cf(&cf, key.as_bytes()) + .map_err(StorageError::RocksDb) + } + + /// Put Raft state value + pub fn put_raft_state(&self, key: &str, value: &[u8]) -> Result<(), StorageError> { + let cf = self + .db + .cf_handle("raft_state") + .ok_or_else(|| StorageError::Serialization("CF raft_state not found".to_string()))?; + self.db + .put_cf(&cf, key.as_bytes(), value) + .map_err(StorageError::RocksDb) + } + + /// Delete Raft state value + pub fn delete_raft_state(&self, key: &str) -> Result<(), StorageError> { + let cf = self + .db + .cf_handle("raft_state") + .ok_or_else(|| StorageError::Serialization("CF raft_state not found".to_string()))?; + self.db + .delete_cf(&cf, key.as_bytes()) + .map_err(StorageError::RocksDb) + } +} + +#[async_trait] +impl StorageEngine for RocksEngine { + async fn put_raw(&self, key: &[u8], value: &[u8]) -> Result<(), StorageError> { + let cf = self + .db + .cf_handle("default") + .ok_or_else(|| StorageError::Serialization("CF default not found".to_string()))?; + + self.db + .put_cf(&cf, key, value) + .map_err(StorageError::RocksDb) + } + + async fn get_raw(&self, key: &[u8]) -> Result>, StorageError> { + let cf = self + .db + .cf_handle("default") + .ok_or_else(|| StorageError::Serialization("CF default not found".to_string()))?; + self.db.get_cf(&cf, key).map_err(StorageError::RocksDb) 
+ } + + async fn delete_raw(&self, key: &[u8]) -> Result<(), StorageError> { + let cf = self + .db + .cf_handle("default") + .ok_or_else(|| StorageError::Serialization("CF default not found".to_string()))?; + self.db + .delete_cf(&cf, key) + .map_err(StorageError::RocksDb) + } + + async fn get_cas(&self, key: &[u8]) -> Result)>, StorageError> { + let cf = self + .db + .cf_handle("cas") + .ok_or_else(|| StorageError::Serialization("CF cas not found".to_string()))?; + + let raw = self.db.get_cf(&cf, key)?; + match raw { + Some(bytes) => Ok(Some(Self::split_cas_value(&bytes)?)), + None => Ok(None), + } + } + + async fn compare_and_swap( + &self, + key: &[u8], + expected_version: u64, + value: &[u8], + ) -> Result { + let cf = self + .db + .cf_handle("cas") + .ok_or_else(|| StorageError::Serialization("CF cas not found".to_string()))?; + + // Note: In a real distributed system with concurrent local threads, we'd need a mutex or RocksDB TransactionDB. + // For this MVP (single-node, or single-threaded access per key assumption for now), simple get/put logic is implemented. + // Since we are inside async context, we should be careful about blocking. RocksDB is blocking. + // For true atomic CAS in RocksDB without TransactionDB, we'd use MergeOperator or a custom C++ extension. + // Here we implement a "pessimistic" check-then-act which is safe ONLY if we strictly serialize access to this key. + // FlareDB architecture assumes Single-Leader (Raft) or serial execution for a given key range, so this is "acceptable" for Phase 1 MVP. + + // 1. 
Get current + let current_val_opt = self.db.get_cf(&cf, key)?; + let current_version = match ¤t_val_opt { + Some(v) => Self::get_version_from_val(v)?, + None => 0, + }; + + if current_version != expected_version { + return Ok(CasResult { + success: false, + current_version, + new_version: 0, + }); + } + + let new_version = current_version + 1; + let mut new_val_bytes = Vec::with_capacity(8 + value.len()); + new_val_bytes.extend_from_slice(&new_version.to_be_bytes()); + new_val_bytes.extend_from_slice(value); + + self.db.put_cf(&cf, key, &new_val_bytes)?; + + Ok(CasResult { + success: true, + current_version, + new_version, + }) + } + + async fn delete_cas(&self, key: &[u8]) -> Result<(), StorageError> { + let cf = self + .db + .cf_handle("cas") + .ok_or_else(|| StorageError::Serialization("CF cas not found".to_string()))?; + self.db + .delete_cf(&cf, key) + .map_err(StorageError::RocksDb) + } + + async fn scan_raw( + &self, + start: &[u8], + end: &[u8], + limit: usize, + ) -> Result, Vec)>, StorageError> { + let cf = self + .db + .cf_handle("default") + .ok_or_else(|| StorageError::Serialization("CF default not found".to_string()))?; + + let iter = self.db.iterator_cf( + &cf, + rocksdb::IteratorMode::From(start, rocksdb::Direction::Forward), + ); + + let mut entries = Vec::new(); + for item in iter { + if entries.len() >= limit { + break; + } + let (key, value) = item.map_err(StorageError::RocksDb)?; + // Stop if we've passed the end key + if !end.is_empty() && key.as_ref() >= end { + break; + } + entries.push((key.to_vec(), value.to_vec())); + } + Ok(entries) + } + + async fn scan_cas( + &self, + start: &[u8], + end: &[u8], + limit: usize, + ) -> Result, Vec, u64)>, StorageError> { + let cf = self + .db + .cf_handle("cas") + .ok_or_else(|| StorageError::Serialization("CF cas not found".to_string()))?; + + let iter = self.db.iterator_cf( + &cf, + rocksdb::IteratorMode::From(start, rocksdb::Direction::Forward), + ); + + let mut entries = Vec::new(); + for item in 
iter { + if entries.len() >= limit { + break; + } + let (key, value) = item.map_err(StorageError::RocksDb)?; + // Stop if we've passed the end key + if !end.is_empty() && key.as_ref() >= end { + break; + } + let (version, data) = Self::split_cas_value(&value)?; + entries.push((key.to_vec(), data, version)); + } + Ok(entries) + } +} diff --git a/flaredb/crates/flaredb-types/Cargo.toml b/flaredb/crates/flaredb-types/Cargo.toml new file mode 100644 index 0000000..2d3936a --- /dev/null +++ b/flaredb/crates/flaredb-types/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "flaredb-types" +version.workspace = true +edition.workspace = true + +[dependencies] +serde.workspace = true +thiserror.workspace = true +anyhow.workspace = true diff --git a/flaredb/crates/flaredb-types/src/command.rs b/flaredb/crates/flaredb-types/src/command.rs new file mode 100644 index 0000000..5a9d5ac --- /dev/null +++ b/flaredb/crates/flaredb-types/src/command.rs @@ -0,0 +1,35 @@ +//! Raft command types for log entries. + +/// Command opcode for KV operations. +pub const CMD_KV: u8 = 0; + +/// Command opcode for region split. +pub const CMD_SPLIT: u8 = 1; + +/// Encode a Split command payload. +#[allow(dead_code)] +pub fn encode_split_command(region_id: u64, split_key: &[u8], new_region_id: u64) -> Vec { + let mut buf = Vec::with_capacity(1 + 8 + 4 + split_key.len() + 8); + buf.push(CMD_SPLIT); + buf.extend_from_slice(®ion_id.to_be_bytes()); + buf.extend_from_slice(&(split_key.len() as u32).to_be_bytes()); + buf.extend_from_slice(split_key); + buf.extend_from_slice(&new_region_id.to_be_bytes()); + buf +} + +/// Decode a Split command payload. +/// Returns (region_id, split_key, new_region_id). +pub fn decode_split_command(data: &[u8]) -> Option<(u64, Vec, u64)> { + if data.len() < 1 + 8 + 4 + 8 { + return None; + } + let region_id = u64::from_be_bytes(data[1..9].try_into().ok()?); + let klen = u32::from_be_bytes(data[9..13].try_into().ok()?) 
as usize; + if data.len() < 13 + klen + 8 { + return None; + } + let split_key = data[13..13 + klen].to_vec(); + let new_region_id = u64::from_be_bytes(data[13 + klen..13 + klen + 8].try_into().ok()?); + Some((region_id, split_key, new_region_id)) +} diff --git a/flaredb/crates/flaredb-types/src/error.rs b/flaredb/crates/flaredb-types/src/error.rs new file mode 100644 index 0000000..0bcf2f2 --- /dev/null +++ b/flaredb/crates/flaredb-types/src/error.rs @@ -0,0 +1,34 @@ +//! Error types for FlareDB. + +use thiserror::Error; + +/// Main error type for FlareDB operations. +#[derive(Error, Debug)] +pub enum FlareError { + #[error("Storage error: {0}")] + Storage(String), + + #[error("Raft error: {0}")] + Raft(String), + + #[error("Network error: {0}")] + Network(String), + + #[error("PD error: {0}")] + PlacementDriver(String), + + #[error("Region not found: {0}")] + RegionNotFound(u64), + + #[error("Not leader for region {region_id}")] + NotLeader { region_id: u64 }, + + #[error("Invalid operation: {0}")] + InvalidOperation(String), + + #[error("Internal error: {0}")] + Internal(#[from] anyhow::Error), +} + +/// Result type alias for FlareDB operations. +pub type FlareResult = std::result::Result; diff --git a/flaredb/crates/flaredb-types/src/lib.rs b/flaredb/crates/flaredb-types/src/lib.rs new file mode 100644 index 0000000..bb7ec74 --- /dev/null +++ b/flaredb/crates/flaredb-types/src/lib.rs @@ -0,0 +1,11 @@ +//! Core types for FlareDB. +//! +//! This crate provides shared type definitions used across FlareDB components. + +pub mod command; +mod error; +mod region; + +pub use command::*; +pub use error::*; +pub use region::*; diff --git a/flaredb/crates/flaredb-types/src/region.rs b/flaredb/crates/flaredb-types/src/region.rs new file mode 100644 index 0000000..c5ad874 --- /dev/null +++ b/flaredb/crates/flaredb-types/src/region.rs @@ -0,0 +1,32 @@ +//! Region metadata types. + +use serde::{Deserialize, Serialize}; + +/// Metadata for a region (key range partition). 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RegionMeta { + /// Unique region ID. + pub id: u64, + /// Start key (inclusive). Empty means start of keyspace. + pub start_key: Vec, + /// End key (exclusive). Empty means end of keyspace (infinity). + pub end_key: Vec, +} + +impl RegionMeta { + /// Create a new region covering the entire keyspace. + pub fn full(id: u64) -> Self { + Self { + id, + start_key: Vec::new(), + end_key: Vec::new(), + } + } + + /// Check if a key belongs to this region. + pub fn contains_key(&self, key: &[u8]) -> bool { + let start_ok = key >= self.start_key.as_slice(); + let end_ok = self.end_key.is_empty() || key < self.end_key.as_slice(); + start_ok && end_ok + } +} diff --git a/flaredb/data/CURRENT b/flaredb/data/CURRENT new file mode 100644 index 0000000..aa5bb8e --- /dev/null +++ b/flaredb/data/CURRENT @@ -0,0 +1 @@ +MANIFEST-000005 diff --git a/flaredb/data/IDENTITY b/flaredb/data/IDENTITY new file mode 100644 index 0000000..cf98e97 --- /dev/null +++ b/flaredb/data/IDENTITY @@ -0,0 +1 @@ +75e7dbab-ce09-4522-a98d-2fcf0772294c \ No newline at end of file diff --git a/flaredb/data/LOCK b/flaredb/data/LOCK new file mode 100644 index 0000000..e69de29 diff --git a/flaredb/data/LOG b/flaredb/data/LOG new file mode 100644 index 0000000..8ddcf31 --- /dev/null +++ b/flaredb/data/LOG @@ -0,0 +1,587 @@ +2025/12/01-00:28:55.781522 403546 RocksDB version: 10.5.1 +2025/12/01-00:28:55.781610 403546 Git sha 0 +2025/12/01-00:28:55.781615 403546 Compile date 1980-01-01 00:00:00 +2025/12/01-00:28:55.781625 403546 DB SUMMARY +2025/12/01-00:28:55.781630 403546 Host name (Env): cn-nixos-think +2025/12/01-00:28:55.781634 403546 DB Session ID: DR3O07DHLF82N8IYLP28 +2025/12/01-00:28:55.781660 403546 SST files in data dir, Total Num: 0, files: +2025/12/01-00:28:55.781665 403546 Write Ahead Log file in data: +2025/12/01-00:28:55.781669 403546 Options.error_if_exists: 0 +2025/12/01-00:28:55.781674 403546 Options.create_if_missing: 1 
+2025/12/01-00:28:55.781678 403546 Options.paranoid_checks: 1 +2025/12/01-00:28:55.781682 403546 Options.flush_verify_memtable_count: 1 +2025/12/01-00:28:55.781686 403546 Options.compaction_verify_record_count: 1 +2025/12/01-00:28:55.781689 403546 Options.track_and_verify_wals_in_manifest: 0 +2025/12/01-00:28:55.781693 403546 Options.track_and_verify_wals: 0 +2025/12/01-00:28:55.781697 403546 Options.verify_sst_unique_id_in_manifest: 1 +2025/12/01-00:28:55.781700 403546 Options.env: 0x555555c9ae60 +2025/12/01-00:28:55.781704 403546 Options.fs: PosixFileSystem +2025/12/01-00:28:55.781709 403546 Options.info_log: 0x555555cf9520 +2025/12/01-00:28:55.781712 403546 Options.max_file_opening_threads: 16 +2025/12/01-00:28:55.781716 403546 Options.statistics: (nil) +2025/12/01-00:28:55.781719 403546 Options.use_fsync: 0 +2025/12/01-00:28:55.781723 403546 Options.max_log_file_size: 0 +2025/12/01-00:28:55.781728 403546 Options.max_manifest_file_size: 1073741824 +2025/12/01-00:28:55.781731 403546 Options.log_file_time_to_roll: 0 +2025/12/01-00:28:55.781735 403546 Options.keep_log_file_num: 1000 +2025/12/01-00:28:55.781739 403546 Options.recycle_log_file_num: 0 +2025/12/01-00:28:55.781743 403546 Options.allow_fallocate: 1 +2025/12/01-00:28:55.781747 403546 Options.allow_mmap_reads: 0 +2025/12/01-00:28:55.781750 403546 Options.allow_mmap_writes: 0 +2025/12/01-00:28:55.781754 403546 Options.use_direct_reads: 0 +2025/12/01-00:28:55.781758 403546 Options.use_direct_io_for_flush_and_compaction: 0 +2025/12/01-00:28:55.781762 403546 Options.create_missing_column_families: 1 +2025/12/01-00:28:55.781766 403546 Options.db_log_dir: +2025/12/01-00:28:55.781769 403546 Options.wal_dir: +2025/12/01-00:28:55.781773 403546 Options.table_cache_numshardbits: 6 +2025/12/01-00:28:55.781776 403546 Options.WAL_ttl_seconds: 0 +2025/12/01-00:28:55.781780 403546 Options.WAL_size_limit_MB: 0 +2025/12/01-00:28:55.781784 403546 Options.max_write_batch_group_size_bytes: 1048576 +2025/12/01-00:28:55.781788 
403546 Options.manifest_preallocation_size: 4194304 +2025/12/01-00:28:55.781792 403546 Options.is_fd_close_on_exec: 1 +2025/12/01-00:28:55.781795 403546 Options.advise_random_on_open: 1 +2025/12/01-00:28:55.781799 403546 Options.db_write_buffer_size: 0 +2025/12/01-00:28:55.781803 403546 Options.write_buffer_manager: 0x555555cf9710 +2025/12/01-00:28:55.781807 403546 Options.use_adaptive_mutex: 0 +2025/12/01-00:28:55.781810 403546 Options.rate_limiter: (nil) +2025/12/01-00:28:55.781815 403546 Options.sst_file_manager.rate_bytes_per_sec: 0 +2025/12/01-00:28:55.781818 403546 Options.wal_recovery_mode: 2 +2025/12/01-00:28:55.781822 403546 Options.enable_thread_tracking: 0 +2025/12/01-00:28:55.781826 403546 Options.enable_pipelined_write: 0 +2025/12/01-00:28:55.781830 403546 Options.unordered_write: 0 +2025/12/01-00:28:55.781834 403546 Options.allow_concurrent_memtable_write: 1 +2025/12/01-00:28:55.781837 403546 Options.enable_write_thread_adaptive_yield: 1 +2025/12/01-00:28:55.781841 403546 Options.write_thread_max_yield_usec: 100 +2025/12/01-00:28:55.781845 403546 Options.write_thread_slow_yield_usec: 3 +2025/12/01-00:28:55.781849 403546 Options.row_cache: None +2025/12/01-00:28:55.781853 403546 Options.wal_filter: None +2025/12/01-00:28:55.781857 403546 Options.avoid_flush_during_recovery: 0 +2025/12/01-00:28:55.781861 403546 Options.allow_ingest_behind: 0 +2025/12/01-00:28:55.781864 403546 Options.two_write_queues: 0 +2025/12/01-00:28:55.781868 403546 Options.manual_wal_flush: 0 +2025/12/01-00:28:55.781872 403546 Options.wal_compression: 0 +2025/12/01-00:28:55.781876 403546 Options.background_close_inactive_wals: 0 +2025/12/01-00:28:55.781879 403546 Options.atomic_flush: 0 +2025/12/01-00:28:55.781883 403546 Options.avoid_unnecessary_blocking_io: 0 +2025/12/01-00:28:55.781903 403546 Options.prefix_seek_opt_in_only: 0 +2025/12/01-00:28:55.781906 403546 Options.persist_stats_to_disk: 0 +2025/12/01-00:28:55.781910 403546 Options.write_dbid_to_manifest: 1 
+2025/12/01-00:28:55.781914 403546 Options.write_identity_file: 1 +2025/12/01-00:28:55.781917 403546 Options.log_readahead_size: 0 +2025/12/01-00:28:55.781921 403546 Options.file_checksum_gen_factory: Unknown +2025/12/01-00:28:55.781925 403546 Options.best_efforts_recovery: 0 +2025/12/01-00:28:55.781929 403546 Options.max_bgerror_resume_count: 2147483647 +2025/12/01-00:28:55.781932 403546 Options.bgerror_resume_retry_interval: 1000000 +2025/12/01-00:28:55.781936 403546 Options.allow_data_in_errors: 0 +2025/12/01-00:28:55.781939 403546 Options.db_host_id: __hostname__ +2025/12/01-00:28:55.781942 403546 Options.enforce_single_del_contracts: true +2025/12/01-00:28:55.781947 403546 Options.metadata_write_temperature: kUnknown +2025/12/01-00:28:55.781951 403546 Options.wal_write_temperature: kUnknown +2025/12/01-00:28:55.781955 403546 Options.max_background_jobs: 2 +2025/12/01-00:28:55.781959 403546 Options.max_background_compactions: -1 +2025/12/01-00:28:55.781962 403546 Options.max_subcompactions: 1 +2025/12/01-00:28:55.781966 403546 Options.avoid_flush_during_shutdown: 0 +2025/12/01-00:28:55.781970 403546 Options.writable_file_max_buffer_size: 1048576 +2025/12/01-00:28:55.781974 403546 Options.delayed_write_rate : 16777216 +2025/12/01-00:28:55.781978 403546 Options.max_total_wal_size: 0 +2025/12/01-00:28:55.781981 403546 Options.delete_obsolete_files_period_micros: 21600000000 +2025/12/01-00:28:55.781985 403546 Options.stats_dump_period_sec: 600 +2025/12/01-00:28:55.781989 403546 Options.stats_persist_period_sec: 600 +2025/12/01-00:28:55.781992 403546 Options.stats_history_buffer_size: 1048576 +2025/12/01-00:28:55.781996 403546 Options.max_open_files: -1 +2025/12/01-00:28:55.782000 403546 Options.bytes_per_sync: 0 +2025/12/01-00:28:55.782004 403546 Options.wal_bytes_per_sync: 0 +2025/12/01-00:28:55.782008 403546 Options.strict_bytes_per_sync: 0 +2025/12/01-00:28:55.782011 403546 Options.compaction_readahead_size: 2097152 +2025/12/01-00:28:55.782015 403546 
Options.max_background_flushes: -1 +2025/12/01-00:28:55.782019 403546 Options.daily_offpeak_time_utc: +2025/12/01-00:28:55.782023 403546 Compression algorithms supported: +2025/12/01-00:28:55.782027 403546 kCustomCompressionFE supported: 0 +2025/12/01-00:28:55.782031 403546 kCustomCompressionFC supported: 0 +2025/12/01-00:28:55.782035 403546 kCustomCompressionF8 supported: 0 +2025/12/01-00:28:55.782039 403546 kCustomCompressionF7 supported: 0 +2025/12/01-00:28:55.782043 403546 kCustomCompressionB2 supported: 0 +2025/12/01-00:28:55.782046 403546 kLZ4Compression supported: 1 +2025/12/01-00:28:55.782050 403546 kCustomCompression88 supported: 0 +2025/12/01-00:28:55.782054 403546 kCustomCompressionD8 supported: 0 +2025/12/01-00:28:55.782057 403546 kCustomCompression9F supported: 0 +2025/12/01-00:28:55.782060 403546 kCustomCompressionD6 supported: 0 +2025/12/01-00:28:55.782063 403546 kCustomCompressionA9 supported: 0 +2025/12/01-00:28:55.782067 403546 kCustomCompressionEC supported: 0 +2025/12/01-00:28:55.782071 403546 kCustomCompressionA3 supported: 0 +2025/12/01-00:28:55.782074 403546 kCustomCompressionCB supported: 0 +2025/12/01-00:28:55.782077 403546 kCustomCompression90 supported: 0 +2025/12/01-00:28:55.782080 403546 kCustomCompressionA0 supported: 0 +2025/12/01-00:28:55.782083 403546 kCustomCompressionC6 supported: 0 +2025/12/01-00:28:55.782086 403546 kCustomCompression9D supported: 0 +2025/12/01-00:28:55.782090 403546 kCustomCompression8B supported: 0 +2025/12/01-00:28:55.782094 403546 kCustomCompressionA8 supported: 0 +2025/12/01-00:28:55.782098 403546 kCustomCompression8D supported: 0 +2025/12/01-00:28:55.782101 403546 kCustomCompression97 supported: 0 +2025/12/01-00:28:55.782104 403546 kCustomCompression98 supported: 0 +2025/12/01-00:28:55.782107 403546 kCustomCompressionAC supported: 0 +2025/12/01-00:28:55.782110 403546 kCustomCompressionE9 supported: 0 +2025/12/01-00:28:55.782112 403546 kCustomCompression96 supported: 0 +2025/12/01-00:28:55.782116 403546 
kCustomCompressionB1 supported: 0 +2025/12/01-00:28:55.782119 403546 kCustomCompression95 supported: 0 +2025/12/01-00:28:55.782123 403546 kCustomCompression84 supported: 0 +2025/12/01-00:28:55.782127 403546 kCustomCompression91 supported: 0 +2025/12/01-00:28:55.782131 403546 kCustomCompressionAB supported: 0 +2025/12/01-00:28:55.782135 403546 kCustomCompressionB3 supported: 0 +2025/12/01-00:28:55.782139 403546 kCustomCompression81 supported: 0 +2025/12/01-00:28:55.782142 403546 kCustomCompressionDC supported: 0 +2025/12/01-00:28:55.782146 403546 kBZip2Compression supported: 1 +2025/12/01-00:28:55.782150 403546 kCustomCompressionBB supported: 0 +2025/12/01-00:28:55.782153 403546 kCustomCompression9C supported: 0 +2025/12/01-00:28:55.782157 403546 kCustomCompressionC9 supported: 0 +2025/12/01-00:28:55.782161 403546 kCustomCompressionCC supported: 0 +2025/12/01-00:28:55.782165 403546 kCustomCompression92 supported: 0 +2025/12/01-00:28:55.782169 403546 kCustomCompressionB9 supported: 0 +2025/12/01-00:28:55.782173 403546 kCustomCompression8F supported: 0 +2025/12/01-00:28:55.782177 403546 kCustomCompression8A supported: 0 +2025/12/01-00:28:55.782181 403546 kCustomCompression9B supported: 0 +2025/12/01-00:28:55.782184 403546 kZSTD supported: 1 +2025/12/01-00:28:55.782189 403546 kCustomCompressionAA supported: 0 +2025/12/01-00:28:55.782193 403546 kCustomCompressionA2 supported: 0 +2025/12/01-00:28:55.782196 403546 kZlibCompression supported: 1 +2025/12/01-00:28:55.782200 403546 kXpressCompression supported: 0 +2025/12/01-00:28:55.782204 403546 kCustomCompressionFD supported: 0 +2025/12/01-00:28:55.782208 403546 kCustomCompressionE2 supported: 0 +2025/12/01-00:28:55.782212 403546 kLZ4HCCompression supported: 1 +2025/12/01-00:28:55.782216 403546 kCustomCompressionA6 supported: 0 +2025/12/01-00:28:55.782219 403546 kCustomCompression85 supported: 0 +2025/12/01-00:28:55.782223 403546 kCustomCompressionA4 supported: 0 +2025/12/01-00:28:55.782227 403546 kCustomCompression86 
supported: 0 +2025/12/01-00:28:55.782231 403546 kCustomCompression83 supported: 0 +2025/12/01-00:28:55.782234 403546 kCustomCompression87 supported: 0 +2025/12/01-00:28:55.782238 403546 kCustomCompression89 supported: 0 +2025/12/01-00:28:55.782242 403546 kCustomCompression8C supported: 0 +2025/12/01-00:28:55.782245 403546 kCustomCompressionDB supported: 0 +2025/12/01-00:28:55.782249 403546 kCustomCompressionF3 supported: 0 +2025/12/01-00:28:55.782253 403546 kCustomCompressionE6 supported: 0 +2025/12/01-00:28:55.782256 403546 kCustomCompression8E supported: 0 +2025/12/01-00:28:55.782260 403546 kCustomCompressionDA supported: 0 +2025/12/01-00:28:55.782264 403546 kCustomCompression93 supported: 0 +2025/12/01-00:28:55.782268 403546 kCustomCompression94 supported: 0 +2025/12/01-00:28:55.782272 403546 kCustomCompression9E supported: 0 +2025/12/01-00:28:55.782275 403546 kCustomCompressionB4 supported: 0 +2025/12/01-00:28:55.782279 403546 kCustomCompressionFB supported: 0 +2025/12/01-00:28:55.782283 403546 kCustomCompressionB5 supported: 0 +2025/12/01-00:28:55.782287 403546 kCustomCompressionD5 supported: 0 +2025/12/01-00:28:55.782291 403546 kCustomCompressionB8 supported: 0 +2025/12/01-00:28:55.782295 403546 kCustomCompressionD1 supported: 0 +2025/12/01-00:28:55.782299 403546 kCustomCompressionBA supported: 0 +2025/12/01-00:28:55.782303 403546 kCustomCompressionBC supported: 0 +2025/12/01-00:28:55.782306 403546 kCustomCompressionCE supported: 0 +2025/12/01-00:28:55.782310 403546 kCustomCompressionBD supported: 0 +2025/12/01-00:28:55.782314 403546 kCustomCompressionC4 supported: 0 +2025/12/01-00:28:55.782318 403546 kCustomCompression9A supported: 0 +2025/12/01-00:28:55.782322 403546 kCustomCompression99 supported: 0 +2025/12/01-00:28:55.782326 403546 kCustomCompressionBE supported: 0 +2025/12/01-00:28:55.782330 403546 kCustomCompressionE5 supported: 0 +2025/12/01-00:28:55.782333 403546 kCustomCompressionD9 supported: 0 +2025/12/01-00:28:55.782337 403546 
kCustomCompressionC1 supported: 0 +2025/12/01-00:28:55.782341 403546 kCustomCompressionC5 supported: 0 +2025/12/01-00:28:55.782345 403546 kCustomCompressionC2 supported: 0 +2025/12/01-00:28:55.782349 403546 kCustomCompressionA5 supported: 0 +2025/12/01-00:28:55.782352 403546 kCustomCompressionC7 supported: 0 +2025/12/01-00:28:55.782356 403546 kCustomCompressionBF supported: 0 +2025/12/01-00:28:55.782360 403546 kCustomCompressionE8 supported: 0 +2025/12/01-00:28:55.782364 403546 kCustomCompressionC8 supported: 0 +2025/12/01-00:28:55.782368 403546 kCustomCompressionAF supported: 0 +2025/12/01-00:28:55.782372 403546 kCustomCompressionCA supported: 0 +2025/12/01-00:28:55.782375 403546 kCustomCompressionCD supported: 0 +2025/12/01-00:28:55.782379 403546 kCustomCompressionC0 supported: 0 +2025/12/01-00:28:55.782383 403546 kCustomCompressionCF supported: 0 +2025/12/01-00:28:55.782387 403546 kCustomCompressionF9 supported: 0 +2025/12/01-00:28:55.782391 403546 kCustomCompressionD0 supported: 0 +2025/12/01-00:28:55.782395 403546 kCustomCompressionD2 supported: 0 +2025/12/01-00:28:55.782399 403546 kCustomCompressionAD supported: 0 +2025/12/01-00:28:55.782402 403546 kCustomCompressionD3 supported: 0 +2025/12/01-00:28:55.782406 403546 kCustomCompressionD4 supported: 0 +2025/12/01-00:28:55.782410 403546 kCustomCompressionD7 supported: 0 +2025/12/01-00:28:55.782414 403546 kCustomCompression82 supported: 0 +2025/12/01-00:28:55.782417 403546 kCustomCompressionDD supported: 0 +2025/12/01-00:28:55.782421 403546 kCustomCompressionC3 supported: 0 +2025/12/01-00:28:55.782424 403546 kCustomCompressionEE supported: 0 +2025/12/01-00:28:55.782428 403546 kCustomCompressionDE supported: 0 +2025/12/01-00:28:55.782432 403546 kCustomCompressionDF supported: 0 +2025/12/01-00:28:55.782435 403546 kCustomCompressionA7 supported: 0 +2025/12/01-00:28:55.782439 403546 kCustomCompressionE0 supported: 0 +2025/12/01-00:28:55.782443 403546 kCustomCompressionF1 supported: 0 +2025/12/01-00:28:55.782446 
403546 kCustomCompressionE1 supported: 0 +2025/12/01-00:28:55.782450 403546 kCustomCompressionF5 supported: 0 +2025/12/01-00:28:55.782453 403546 kCustomCompression80 supported: 0 +2025/12/01-00:28:55.782457 403546 kCustomCompressionE3 supported: 0 +2025/12/01-00:28:55.782461 403546 kCustomCompressionE4 supported: 0 +2025/12/01-00:28:55.782465 403546 kCustomCompressionB0 supported: 0 +2025/12/01-00:28:55.782469 403546 kCustomCompressionEA supported: 0 +2025/12/01-00:28:55.782476 403546 kCustomCompressionFA supported: 0 +2025/12/01-00:28:55.782480 403546 kCustomCompressionE7 supported: 0 +2025/12/01-00:28:55.782484 403546 kCustomCompressionAE supported: 0 +2025/12/01-00:28:55.782487 403546 kCustomCompressionEB supported: 0 +2025/12/01-00:28:55.782491 403546 kCustomCompressionED supported: 0 +2025/12/01-00:28:55.782494 403546 kCustomCompressionB6 supported: 0 +2025/12/01-00:28:55.782498 403546 kCustomCompressionEF supported: 0 +2025/12/01-00:28:55.782502 403546 kCustomCompressionF0 supported: 0 +2025/12/01-00:28:55.782505 403546 kCustomCompressionB7 supported: 0 +2025/12/01-00:28:55.782509 403546 kCustomCompressionF2 supported: 0 +2025/12/01-00:28:55.782513 403546 kCustomCompressionA1 supported: 0 +2025/12/01-00:28:55.782517 403546 kCustomCompressionF4 supported: 0 +2025/12/01-00:28:55.782521 403546 kSnappyCompression supported: 1 +2025/12/01-00:28:55.782524 403546 kCustomCompressionF6 supported: 0 +2025/12/01-00:28:55.782528 403546 Fast CRC32 supported: Not supported on x86 +2025/12/01-00:28:55.782533 403546 DMutex implementation: pthread_mutex_t +2025/12/01-00:28:55.782538 403546 Jemalloc supported: 0 +2025/12/01-00:28:55.784126 403546 [db/db_impl/db_impl_open.cc:312] Creating manifest 1 +2025/12/01-00:28:55.785922 403546 [db/version_set.cc:6122] Recovering from manifest file: data/MANIFEST-000001 +2025/12/01-00:28:55.786118 403546 [db/column_family.cc:690] --------------- Options for column family [default]: +2025/12/01-00:28:55.786124 403546 Options.comparator: 
leveldb.BytewiseComparator +2025/12/01-00:28:55.786129 403546 Options.merge_operator: None +2025/12/01-00:28:55.786133 403546 Options.compaction_filter: None +2025/12/01-00:28:55.786137 403546 Options.compaction_filter_factory: None +2025/12/01-00:28:55.786141 403546 Options.sst_partitioner_factory: None +2025/12/01-00:28:55.786144 403546 Options.memtable_factory: SkipListFactory +2025/12/01-00:28:55.786147 403546 Options.table_factory: BlockBasedTable +2025/12/01-00:28:55.786182 403546 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x555555ceacc0) + cache_index_and_filter_blocks: 0 + cache_index_and_filter_blocks_with_high_priority: 1 + pin_l0_filter_and_index_blocks_in_cache: 0 + pin_top_level_index_and_filter: 1 + index_type: 0 + data_block_index_type: 0 + index_shortening: 1 + data_block_hash_table_util_ratio: 0.750000 + checksum: 4 + no_block_cache: 0 + block_cache: 0x555555ceb020 + block_cache_name: LRUCache + block_cache_options: + capacity : 33554432 + num_shard_bits : 6 + strict_capacity_limit : 0 + memory_allocator : None + high_pri_pool_ratio: 0.500 + low_pri_pool_ratio: 0.000 + persistent_cache: (nil) + block_size: 4096 + block_size_deviation: 10 + block_restart_interval: 16 + index_block_restart_interval: 1 + metadata_block_size: 4096 + partition_filters: 0 + use_delta_encoding: 1 + filter_policy: nullptr + whole_key_filtering: 1 + verify_compression: 0 + read_amp_bytes_per_bit: 0 + format_version: 6 + enable_index_compression: 1 + block_align: 0 + max_auto_readahead_size: 262144 + prepopulate_block_cache: 0 + initial_auto_readahead_size: 8192 + num_file_reads_for_auto_readahead: 2 +2025/12/01-00:28:55.786200 403546 Options.write_buffer_size: 67108864 +2025/12/01-00:28:55.786204 403546 Options.max_write_buffer_number: 2 +2025/12/01-00:28:55.786208 403546 Options.compression: Snappy +2025/12/01-00:28:55.786212 403546 Options.bottommost_compression: Disabled +2025/12/01-00:28:55.786216 403546 Options.prefix_extractor: 
nullptr +2025/12/01-00:28:55.786219 403546 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2025/12/01-00:28:55.786223 403546 Options.num_levels: 7 +2025/12/01-00:28:55.786227 403546 Options.min_write_buffer_number_to_merge: 1 +2025/12/01-00:28:55.786231 403546 Options.max_write_buffer_size_to_maintain: 0 +2025/12/01-00:28:55.786234 403546 Options.bottommost_compression_opts.window_bits: -14 +2025/12/01-00:28:55.786241 403546 Options.bottommost_compression_opts.level: 32767 +2025/12/01-00:28:55.786245 403546 Options.bottommost_compression_opts.strategy: 0 +2025/12/01-00:28:55.786249 403546 Options.bottommost_compression_opts.max_dict_bytes: 0 +2025/12/01-00:28:55.786252 403546 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2025/12/01-00:28:55.786257 403546 Options.bottommost_compression_opts.parallel_threads: 1 +2025/12/01-00:28:55.786260 403546 Options.bottommost_compression_opts.enabled: false +2025/12/01-00:28:55.786264 403546 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2025/12/01-00:28:55.786267 403546 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2025/12/01-00:28:55.786271 403546 Options.compression_opts.window_bits: -14 +2025/12/01-00:28:55.786274 403546 Options.compression_opts.level: 32767 +2025/12/01-00:28:55.786278 403546 Options.compression_opts.strategy: 0 +2025/12/01-00:28:55.786282 403546 Options.compression_opts.max_dict_bytes: 0 +2025/12/01-00:28:55.786285 403546 Options.compression_opts.zstd_max_train_bytes: 0 +2025/12/01-00:28:55.786289 403546 Options.compression_opts.use_zstd_dict_trainer: true +2025/12/01-00:28:55.786293 403546 Options.compression_opts.parallel_threads: 1 +2025/12/01-00:28:55.786296 403546 Options.compression_opts.enabled: false +2025/12/01-00:28:55.786300 403546 Options.compression_opts.max_dict_buffer_bytes: 0 +2025/12/01-00:28:55.786304 403546 Options.level0_file_num_compaction_trigger: 4 +2025/12/01-00:28:55.786308 403546 Options.level0_slowdown_writes_trigger: 
20 +2025/12/01-00:28:55.786312 403546 Options.level0_stop_writes_trigger: 36 +2025/12/01-00:28:55.786316 403546 Options.target_file_size_base: 67108864 +2025/12/01-00:28:55.786319 403546 Options.target_file_size_multiplier: 1 +2025/12/01-00:28:55.786323 403546 Options.max_bytes_for_level_base: 268435456 +2025/12/01-00:28:55.786327 403546 Options.level_compaction_dynamic_level_bytes: 1 +2025/12/01-00:28:55.786331 403546 Options.max_bytes_for_level_multiplier: 10.000000 +2025/12/01-00:28:55.786335 403546 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2025/12/01-00:28:55.786339 403546 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2025/12/01-00:28:55.786343 403546 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2025/12/01-00:28:55.786347 403546 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2025/12/01-00:28:55.786350 403546 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2025/12/01-00:28:55.786354 403546 Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2025/12/01-00:28:55.786358 403546 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2025/12/01-00:28:55.786362 403546 Options.max_sequential_skip_in_iterations: 8 +2025/12/01-00:28:55.786365 403546 Options.memtable_op_scan_flush_trigger: 0 +2025/12/01-00:28:55.786369 403546 Options.memtable_avg_op_scan_flush_trigger: 0 +2025/12/01-00:28:55.786373 403546 Options.max_compaction_bytes: 1677721600 +2025/12/01-00:28:55.786377 403546 Options.arena_block_size: 1048576 +2025/12/01-00:28:55.786381 403546 Options.soft_pending_compaction_bytes_limit: 68719476736 +2025/12/01-00:28:55.786385 403546 Options.hard_pending_compaction_bytes_limit: 274877906944 +2025/12/01-00:28:55.786388 403546 Options.disable_auto_compactions: 0 +2025/12/01-00:28:55.786395 403546 Options.compaction_style: kCompactionStyleLevel +2025/12/01-00:28:55.786400 403546 Options.compaction_pri: kMinOverlappingRatio +2025/12/01-00:28:55.786404 403546 Options.compaction_options_universal.size_ratio: 1 +2025/12/01-00:28:55.786408 403546 
Options.compaction_options_universal.min_merge_width: 2 +2025/12/01-00:28:55.786411 403546 Options.compaction_options_universal.max_merge_width: 4294967295 +2025/12/01-00:28:55.786418 403546 Options.compaction_options_universal.max_size_amplification_percent: 200 +2025/12/01-00:28:55.786421 403546 Options.compaction_options_universal.compression_size_percent: -1 +2025/12/01-00:28:55.786427 403546 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2025/12/01-00:28:55.786431 403546 Options.compaction_options_universal.max_read_amp: -1 +2025/12/01-00:28:55.786435 403546 Options.compaction_options_universal.reduce_file_locking: 0 +2025/12/01-00:28:55.786439 403546 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2025/12/01-00:28:55.786442 403546 Options.compaction_options_fifo.allow_compaction: 0 +2025/12/01-00:28:55.786453 403546 Options.table_properties_collectors: +2025/12/01-00:28:55.786457 403546 Options.inplace_update_support: 0 +2025/12/01-00:28:55.786460 403546 Options.inplace_update_num_locks: 10000 +2025/12/01-00:28:55.786464 403546 Options.memtable_prefix_bloom_size_ratio: 0.000000 +2025/12/01-00:28:55.786468 403546 Options.memtable_whole_key_filtering: 0 +2025/12/01-00:28:55.786472 403546 Options.memtable_huge_page_size: 0 +2025/12/01-00:28:55.786476 403546 Options.bloom_locality: 0 +2025/12/01-00:28:55.786480 403546 Options.max_successive_merges: 0 +2025/12/01-00:28:55.786483 403546 Options.strict_max_successive_merges: 0 +2025/12/01-00:28:55.786486 403546 Options.optimize_filters_for_hits: 0 +2025/12/01-00:28:55.786490 403546 Options.paranoid_file_checks: 0 +2025/12/01-00:28:55.786494 403546 Options.force_consistency_checks: 1 +2025/12/01-00:28:55.786498 403546 Options.report_bg_io_stats: 0 +2025/12/01-00:28:55.786501 403546 Options.disallow_memtable_writes: 0 +2025/12/01-00:28:55.786505 403546 Options.ttl: 2592000 +2025/12/01-00:28:55.786509 403546 Options.periodic_compaction_seconds: 0 
+2025/12/01-00:28:55.786513 403546 Options.default_temperature: kUnknown +2025/12/01-00:28:55.786517 403546 Options.preclude_last_level_data_seconds: 0 +2025/12/01-00:28:55.786520 403546 Options.preserve_internal_time_seconds: 0 +2025/12/01-00:28:55.786524 403546 Options.enable_blob_files: false +2025/12/01-00:28:55.786528 403546 Options.min_blob_size: 0 +2025/12/01-00:28:55.786532 403546 Options.blob_file_size: 268435456 +2025/12/01-00:28:55.786536 403546 Options.blob_compression_type: NoCompression +2025/12/01-00:28:55.786540 403546 Options.enable_blob_garbage_collection: false +2025/12/01-00:28:55.786544 403546 Options.blob_garbage_collection_age_cutoff: 0.250000 +2025/12/01-00:28:55.786547 403546 Options.blob_garbage_collection_force_threshold: 1.000000 +2025/12/01-00:28:55.786552 403546 Options.blob_compaction_readahead_size: 0 +2025/12/01-00:28:55.786555 403546 Options.blob_file_starting_level: 0 +2025/12/01-00:28:55.786559 403546 Options.experimental_mempurge_threshold: 0.000000 +2025/12/01-00:28:55.786563 403546 Options.memtable_max_range_deletions: 0 +2025/12/01-00:28:55.787467 403546 [db/version_set.cc:6172] Recovered from manifest file:data/MANIFEST-000001 succeeded,manifest_file_number is 1, next_file_number is 3, last_sequence is 0, log_number is 0,prev_log_number is 0,max_column_family is 0,min_log_number_to_keep is 0 +2025/12/01-00:28:55.787476 403546 [db/version_set.cc:6187] Column family [default] (ID 0), log number is 0 +2025/12/01-00:28:55.787481 403546 [db/db_impl/db_impl_open.cc:686] DB ID: 75e7dbab-ce09-4522-a98d-2fcf0772294c +2025/12/01-00:28:55.787610 403546 [db/version_set.cc:5630] Creating manifest 5 +2025/12/01-00:28:55.790293 403546 [db/column_family.cc:690] --------------- Options for column family [cas]: +2025/12/01-00:28:55.790302 403546 Options.comparator: leveldb.BytewiseComparator +2025/12/01-00:28:55.790306 403546 Options.merge_operator: None +2025/12/01-00:28:55.790310 403546 Options.compaction_filter: None 
+2025/12/01-00:28:55.790315 403546 Options.compaction_filter_factory: None +2025/12/01-00:28:55.790318 403546 Options.sst_partitioner_factory: None +2025/12/01-00:28:55.790322 403546 Options.memtable_factory: SkipListFactory +2025/12/01-00:28:55.790326 403546 Options.table_factory: BlockBasedTable +2025/12/01-00:28:55.790355 403546 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x555555ceacc0) + cache_index_and_filter_blocks: 0 + cache_index_and_filter_blocks_with_high_priority: 1 + pin_l0_filter_and_index_blocks_in_cache: 0 + pin_top_level_index_and_filter: 1 + index_type: 0 + data_block_index_type: 0 + index_shortening: 1 + data_block_hash_table_util_ratio: 0.750000 + checksum: 4 + no_block_cache: 0 + block_cache: 0x555555ceb020 + block_cache_name: LRUCache + block_cache_options: + capacity : 33554432 + num_shard_bits : 6 + strict_capacity_limit : 0 + memory_allocator : None + high_pri_pool_ratio: 0.500 + low_pri_pool_ratio: 0.000 + persistent_cache: (nil) + block_size: 4096 + block_size_deviation: 10 + block_restart_interval: 16 + index_block_restart_interval: 1 + metadata_block_size: 4096 + partition_filters: 0 + use_delta_encoding: 1 + filter_policy: nullptr + whole_key_filtering: 1 + verify_compression: 0 + read_amp_bytes_per_bit: 0 + format_version: 6 + enable_index_compression: 1 + block_align: 0 + max_auto_readahead_size: 262144 + prepopulate_block_cache: 0 + initial_auto_readahead_size: 8192 + num_file_reads_for_auto_readahead: 2 +2025/12/01-00:28:55.790361 403546 Options.write_buffer_size: 67108864 +2025/12/01-00:28:55.790366 403546 Options.max_write_buffer_number: 2 +2025/12/01-00:28:55.790371 403546 Options.compression: Snappy +2025/12/01-00:28:55.790376 403546 Options.bottommost_compression: Disabled +2025/12/01-00:28:55.790381 403546 Options.prefix_extractor: nullptr +2025/12/01-00:28:55.790386 403546 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2025/12/01-00:28:55.790390 403546 Options.num_levels: 7 
+2025/12/01-00:28:55.790394 403546 Options.min_write_buffer_number_to_merge: 1 +2025/12/01-00:28:55.790399 403546 Options.max_write_buffer_size_to_maintain: 0 +2025/12/01-00:28:55.790403 403546 Options.bottommost_compression_opts.window_bits: -14 +2025/12/01-00:28:55.790408 403546 Options.bottommost_compression_opts.level: 32767 +2025/12/01-00:28:55.790413 403546 Options.bottommost_compression_opts.strategy: 0 +2025/12/01-00:28:55.790417 403546 Options.bottommost_compression_opts.max_dict_bytes: 0 +2025/12/01-00:28:55.790421 403546 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2025/12/01-00:28:55.790427 403546 Options.bottommost_compression_opts.parallel_threads: 1 +2025/12/01-00:28:55.790431 403546 Options.bottommost_compression_opts.enabled: false +2025/12/01-00:28:55.790436 403546 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2025/12/01-00:28:55.790440 403546 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2025/12/01-00:28:55.790444 403546 Options.compression_opts.window_bits: -14 +2025/12/01-00:28:55.790448 403546 Options.compression_opts.level: 32767 +2025/12/01-00:28:55.790451 403546 Options.compression_opts.strategy: 0 +2025/12/01-00:28:55.790455 403546 Options.compression_opts.max_dict_bytes: 0 +2025/12/01-00:28:55.790459 403546 Options.compression_opts.zstd_max_train_bytes: 0 +2025/12/01-00:28:55.790463 403546 Options.compression_opts.use_zstd_dict_trainer: true +2025/12/01-00:28:55.790466 403546 Options.compression_opts.parallel_threads: 1 +2025/12/01-00:28:55.790470 403546 Options.compression_opts.enabled: false +2025/12/01-00:28:55.790474 403546 Options.compression_opts.max_dict_buffer_bytes: 0 +2025/12/01-00:28:55.790478 403546 Options.level0_file_num_compaction_trigger: 4 +2025/12/01-00:28:55.790482 403546 Options.level0_slowdown_writes_trigger: 20 +2025/12/01-00:28:55.790486 403546 Options.level0_stop_writes_trigger: 36 +2025/12/01-00:28:55.790489 403546 Options.target_file_size_base: 67108864 
+2025/12/01-00:28:55.790493 403546 Options.target_file_size_multiplier: 1 +2025/12/01-00:28:55.790497 403546 Options.max_bytes_for_level_base: 268435456 +2025/12/01-00:28:55.790500 403546 Options.level_compaction_dynamic_level_bytes: 1 +2025/12/01-00:28:55.790504 403546 Options.max_bytes_for_level_multiplier: 10.000000 +2025/12/01-00:28:55.790508 403546 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2025/12/01-00:28:55.790512 403546 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2025/12/01-00:28:55.790516 403546 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2025/12/01-00:28:55.790520 403546 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2025/12/01-00:28:55.790523 403546 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2025/12/01-00:28:55.790527 403546 Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2025/12/01-00:28:55.790531 403546 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2025/12/01-00:28:55.790535 403546 Options.max_sequential_skip_in_iterations: 8 +2025/12/01-00:28:55.790538 403546 Options.memtable_op_scan_flush_trigger: 0 +2025/12/01-00:28:55.790542 403546 Options.memtable_avg_op_scan_flush_trigger: 0 +2025/12/01-00:28:55.790546 403546 Options.max_compaction_bytes: 1677721600 +2025/12/01-00:28:55.790550 403546 Options.arena_block_size: 1048576 +2025/12/01-00:28:55.790553 403546 Options.soft_pending_compaction_bytes_limit: 68719476736 +2025/12/01-00:28:55.790558 403546 Options.hard_pending_compaction_bytes_limit: 274877906944 +2025/12/01-00:28:55.790561 403546 Options.disable_auto_compactions: 0 +2025/12/01-00:28:55.790566 403546 Options.compaction_style: kCompactionStyleLevel +2025/12/01-00:28:55.790571 403546 Options.compaction_pri: kMinOverlappingRatio +2025/12/01-00:28:55.790576 403546 Options.compaction_options_universal.size_ratio: 1 +2025/12/01-00:28:55.790581 403546 Options.compaction_options_universal.min_merge_width: 2 +2025/12/01-00:28:55.790585 403546 Options.compaction_options_universal.max_merge_width: 
4294967295 +2025/12/01-00:28:55.790590 403546 Options.compaction_options_universal.max_size_amplification_percent: 200 +2025/12/01-00:28:55.790596 403546 Options.compaction_options_universal.compression_size_percent: -1 +2025/12/01-00:28:55.790602 403546 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2025/12/01-00:28:55.790607 403546 Options.compaction_options_universal.max_read_amp: -1 +2025/12/01-00:28:55.790612 403546 Options.compaction_options_universal.reduce_file_locking: 0 +2025/12/01-00:28:55.790617 403546 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2025/12/01-00:28:55.790621 403546 Options.compaction_options_fifo.allow_compaction: 0 +2025/12/01-00:28:55.790630 403546 Options.table_properties_collectors: +2025/12/01-00:28:55.790635 403546 Options.inplace_update_support: 0 +2025/12/01-00:28:55.790640 403546 Options.inplace_update_num_locks: 10000 +2025/12/01-00:28:55.790644 403546 Options.memtable_prefix_bloom_size_ratio: 0.000000 +2025/12/01-00:28:55.790650 403546 Options.memtable_whole_key_filtering: 0 +2025/12/01-00:28:55.790655 403546 Options.memtable_huge_page_size: 0 +2025/12/01-00:28:55.790660 403546 Options.bloom_locality: 0 +2025/12/01-00:28:55.790665 403546 Options.max_successive_merges: 0 +2025/12/01-00:28:55.790670 403546 Options.strict_max_successive_merges: 0 +2025/12/01-00:28:55.790675 403546 Options.optimize_filters_for_hits: 0 +2025/12/01-00:28:55.790680 403546 Options.paranoid_file_checks: 0 +2025/12/01-00:28:55.790684 403546 Options.force_consistency_checks: 1 +2025/12/01-00:28:55.790690 403546 Options.report_bg_io_stats: 0 +2025/12/01-00:28:55.790694 403546 Options.disallow_memtable_writes: 0 +2025/12/01-00:28:55.790699 403546 Options.ttl: 2592000 +2025/12/01-00:28:55.790704 403546 Options.periodic_compaction_seconds: 0 +2025/12/01-00:28:55.790709 403546 Options.default_temperature: kUnknown +2025/12/01-00:28:55.790713 403546 Options.preclude_last_level_data_seconds: 0 
+2025/12/01-00:28:55.790718 403546 Options.preserve_internal_time_seconds: 0 +2025/12/01-00:28:55.790723 403546 Options.enable_blob_files: false +2025/12/01-00:28:55.790726 403546 Options.min_blob_size: 0 +2025/12/01-00:28:55.790730 403546 Options.blob_file_size: 268435456 +2025/12/01-00:28:55.790734 403546 Options.blob_compression_type: NoCompression +2025/12/01-00:28:55.790738 403546 Options.enable_blob_garbage_collection: false +2025/12/01-00:28:55.790742 403546 Options.blob_garbage_collection_age_cutoff: 0.250000 +2025/12/01-00:28:55.790745 403546 Options.blob_garbage_collection_force_threshold: 1.000000 +2025/12/01-00:28:55.790750 403546 Options.blob_compaction_readahead_size: 0 +2025/12/01-00:28:55.790753 403546 Options.blob_file_starting_level: 0 +2025/12/01-00:28:55.790757 403546 Options.experimental_mempurge_threshold: 0.000000 +2025/12/01-00:28:55.790761 403546 Options.memtable_max_range_deletions: 0 +2025/12/01-00:28:55.790849 403546 [db/db_impl/db_impl.cc:3674] Created column family [cas] (ID 1) +2025/12/01-00:28:55.793552 403546 [db/db_impl/db_impl_open.cc:2622] SstFileManager instance 0x555555cf2ad0 +2025/12/01-00:28:55.793697 403546 DB pointer 0x555555cf7a00 +2025/12/01-00:28:55.793973 403600 [db/db_impl/db_impl.cc:1116] ------- DUMPING STATS ------- +2025/12/01-00:28:55.793986 403600 [db/db_impl/db_impl.cc:1118] +** DB Stats ** +Uptime(secs): 0.0 total, 0.0 interval +Cumulative writes: 0 writes, 0 keys, 0 commit groups, 0.0 writes per commit group, ingest: 0.00 GB, 0.00 MB/s +Cumulative WAL: 0 writes, 0 syncs, 0.00 writes per sync, written: 0.00 GB, 0.00 MB/s +Cumulative stall: 00:00:0.000 H:M:S, 0.0 percent +Interval writes: 0 writes, 0 keys, 0 commit groups, 0.0 writes per commit group, ingest: 0.00 MB, 0.00 MB/s +Interval WAL: 0 writes, 0 syncs, 0.00 writes per sync, written: 0.00 GB, 0.00 MB/s +Interval stall: 00:00:0.000 H:M:S, 0.0 percent +Write Stall (count): write-buffer-manager-limit-stops: 0 + +** Compaction Stats [default] ** +Level Files 
Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sum 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0 0.000 0 0 0.0 0.0 + Int 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0 0.000 0 0 0.0 0.0 + +** Compaction Stats [default] ** +Priority Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Blob file count: 0, total size: 0.0 GB, garbage size: 0.0 GB, space amp: 0.0 + +Uptime(secs): 0.0 total, 0.0 interval +Flush(GB): cumulative 0.000, interval 0.000 +AddFile(GB): cumulative 0.000, interval 0.000 +AddFile(Total Files): cumulative 0, interval 0 +AddFile(L0 Files): cumulative 0, interval 0 +AddFile(Keys): cumulative 0, interval 0 +Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds +Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds +Estimated pending compaction bytes: 0 +Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 +Block cache LRUCache@0x555555ceb020#403546 
capacity: 32.00 MB seed: 959817517 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 1 last_secs: 4.9e-05 secs_since: 0 +Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) + +** File Read Latency Histogram By Level [default] ** + +** Compaction Stats [cas] ** +Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sum 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0 0.000 0 0 0.0 0.0 + Int 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0 0.000 0 0 0.0 0.0 + +** Compaction Stats [cas] ** +Priority Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Blob file count: 0, total size: 0.0 GB, garbage size: 0.0 GB, space amp: 0.0 + +Uptime(secs): 0.0 total, 0.0 interval +Flush(GB): cumulative 0.000, interval 0.000 +AddFile(GB): cumulative 0.000, interval 0.000 +AddFile(Total Files): cumulative 0, interval 0 +AddFile(L0 Files): cumulative 0, interval 0 +AddFile(Keys): cumulative 0, interval 0 +Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds +Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds +Estimated pending compaction bytes: 0 +Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, 
cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 +Block cache LRUCache@0x555555ceb020#403546 capacity: 32.00 MB seed: 959817517 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 1 last_secs: 4.9e-05 secs_since: 0 +Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) + +** File Read Latency Histogram By Level [cas] ** diff --git a/flaredb/data/MANIFEST-000005 b/flaredb/data/MANIFEST-000005 new file mode 100644 index 0000000000000000000000000000000000000000..aeeb145c9d3c5ba2303c93325c28b877fe13fd33 GIT binary patch literal 170 zcmd;G`HNAVfw9p+#oRR2JS8bHNjEvwz*5)5)W}FT(b6JC*C;JH&A{B;$jH(pnN?fE znSp`P)WN<(fq{`xDkrrpH76xW&#AH`wLG&p)j2=6Ah9U1B)_QBfsyeGGfxx?10xdy q3&S5BwV!MZjGPS2Y%C0OZmyBk$FH4<1*Gr/dev/null 2>&1; then + exec nix develop -c "$0" "$@" +fi + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]]; then + kill "$SERVER_PID" >/dev/null 2>&1 || true + fi + if [[ -n "${PD_PID:-}" ]]; then + kill "$PD_PID" >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +echo "Building workspace..." +cargo build + +echo "Running tests..." +cargo test + +echo "Starting PD..." +cargo run --bin rdb-pd -- --addr 127.0.0.1:2379 >/tmp/rdb-pd.log 2>&1 & +PD_PID=$! +sleep 2 + +echo "Starting Server..." +cargo run --bin rdb-server -- --pd-addr 127.0.0.1:2379 --addr 127.0.0.1:50051 --data-dir /tmp/rdb-server >/tmp/rdb-server.log 2>&1 & +SERVER_PID=$! +sleep 2 + +echo "Running Client Verification..." + +echo "Testing TSO..." +cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2379 --addr 127.0.0.1:50051 tso + +echo "Testing Raw Put/Get..." 
+cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2379 --addr 127.0.0.1:50051 raw-put --key foo --value bar +cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2379 --addr 127.0.0.1:50051 raw-get --key foo + +echo "Testing CAS success..." +cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2379 --addr 127.0.0.1:50051 cas --key cas1 --value v1 --expected 0 + +echo "Testing CAS conflict..." +set +e +cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2379 --addr 127.0.0.1:50051 cas --key cas1 --value v2 --expected 0 +set -e + +echo "Verification Complete!" diff --git a/flaredb/scripts/verify-multiraft.sh b/flaredb/scripts/verify-multiraft.sh new file mode 100644 index 0000000..8e2e49d --- /dev/null +++ b/flaredb/scripts/verify-multiraft.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Run key Multi-Raft test suites. +echo "[verify] Running multi-region routing tests..." +nix develop -c cargo test -q rdb-server::tests::test_multi_region + +echo "[verify] Running split tests..." +nix develop -c cargo test -q rdb-server::tests::test_split + +echo "[verify] Running confchange/move tests..." +nix develop -c cargo test -q rdb-server::tests::test_confchange_move + +echo "[verify] Done." diff --git a/flaredb/scripts/verify-raft.sh b/flaredb/scripts/verify-raft.sh new file mode 100755 index 0000000..4e22844 --- /dev/null +++ b/flaredb/scripts/verify-raft.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +export LIBCLANG_PATH=${LIBCLANG_PATH:-/nix/store/0zn99g048j67syaq97rczq5z0j8dsvc8-clang-21.1.2-lib/lib} + +echo "[verify] formatting..." +cargo fmt --all + +echo "[verify] running rdb-server tests..." +nix-shell -p protobuf --run "LIBCLANG_PATH=${LIBCLANG_PATH} cargo test -p rdb-server --tests" + +echo "[verify] done." 
diff --git a/flaredb/scripts/verify-sharding.sh b/flaredb/scripts/verify-sharding.sh new file mode 100755 index 0000000..20d9c2d --- /dev/null +++ b/flaredb/scripts/verify-sharding.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +set -e + +echo "Building workspace..." +cargo build + +echo "Starting PD..." +cargo run --bin rdb-pd & +PD_PID=$! +sleep 2 + +echo "Starting Server 1 (127.0.0.1:50001, data1)..." +# Port 50001 +cargo run --bin rdb-server -- --addr 127.0.0.1:50001 --data-dir data1 --pd-addr 127.0.0.1:2379 & +S1_PID=$! + +echo "Starting Server 2 (127.0.0.1:50002, data2)..." +# Port 50002 +cargo run --bin rdb-server -- --addr 127.0.0.1:50002 --data-dir data2 --pd-addr 127.0.0.1:2379 & +S2_PID=$! + +sleep 5 # Wait for registration + +echo "Running Client Verification (Sharding)..." + +# Put 'a' (Should go to S1) +echo "Testing Put 'a'..." +cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2379 raw-put --key a --value val_a + +# Put 'z' (Should go to S2) +echo "Testing Put 'z'..." +cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2379 raw-put --key z --value val_z + +# Cleanup +kill $PD_PID +kill $S1_PID +kill $S2_PID +rm -rf data1 data2 + +echo "Sharding Verification Complete!" 
diff --git a/flaredb/specs/001-distributed-core/checklists/requirements.md b/flaredb/specs/001-distributed-core/checklists/requirements.md new file mode 100644 index 0000000..7edb6d5 --- /dev/null +++ b/flaredb/specs/001-distributed-core/checklists/requirements.md @@ -0,0 +1,34 @@ +# Specification Quality Checklist: Core Distributed Architecture (Phase 1) + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2025-11-30 +**Feature**: specs/001-distributed-core/spec.md + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) - *Exception: Specific Rust/RocksDB constraints are part of the user request/architecture definition.* +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders - *Target audience is database developers.* +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic - *Allowed tech-specifics due to nature of task.* +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified - *Implicit in CAS failure scenarios.* +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification - *See above exception.* + +## Notes + +- The specification heavily references technical components (RocksDB, Cargo, gRPC) because the "Feature" is literally "Implement the Core Architecture". This is acceptable for this specific foundational task. 
diff --git a/flaredb/specs/001-distributed-core/contracts/kvrpc.proto b/flaredb/specs/001-distributed-core/contracts/kvrpc.proto new file mode 100644 index 0000000..408a08e --- /dev/null +++ b/flaredb/specs/001-distributed-core/contracts/kvrpc.proto @@ -0,0 +1,55 @@ +syntax = "proto3"; + +package kvrpc; + +// Raw (Eventual Consistency) Operations +service KvRaw { + rpc RawPut(RawPutRequest) returns (RawPutResponse); + rpc RawGet(RawGetRequest) returns (RawGetResponse); +} + +message RawPutRequest { + bytes key = 1; + bytes value = 2; +} + +message RawPutResponse { + bool success = 1; +} + +message RawGetRequest { + bytes key = 1; +} + +message RawGetResponse { + bool found = 1; + bytes value = 2; +} + +// CAS (Strong Consistency / Optimistic) Operations +service KvCas { + rpc CompareAndSwap(CasRequest) returns (CasResponse); + rpc Get(GetRequest) returns (GetResponse); +} + +message CasRequest { + bytes key = 1; + bytes value = 2; + uint64 expected_version = 3; // 0 implies "create if not exists" +} + +message CasResponse { + bool success = 1; + uint64 current_version = 2; // Returns current version on failure (for retry) + uint64 new_version = 3; // Returns assigned version on success +} + +message GetRequest { + bytes key = 1; +} + +message GetResponse { + bool found = 1; + bytes value = 2; + uint64 version = 3; +} diff --git a/flaredb/specs/001-distributed-core/contracts/pdpb.proto b/flaredb/specs/001-distributed-core/contracts/pdpb.proto new file mode 100644 index 0000000..fbed2a2 --- /dev/null +++ b/flaredb/specs/001-distributed-core/contracts/pdpb.proto @@ -0,0 +1,56 @@ +syntax = "proto3"; + +package pdpb; + +// TSO Service +service Tso { + rpc GetTimestamp(TsoRequest) returns (TsoResponse); +} + +message TsoRequest { + uint32 count = 1; +} + +message TsoResponse { + uint64 timestamp = 1; // Physical << 16 | Logical + uint32 count = 2; +} + +// Cluster Management Service +service Pd { + // Store Registration + rpc RegisterStore(RegisterStoreRequest) returns 
(RegisterStoreResponse); + + // Region Discovery + rpc GetRegion(GetRegionRequest) returns (GetRegionResponse); +} + +message RegisterStoreRequest { + string addr = 1; // e.g., "127.0.0.1:50051" +} + +message RegisterStoreResponse { + uint64 store_id = 1; + uint64 cluster_id = 2; // Verify cluster match +} + +message GetRegionRequest { + bytes key = 1; +} + +message GetRegionResponse { + Region region = 1; + Store leader = 2; +} + +message Region { + uint64 id = 1; + bytes start_key = 2; + bytes end_key = 3; // empty = infinity + // In future: repeated Peer peers = 4; +} + +message Store { + uint64 id = 1; + string addr = 2; +} diff --git a/flaredb/specs/001-distributed-core/data-model.md b/flaredb/specs/001-distributed-core/data-model.md new file mode 100644 index 0000000..0386dbd --- /dev/null +++ b/flaredb/specs/001-distributed-core/data-model.md @@ -0,0 +1,52 @@ +# Data Model: Core Distributed Architecture (Phase 1) + +## Entities + +### 1. Key-Value Pair (Raw) +- **Key**: `Vec<u8>` (Arbitrary bytes) +- **Value**: `Vec<u8>` (Arbitrary bytes) +- **Scope**: `rdb-storage` (Raw Put) + +### 2. Key-Value Pair (Versioned / CAS) +- **Key**: `Vec<u8>` +- **Value**: `Vec<u8>` (Metadata + Payload) +- **Version**: `u64` (Monotonic sequence) +- **Scope**: `rdb-storage` (CAS) + +### 3. TSO Timestamp +- **Physical**: `u64` (48 bits, milliseconds) +- **Logical**: `u64` (16 bits, counter) +- **Combined**: `u64` (Physical << 16 | Logical) +- **Scope**: `rdb-pd` + +## State Transitions (CAS) + +1. **Empty -> Created**: + - Current Version: 0 (or None) + - Expected Version: 0 + - New Version: TSO / Sequence > 0 + - Result: Success + +2. **Updated -> Updated**: + - Current Version: N + - Expected Version: N + - New Version: M (M > N) + - Result: Success + +3. **Conflict**: + - Current Version: N + - Expected Version: M (M != N) + - Result: Failure (Returns N) + +## Storage Schema (RocksDB Column Families) + +1. **default** (`CF_DEFAULT`): + - Stores data for Raw Puts. 
+ - Key: `Key` + - Value: `Value` + +2. **cas** (`CF_CAS` - *Proposed name for CAS data separation*): + - Stores versioned data. + - Key: `Key` + - Value: `[Version: 8 bytes][Data...]` + - *Note: Storing version in value simplifies atomic update via Read-Modify-Write or MergeOperator.* diff --git a/flaredb/specs/001-distributed-core/plan.md b/flaredb/specs/001-distributed-core/plan.md new file mode 100644 index 0000000..e476221 --- /dev/null +++ b/flaredb/specs/001-distributed-core/plan.md @@ -0,0 +1,95 @@ +# Implementation Plan: Core Distributed Architecture (Phase 1) + +**Branch**: `001-distributed-core` | **Date**: 2025-11-30 | **Spec**: [specs/001-distributed-core/spec.md](specs/001-distributed-core/spec.md) +**Input**: Feature specification from `/specs/001-distributed-core/spec.md` + +**Note**: This template is filled in by the `/speckit.plan` command. See `.specify/templates/commands/plan.md` for the execution workflow. + +## Summary + +Implement the foundational architecture for FlareDB, a distributed key-value store with CAS support. This includes setting up a Rust Cargo Workspace with 5 crates (`rdb-proto`, `rdb-storage`, `rdb-server`, `rdb-pd`, `rdb-client`), defining gRPC interfaces, implementing a RocksDB-based local storage engine, and verifying basic client-server interaction. + +## Technical Context + +**Language/Version**: Rust (Latest Stable) +**Primary Dependencies**: +- `tonic` (gRPC) +- `prost` (Protobuf) +- `rocksdb` (Storage Engine) +- `tokio` (Async Runtime) +- `clap` (CLI) +**Storage**: RocksDB (embedded via crate) +**Testing**: `cargo test` (Unit), `cargo nextest` (Optional), Custom Integration Scripts +**Target Platform**: Linux (x86_64), managed via Nix Flake +**Project Type**: Rust Cargo Workspace (Monorepo) with Nix environment +**Performance Goals**: Low-latency CAS operations (local storage baseline) +**Constraints**: Single-node verification for Phase 1, but architecture must support distributed extension. 
+**Scale/Scope**: 5 crates, ~2000 LOC estimate. + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +- **I. Reliability & Testing**: + - Plan includes unit tests for `rdb-storage` (SC-002). + - Plan includes integration verification (SC-003). + - Compliant. +- **II. Agility & Evolution**: + - Architecture uses standard crates (`tonic`, `rocksdb`) to avoid reinventing wheels. + - Monorepo structure allows easy refactoring across crates. + - Compliant. +- **III. Simplicity & Readability**: + - Separation of concerns: Proto vs Storage vs Server vs PD vs Client. + - Clear interfaces defined in `rdb-proto`. + - Compliant. + +## Project Structure + +### Documentation (this feature) + +```text +specs/001-distributed-core/ +├── plan.md # This file (/speckit.plan command output) +├── research.md # Phase 0 output (/speckit.plan command) +├── data-model.md # Phase 1 output (/speckit.plan command) +├── quickstart.md # Phase 1 output (/speckit.plan command) +├── contracts/ # Phase 1 output (/speckit.plan command) +└── tasks.md # Phase 2 output (/speckit.tasks command - NOT created by /speckit.plan) +``` + +### Source Code (repository root) + +```text +flake.nix # Nix development environment definition +flake.lock # Lockfile for Nix dependencies +Cargo.toml # Workspace definition +rdb-proto/ +├── Cargo.toml +├── build.rs +└── src/ # Generated protos +rdb-storage/ +├── Cargo.toml +└── src/ # RocksDB wrapper, CAS logic +rdb-server/ +├── Cargo.toml +└── src/ # gRPC Server, Handlers +rdb-pd/ +├── Cargo.toml +└── src/ # Placement Driver (TSO) +rdb-client/ +├── Cargo.toml +└── src/ # Smart SDK +rdb-cli/ # (Optional for Phase 1, but good to have) +├── Cargo.toml +└── src/ +``` + +**Structure Decision**: Standard Rust Workspace layout to ensure modularity and separation of concerns as per the architecture design. 
+ +## Complexity Tracking + +> **Fill ONLY if Constitution Check has violations that must be justified** + +| Violation | Why Needed | Simpler Alternative Rejected Because | +|-----------|------------|-------------------------------------| +| N/A | | | diff --git a/flaredb/specs/001-distributed-core/quickstart.md b/flaredb/specs/001-distributed-core/quickstart.md new file mode 100644 index 0000000..20152a6 --- /dev/null +++ b/flaredb/specs/001-distributed-core/quickstart.md @@ -0,0 +1,64 @@ +# Quickstart Verification Guide: Core Distributed Architecture + +This guide verifies the core components (PD, Server, Client) and storage engine behavior. + +## Prerequisites + +- Rust Toolchain (`rustc`, `cargo`) +- `protoc` (Protocol Buffers compiler) +- CMake (for building RocksDB) + +## 1. Build Workspace + +```bash +cargo build +``` + +## 2. Run Integration Test + +This feature includes a comprehensive integration test script. + +```bash +# Run the custom verification script (to be implemented in tasks) +# ./scripts/verify-core.sh +``` + +## 3. Manual Verification Steps + +### A. Start PD (Placement Driver) + +```bash +cargo run --bin rdb-pd +# Should listen on default port (e.g., 2379) +``` + +### B. Start Server (Storage Node) + +```bash +cargo run --bin rdb-server -- --pd-addr 127.0.0.1:2379 +# Should listen on default port (e.g., 50051) +``` + +### C. Run Client Operations + +```bash +# Get TSO +cargo run --bin rdb-client -- tso +# Output: Timestamp: 1735689... + +# Raw Put +cargo run --bin rdb-client -- raw-put --key foo --value bar +# Output: Success + +# Raw Get +cargo run --bin rdb-client -- raw-get --key foo +# Output: bar + +# CAS (Create) +cargo run --bin rdb-client -- cas --key meta1 --value "{json}" --expected 0 +# Output: Success, Version: 1735689... + +# CAS (Conflict) +cargo run --bin rdb-client -- cas --key meta1 --value "{new}" --expected 0 +# Output: Conflict! Current Version: 1735689... 
+``` diff --git a/flaredb/specs/001-distributed-core/research.md b/flaredb/specs/001-distributed-core/research.md new file mode 100644 index 0000000..824debe --- /dev/null +++ b/flaredb/specs/001-distributed-core/research.md @@ -0,0 +1,19 @@ +# Research: Core Distributed Architecture (Phase 1) + +**Decision**: Use `rocksdb` crate for local storage engine. +**Rationale**: Industry standard for LSM-tree storage. Provides necessary primitives (WriteBatch, Column Families) for building a KV engine. `tikv/rust-rocksdb` is the most mature binding. +**Alternatives considered**: `sled` (pure Rust, but less mature/performant for this scale), `mdbx` (B-tree, read-optimized, not suitable for high write throughput target). + +**Decision**: Use `tonic` + `prost` for gRPC. +**Rationale**: De facto standard in Rust ecosystem. Async-first, integrates perfectly with `tokio`. +**Alternatives considered**: `grpc-rs` (C-core wrapper, complex build), `tarpc` (Rust-specific, less interoperable). + +**Decision**: Use `tokio` as async runtime. +**Rationale**: Required by `tonic`. Most mature ecosystem. + +**Decision**: Monorepo Workspace Structure. +**Rationale**: Allows atomic commits across protocol, server, and client. Simplifies dependency management during rapid early development (Agility Principle). + +## Clarification Resolution + +*No [NEEDS CLARIFICATION] items were present in the spec. Technical context was sufficiently defined in the chat history.* diff --git a/flaredb/specs/001-distributed-core/spec.md b/flaredb/specs/001-distributed-core/spec.md new file mode 100644 index 0000000..a1faf95 --- /dev/null +++ b/flaredb/specs/001-distributed-core/spec.md @@ -0,0 +1,87 @@ +# Feature Specification: Core Distributed Architecture (Phase 1) + +**Feature Branch**: `001-distributed-core` +**Created**: 2025-11-30 +**Status**: Draft +**Input**: User description: "Implement the core architecture of FlareDB based on the design in chat.md..." 
+ +## User Scenarios & Testing *(mandatory)* + + + +### User Story 1 - Core Storage Engine Verification (Priority: P1) + +As a database developer, I need a robust local storage engine that supports both CAS (Compare-And-Swap) and Raw writes, so that I can build distributed logic on top of it. + +**Why this priority**: This is the fundamental layer. Without a working storage engine with correct CAS logic, upper layers cannot function. + +**Independent Test**: Write a Rust unit test using `rdb-storage` that: +1. Creates a DB instance. +2. Performs a `raw_put`. +3. Performs a `compare_and_swap` that succeeds. +4. Performs a `compare_and_swap` that fails due to version mismatch. + +**Acceptance Scenarios**: + +1. **Given** an empty DB, **When** I `raw_put` key="k1", val="v1", **Then** `get` returns "v1". +2. **Given** key="k1" with version 0 (non-existent), **When** I `cas` with expected=0, **Then** write succeeds and version increments. +3. **Given** key="k1" with version 10, **When** I `cas` with expected=5, **Then** it returns a Conflict error with current version 10. + +--- + +### User Story 2 - Basic RPC Transport (Priority: P1) + +As a client developer, I want to connect to the server via gRPC and perform basic operations, so that I can verify the communication pipeline. + +**Why this priority**: Validates the network layer (`rdb-proto`, `tonic` integration) and the basic server shell. + +**Independent Test**: Start `rdb-server` and run a minimal `rdb-client` script that connects and sends a request. + +**Acceptance Scenarios**: + +1. **Given** a running `rdb-server`, **When** `rdb-client` sends a `GetTsoRequest` to PD (mocked or real), **Then** it receives a valid timestamp. +2. **Given** a running `rdb-server`, **When** `rdb-client` sends a `RawPutRequest`, **Then** the server accepts it and it persists to disk. 
+ +--- + +### User Story 3 - Placement Driver TSO (Priority: P2) + +As a system, I need a source of monotonic timestamps (TSO) from `rdb-pd`, so that I can order transactions in the future. + +**Why this priority**: Essential for the "Smart Client" architecture and future MVCC/CAS logic. + +**Independent Test**: Run `rdb-pd` and hammer it with TSO requests from multiple threads. + +**Acceptance Scenarios**: + +1. **Given** a running `rdb-pd`, **When** I request timestamps repeatedly, **Then** each returned timestamp is strictly greater than the previous one. + +--- + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: The project MUST be organized as a Cargo Workspace with members: `rdb-proto`, `rdb-storage`, `rdb-server`, `rdb-pd`, `rdb-client`. +- **FR-002**: `rdb-proto` MUST define gRPC services (`kvrpc.proto`, `pdpb.proto`) covering CAS, Raw Put, and TSO operations. +- **FR-003**: `rdb-storage` MUST wrap RocksDB and expose `compare_and_swap(key, expected_ver, new_val)` and `put_raw(key, val)`. +- **FR-004**: `rdb-storage` MUST store metadata (version) and data efficiently using Column Families: `default` (raw), `cas` (value as `[u64_be version][bytes value]`), and `raft_log`/`raft_state` for Raft metadata. +- **FR-005**: `rdb-pd` MUST implement a TSO (Timestamp Oracle) service providing unique, monotonic `u64` timestamps. +- **FR-006**: `rdb-server` MUST implement the gRPC handlers defined in `rdb-proto` and delegate to `rdb-storage`. +- **FR-007**: `rdb-client` MUST provide a Rust API that abstracts the gRPC calls for `cas_put`, `raw_put`, and `get`. + +### Key Entities + +- **Region**: A logical range of keys (for future sharding). +- **Version**: A `u64` representing the modification timestamp/sequence of a key. +- **TSO**: Global Timestamp Oracle. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: Full workspace compiles with `cargo build`. 
+- **SC-002**: `rdb-storage` unit tests pass covering CAS success/failure paths. +- **SC-003**: Integration script (`scripts/verify-core.sh`) or equivalent CI step runs end-to-end: start PD and Server, client obtains TSO, performs RawPut and RawGet (value must match), performs CAS success and CAS conflict, exits 0. diff --git a/flaredb/specs/001-distributed-core/tasks.md b/flaredb/specs/001-distributed-core/tasks.md new file mode 100644 index 0000000..1b35f52 --- /dev/null +++ b/flaredb/specs/001-distributed-core/tasks.md @@ -0,0 +1,220 @@ +--- +description: "Task list template for feature implementation" +--- + +# Tasks: Core Distributed Architecture (Phase 1) + +**Input**: Design documents from `/specs/001-distributed-core/` +**Prerequisites**: plan.md (required), spec.md (required for user stories), research.md, data-model.md, contracts/ + +**Tests**: The examples below include test tasks. Tests are STANDARD per the Constitution (Principle I). Include them for all functional logic unless explicitly skipped. + +**Organization**: Tasks are grouped by user story to enable independent implementation and testing of each story. + +## Format: `[ID] [P?] 
[Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (e.g., US1, US2, US3) +- Include exact file paths in descriptions + +## Path Conventions + +- **Single project**: `src/`, `tests/` at repository root +- **Web app**: `backend/src/`, `frontend/src/` +- **Mobile**: `api/src/`, `ios/src/` or `android/src/` +- Paths shown below assume single project - adjust based on plan.md structure + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Project initialization and basic structure with Nix environment + +- [X] T000 Create `flake.nix` to provide rust, protobuf, clang, and rocksdb dependencies +- [X] T001 Create Cargo workspace in `Cargo.toml` with 5 crates: `rdb-proto`, `rdb-storage`, `rdb-server`, `rdb-pd`, `rdb-client`, `rdb-cli` +- [X] T002 Initialize `rdb-proto` crate with `tonic-build` and `prost` dependencies in `rdb-proto/Cargo.toml` +- [X] T003 [P] Initialize `rdb-storage` crate with `rocksdb` dependency in `rdb-storage/Cargo.toml` +- [X] T004 [P] Initialize `rdb-server`, `rdb-pd`, `rdb-client` crates with `tokio` and `tonic` dependencies + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Core infrastructure that MUST be complete before ANY user story can be implemented + +**⚠️ CRITICAL**: No user story work can begin until this phase is complete + +- [X] T005 Create `kvrpc.proto` in `rdb-proto/src/kvrpc.proto` per contract definition +- [X] T006 Create `pdpb.proto` in `rdb-proto/src/pdpb.proto` per contract definition +- [X] T007 Implement `build.rs` in `rdb-proto/build.rs` to compile protos +- [X] T008 Export generated protos in `rdb-proto/src/lib.rs` + +**Checkpoint**: Foundation ready - user story implementation can now begin in parallel + +--- + +## Phase 3: User Story 1 - Core Storage Engine Verification (Priority: P1) 🎯 MVP + +**Goal**: A robust local storage engine (RocksDB wrapper) with correct CAS logic. 
+ +**Independent Test**: Run unit tests in `rdb-storage` covering Raw Put and CAS success/conflict scenarios. + +### Tests for User Story 1 (STANDARD - per constitution) ⚠️ + +> **NOTE**: Write these tests FIRST, ensure they FAIL before implementation + +- [X] T009 [US1] Create unit tests for `StorageEngine::put_raw` in `rdb-storage/src/engine.rs` +- [X] T010 [US1] Create unit tests for `StorageEngine::compare_and_swap` (success/fail) in `rdb-storage/src/engine.rs` + +### Implementation for User Story 1 + +- [X] T011 [US1] Implement `StorageEngine` trait definition in `rdb-storage/src/lib.rs` +- [X] T012 [US1] Implement `RocksEngine` struct wrapping RocksDB in `rdb-storage/src/rocks_engine.rs` +- [X] T013 [US1] Implement `put_raw` using `CF_DEFAULT` in `rdb-storage/src/rocks_engine.rs` +- [X] T014 [US1] Implement `compare_and_swap` using RocksDB transaction/merge in `rdb-storage/src/rocks_engine.rs` +- [X] T015 [US1] Verify all tests pass + +**Checkpoint**: At this point, User Story 1 should be fully functional and testable independently + +--- + +## Phase 4: User Story 2 - Basic RPC Transport (Priority: P1) + +**Goal**: Verify gRPC communication pipeline between Client and Server. + +**Independent Test**: Run `rdb-server` and connect with a minimal `rdb-client`. 
+ +### Tests for User Story 2 (STANDARD - per constitution) ⚠️ + +- [X] T016 [P] [US2] Create integration test `tests/test_rpc_connect.rs` in `rdb-client` to verify connection + +### Implementation for User Story 2 + +- [X] T017 [P] [US2] Implement `KvService` gRPC handler in `rdb-server/src/service.rs` delegating to storage +- [X] T018 [P] [US2] Implement gRPC server startup in `rdb-server/src/main.rs` +- [X] T019 [US2] Implement `RdbClient` struct wrapping `tonic::transport::Channel` in `rdb-client/src/client.rs` +- [X] T020 [US2] Implement `raw_put` and `cas` methods in `RdbClient` calling gRPC +- [X] T021 [US2] Verify integration test passes + +**Checkpoint**: At this point, User Stories 1 AND 2 should both work independently + +--- + +## Phase 5: User Story 3 - Placement Driver TSO (Priority: P2) + +**Goal**: Source of monotonic timestamps (TSO). + +**Independent Test**: Run `rdb-pd` and verify monotonic TSO generation. + +### Tests for User Story 3 (STANDARD - per constitution) ⚠️ + +- [X] T022 [P] [US3] Create unit test for `TsoOracle` in `rdb-pd/src/tso.rs` + +### Implementation for User Story 3 + +- [X] T023 [P] [US3] Implement `TsoOracle` logic (monotonic u64) in `rdb-pd/src/tso.rs` +- [X] T024 [US3] Implement `TsoService` gRPC handler in `rdb-pd/src/service.rs` +- [X] T025 [US3] Implement PD server startup in `rdb-pd/src/main.rs` +- [X] T026 [US3] Add `get_tso` method to `RdbClient` in `rdb-client/src/client.rs` + +**Checkpoint**: All user stories should now be independently functional + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Improvements that affect multiple user stories + +- [X] T027 Create `scripts/verify-core.sh` for comprehensive integration verification +- [X] T028 Run `quickstart.md` verification steps manually +- [X] T029 Format code with `cargo fmt` and lint with `cargo clippy` + +--- + +## Phase 7: RPC Get & Raft Enhancements + +**Purpose**: Complete client/server Get coverage and initial Raft persistence surface + 
+- [X] T030 [US2] Implement and verify server Get path returning value+version via CAS CF in `rdb-server/src/service.rs` +- [X] T031 [US2] Implement client `raw_get`/`get` APIs and CLI with integration test in `rdb-client` +- [X] T032 [US2] Add integration test covering Get (RawGet + CAS Get) in `rdb-client/tests` +- [X] T033 [P] Add Raft log/HardState/ConfState persistence and wire Raft service to peer dispatch in `rdb-server` (single-region, single-node baseline) + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies - can start immediately +- **Foundational (Phase 2)**: Depends on Setup completion - BLOCKS all user stories +- **User Stories (Phase 3+)**: All depend on Foundational phase completion + - User stories can then proceed in parallel (if staffed) + - Or sequentially in priority order (P1 → P2 → P3) +- **Polish (Final Phase)**: Depends on all desired user stories being complete + +### User Story Dependencies + +- **User Story 1 (P1)**: Can start after Foundational (Phase 2) - Core Storage logic +- **User Story 2 (P1)**: Can start after Foundational (Phase 2) - RPC Layer (Technically depends on US1 storage implementation for full end-to-end, but server shell can be built in parallel) +- **User Story 3 (P2)**: Can start after Foundational (Phase 2) - Independent PD service + +### Within Each User Story + +- Tests (if included) MUST be written and FAIL before implementation +- Models before services +- Services before endpoints +- Core implementation before integration +- Story complete before moving to next priority + +### Parallel Opportunities + +- All Setup tasks marked [P] can run in parallel +- All Foundational tasks marked [P] can run in parallel (within Phase 2) +- Once Foundational phase completes, all user stories can start in parallel (if team capacity allows) +- All tests for a user story marked [P] can run in parallel +- Models within a story marked [P] can run in parallel +- Different 
user stories can be worked on in parallel by different team members + +--- + +## Parallel Example: User Story 1 + +```bash +# Launch all tests for User Story 1 together (if tests requested): +Task: "Create unit tests for StorageEngine::put_raw in rdb-storage/src/engine.rs" +Task: "Create unit tests for StorageEngine::compare_and_swap (success/fail) in rdb-storage/src/engine.rs" + +# Launch all models for User Story 1 together: +Task: "Implement StorageEngine trait definition in rdb-storage/src/lib.rs" +Task: "Implement RocksEngine struct wrapping RocksDB in rdb-storage/src/rocks_engine.rs" +``` + +--- + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Setup +2. Complete Phase 2: Foundational (CRITICAL - blocks all stories) +3. Complete Phase 3: User Story 1 +4. **STOP and VALIDATE**: Test User Story 1 independently +5. Deploy/demo if ready + +### Incremental Delivery + +1. Complete Setup + Foundational → Foundation ready +2. Add User Story 1 → Test independently → Deploy/Demo (MVP!) +3. Add User Story 2 → Test independently → Deploy/Demo +4. Add User Story 3 → Test independently → Deploy/Demo +5. Each story adds value without breaking previous stories + +### Parallel Team Strategy + +With multiple developers: + +1. Team completes Setup + Foundational together +2. Once Foundational is done: + - Developer A: User Story 1 + - Developer B: User Story 2 + - Developer C: User Story 3 +3. 
Stories complete and integrate independently diff --git a/flaredb/specs/001-multi-raft/spec.md b/flaredb/specs/001-multi-raft/spec.md new file mode 100644 index 0000000..c67d914 --- /dev/null +++ b/flaredb/specs/001-multi-raft/spec.md @@ -0,0 +1,115 @@ +# Feature Specification: [FEATURE NAME] + +**Feature Branch**: `[###-feature-name]` +**Created**: [DATE] +**Status**: Draft +**Input**: User description: "$ARGUMENTS" + +## User Scenarios & Testing *(mandatory)* + + + +### User Story 1 - [Brief Title] (Priority: P1) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently - e.g., "Can be fully tested by [specific action] and delivers [specific value]"] + +**Acceptance Scenarios**: + +1. **Given** [initial state], **When** [action], **Then** [expected outcome] +2. **Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +### User Story 2 - [Brief Title] (Priority: P2) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently] + +**Acceptance Scenarios**: + +1. **Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +### User Story 3 - [Brief Title] (Priority: P3) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently] + +**Acceptance Scenarios**: + +1. **Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +[Add more user stories as needed, each with an assigned priority] + +### Edge Cases + + + +- What happens when [boundary condition]? +- How does system handle [error scenario]? 
+ +## Requirements *(mandatory)* + + + +### Functional Requirements + +- **FR-001**: System MUST [specific capability, e.g., "allow users to create accounts"] +- **FR-002**: System MUST [specific capability, e.g., "validate email addresses"] +- **FR-003**: Users MUST be able to [key interaction, e.g., "reset their password"] +- **FR-004**: System MUST [data requirement, e.g., "persist user preferences"] +- **FR-005**: System MUST [behavior, e.g., "log all security events"] + +*Example of marking unclear requirements:* + +- **FR-006**: System MUST authenticate users via [NEEDS CLARIFICATION: auth method not specified - email/password, SSO, OAuth?] +- **FR-007**: System MUST retain user data for [NEEDS CLARIFICATION: retention period not specified] + +### Key Entities *(include if feature involves data)* + +- **[Entity 1]**: [What it represents, key attributes without implementation] +- **[Entity 2]**: [What it represents, relationships to other entities] + +## Success Criteria *(mandatory)* + + + +### Measurable Outcomes + +- **SC-001**: [Measurable metric, e.g., "Users can complete account creation in under 2 minutes"] +- **SC-002**: [Measurable metric, e.g., "System handles 1000 concurrent users without degradation"] +- **SC-003**: [User satisfaction metric, e.g., "90% of users successfully complete primary task on first attempt"] +- **SC-004**: [Business metric, e.g., "Reduce support tickets related to [X] by 50%"] diff --git a/flaredb/specs/002-raft-features/checklists/requirements.md b/flaredb/specs/002-raft-features/checklists/requirements.md new file mode 100644 index 0000000..7c1f78e --- /dev/null +++ b/flaredb/specs/002-raft-features/checklists/requirements.md @@ -0,0 +1,34 @@ +# Specification Quality Checklist: Raft Core Replication + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2025-12-01 +**Feature**: specs/002-raft-features/spec.md + +## Content Quality + +- [X] No implementation details 
(languages, frameworks, APIs) +- [X] Focused on user value and business needs +- [X] Written for non-technical stakeholders +- [X] All mandatory sections completed + +## Requirement Completeness + +- [X] No [NEEDS CLARIFICATION] markers remain +- [X] Requirements are testable and unambiguous +- [X] Success criteria are measurable +- [X] Success criteria are technology-agnostic (no implementation details) +- [X] All acceptance scenarios are defined +- [X] Edge cases are identified +- [X] Scope is clearly bounded +- [X] Dependencies and assumptions identified + +## Feature Readiness + +- [X] All functional requirements have clear acceptance criteria +- [X] User scenarios cover primary flows +- [X] Feature meets measurable outcomes defined in Success Criteria +- [X] No implementation details leak into specification + +## Notes + +- Items marked incomplete require spec updates before `/speckit.clarify` or `/speckit.plan` diff --git a/flaredb/specs/002-raft-features/contracts/raft-service.md b/flaredb/specs/002-raft-features/contracts/raft-service.md new file mode 100644 index 0000000..3bb5683 --- /dev/null +++ b/flaredb/specs/002-raft-features/contracts/raft-service.md @@ -0,0 +1,35 @@ +# Raft Service Contract (gRPC) + +## Overview + +Single RPC entrypoint for Raft message exchange; uses raft-rs `Message` protobuf encoding (prost). + +## Service + +``` +service RaftService { + rpc Send(RaftMessage) returns (RaftResponse); +} +``` + +## Messages + +- **RaftMessage** + - `message: bytes` (serialized `raft::eraftpb::Message` via prost) + +- **RaftResponse** + - Empty payload; errors conveyed via gRPC status + +## Expectations + +- Client (peer) wraps raft-rs `Message` and posts to remote peer via `Send`. +- Receivers decode and feed into `RawNode::step`, then drive `on_ready` to persist/apply. +- Transport must retry/transient-handle UNAVAILABLE; fail fast on INVALID_ARGUMENT decode errors. 
+ +## Test Hooks + +- Integration harness should: + - Start 3 peers with distinct addresses. + - Wire RaftService between peers. + - Propose on leader; verify followers receive and persist entries. + - Simulate follower stop/restart and verify catch-up via `Send`. diff --git a/flaredb/specs/002-raft-features/data-model.md b/flaredb/specs/002-raft-features/data-model.md new file mode 100644 index 0000000..d97f404 --- /dev/null +++ b/flaredb/specs/002-raft-features/data-model.md @@ -0,0 +1,34 @@ +# Data Model: Raft Core Replication + +## Entities + +- **Peer** + - Fields: `id (u64)`, `region_id (u64)`, `state (Leader/Follower/Candidate)`, `term (u64)`, `commit_index (u64)`, `last_applied (u64)` + - Relationships: owns `RaftStorage`; exchanges `RaftLogEntry` with other peers. + - Constraints: single region scope for this phase; fixed voter set of 3. + +- **RaftLogEntry** + - Fields: `index (u64)`, `term (u64)`, `command (bytes)`, `context (bytes, optional)` + - Relationships: persisted in `raft_log` CF; applied to state machine when committed. + - Constraints: indices strictly increasing; term monotonic per election; applied in order. + +- **HardState** + - Fields: `current_term (u64)`, `voted_for (u64)`, `commit_index (u64)` + - Relationships: persisted in `raft_state` CF; loaded at startup before participating. + - Constraints: must be flushed atomically with log appends when advancing commit index. + +- **ConfState** + - Fields: `voters (Vec)` + - Relationships: persisted in `raft_state` CF; defines quorum (majority of 3). + - Constraints: static for this phase; changes require future joint consensus. + +- **ReplicationState** + - Fields: `match_index (u64)`, `next_index (u64)`, `pending (bool)` + - Relationships: maintained per follower in memory; not persisted. + - Constraints: drives AppendEntries backoff and progress. + +## State Transitions + +- Peer transitions: Follower → Candidate → Leader on election; Leader → Follower on higher term or failed election. 
+- Log application: when `commit_index` advances, apply entries in order to state machine; `last_applied` increases monotonically. +- Recovery: on restart, load `HardState`, `ConfState`, and log; reconcile with leader via AppendEntries (truncate/append) before applying new entries. diff --git a/flaredb/specs/002-raft-features/plan.md b/flaredb/specs/002-raft-features/plan.md new file mode 100644 index 0000000..4b921a1 --- /dev/null +++ b/flaredb/specs/002-raft-features/plan.md @@ -0,0 +1,69 @@ +# Implementation Plan: Raft Core Replication + +**Branch**: `002-raft-features` | **Date**: 2025-12-01 | **Spec**: [specs/002-raft-features/spec.md](specs/002-raft-features/spec.md) +**Input**: Feature specification from `/specs/002-raft-features/spec.md` + +**Note**: This template is filled in by the `/speckit.plan` command. See `.specify/templates/commands/plan.md` for the execution workflow. + +## Summary + +Implement Raft core replication for FlareDB: single-node bootstrap with durable log/hard/conf state, majority replication across a fixed 3-node cluster, and follower recovery/catch-up. Build on the existing Rust workspace (raft-rs, RocksDB) with tonic-based transport already present in the repo. 
+ +## Technical Context + +**Language/Version**: Rust (stable, via Nix flake) +**Primary Dependencies**: `raft` (tikv/raft-rs 0.7, prost codec), `tokio`, `tonic`/`prost`, `rocksdb`, `slog` +**Storage**: RocksDB column families (`raft_log`, `raft_state`) for log, hard state, and conf state +**Testing**: `cargo test` (unit/integration), scripted multi-node harness to be added for replication scenarios +**Target Platform**: Linux (x86_64), Nix dev shell +**Project Type**: Rust workspace (multi-crate: rdb-proto, rdb-storage, rdb-server, rdb-pd, rdb-client, rdb-cli) +**Performance Goals**: From spec SCs — single-node commit ≤2s; 3-node majority commit ≤3s; follower catch-up ≤5s after rejoin +**Constraints**: Fixed 3-node membership for this phase; no dynamic add/remove; minority must not commit +**Scale/Scope**: Cluster size 3; log volume moderate (dev/test scale) sufficient to validate recovery and catch-up + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +- Test-First: Plan includes unit/integration tests for Raft storage, proposal/commit, and recovery paths. +- Reliability & Coverage: CI to run `cargo test`; integration harness to cover cross-node replication. +- Simplicity & Readability: Use existing crates (raft-rs, rocksdb); avoid bespoke protocols. +- Observability: Ensure structured logs on Raft events/errors; failures must be actionable. +- Versioning & Compatibility: Proto changes, if any, must be called out; fixed membership avoids dynamic reconfig in this phase. +No constitution violations identified; gate PASS. 
+ +## Project Structure + +### Documentation (this feature) + +```text +specs/002-raft-features/ +├── plan.md # This file +├── research.md # Phase 0 output +├── data-model.md # Phase 1 output +├── quickstart.md # Phase 1 output +├── contracts/ # Phase 1 output +└── tasks.md # Phase 2 output (via /speckit.tasks) +``` + +### Source Code (repository root) + +```text +Cargo.toml # workspace +rdb-proto/ # proto definitions +rdb-storage/ # RocksDB storage + Raft CFs +rdb-server/ # Raft peer, gRPC services +rdb-pd/ # placement driver (not primary in this feature) +rdb-client/ # client SDK/CLI (control hooks if needed) +rdb-cli/ # auxiliary CLI +scripts/ # verification scripts +tests/ # integration harness (to be added under rdb-server or workspace) +``` + +**Structure Decision**: Use existing Rust workspace layout; place Raft-focused tests/harness under `rdb-server/tests` or workspace `tests/` as appropriate; contracts under `specs/002-raft-features/contracts/`. + +## Complexity Tracking + +| Violation | Why Needed | Simpler Alternative Rejected Because | +|-----------|------------|-------------------------------------| +| N/A | | | diff --git a/flaredb/specs/002-raft-features/quickstart.md b/flaredb/specs/002-raft-features/quickstart.md new file mode 100644 index 0000000..289add7 --- /dev/null +++ b/flaredb/specs/002-raft-features/quickstart.md @@ -0,0 +1,39 @@ +# Quickstart: Raft Core Replication + +## Prerequisites +- Nix dev shell: `nix develop` +- Ports available: 50051, 50052, 50053 (Raft/gRPC) +- Clean data dirs for each node + +## 1) Build & Unit Tests +```bash +nix develop -c cargo build +nix develop -c cargo test -p rdb-server -- service::tests::get_returns_value_and_version +nix develop -c cargo test -p rdb-server -- peer::tests::single_node_propose_persists_log +``` + +## 2) Start a 3-Node Cluster (manual) +```bash +# Terminal 1 +nix develop -c cargo run --bin rdb-server -- --addr 127.0.0.1:50051 --data-dir /tmp/rdb-node1 +# Terminal 2 +nix develop -c cargo 
run --bin rdb-server -- --addr 127.0.0.1:50052 --data-dir /tmp/rdb-node2 +# Terminal 3 +nix develop -c cargo run --bin rdb-server -- --addr 127.0.0.1:50053 --data-dir /tmp/rdb-node3 +``` + +## 3) Propose & Verify (temporary approach) +- Use the forthcoming integration harness (under `rdb-server/tests`) to: + - Elect a leader (campaign) + - Propose a command (e.g., `"hello"`) + - Assert at least two nodes have the entry at the same index/term and commit +- For now, run: +```bash +nix develop -c cargo test -p rdb-server -- --ignored +``` +(ignored tests will host the multi-node harness once added) + +## 4) Recovery Check +- Stop one follower process, keep leader + other follower running. +- Propose another entry. +- Restart the stopped follower with the same data dir; verify logs show catch-up and committed entries applied (via test harness assertions). diff --git a/flaredb/specs/002-raft-features/research.md b/flaredb/specs/002-raft-features/research.md new file mode 100644 index 0000000..8768ede --- /dev/null +++ b/flaredb/specs/002-raft-features/research.md @@ -0,0 +1,23 @@ +# Research: Raft Core Replication (002-raft-features) + +## Decisions + +- **Raft library**: Use `raft` (tikv/raft-rs 0.7, prost-codec). + - *Rationale*: Battle-tested implementation, already wired in repo; supports necessary APIs for storage/transport. + - *Alternatives considered*: `openraft` (heavier refactor), custom Raft (too risky/time-consuming). + +- **Log/State persistence**: Persist log entries, hard state, conf state in RocksDB CFs (`raft_log`, `raft_state`). + - *Rationale*: RocksDB already provisioned and used; column families align with separation of concerns; durable restart semantics. + - *Alternatives considered*: In-memory (unsafe for recovery), separate files (adds new IO path, no benefit). + +- **Cluster scope**: Fixed 3-node membership for this phase; no dynamic add/remove. 
+ - *Rationale*: Matches spec clarification; reduces scope to core replication/recovery; simpler correctness surface. + - *Alternatives considered*: Joint consensus/dynamic membership (out of scope now). + +- **Transport**: Continue with tonic/prost gRPC messages for Raft network exchange. + - *Rationale*: Existing RaftService in repo; shared proto tooling; avoids new protocol surface. + - *Alternatives considered*: custom TCP/UDP transport (unnecessary for current goals). + +- **Testing approach**: Unit tests for storage/persistence; single-node campaign/propose; multi-node integration harness to validate majority commit and follower catch-up. + - *Rationale*: Aligns with constitution Test-First; exercises durability and replication behaviors. + - *Alternatives considered*: manual ad-hoc testing (insufficient coverage). diff --git a/flaredb/specs/002-raft-features/spec.md b/flaredb/specs/002-raft-features/spec.md new file mode 100644 index 0000000..93acca0 --- /dev/null +++ b/flaredb/specs/002-raft-features/spec.md @@ -0,0 +1,92 @@ +# Feature Specification: Raft Core Replication + +**Feature Branch**: `002-raft-features` +**Created**: 2025-12-01 +**Status**: Draft +**Input**: User description: "Raft関連の機能についてお願いします。" + +## Clarifications + +### Session 2025-12-01 +- Q: Should this phase assume fixed 3-node membership or include dynamic membership? → A: Fixed 3-node, extensible for future scaling. + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - Single-Node Raft Baseline (Priority: P1) + +As a platform engineer, I want a single-node Raft instance to accept proposals, elect a leader, and persist committed entries so I can validate the log/storage plumbing before scaling out. + +**Why this priority**: Establishes correctness of log append/apply and persistence; blocks multi-node rollout. + +**Independent Test**: Start one node, trigger self-election, propose an entry, verify it is committed and applied to storage with the expected data. 
+ +**Acceptance Scenarios**: + +1. **Given** a single node started fresh, **When** it campaigns, **Then** it becomes leader and can accept proposals. +2. **Given** a proposed entry "e1", **When** it commits, **Then** storage contains "e1" and last index increments by 1. + +--- + +### User Story 2 - Multi-Node Replication (Priority: P1) + +As a platform engineer, I want a 3-node Raft cluster to replicate entries to a majority so that writes remain durable under follower failure. + +**Why this priority**: Majority replication is the core availability guarantee of Raft. + +**Independent Test**: Start 3 nodes, elect a leader, propose an entry; verify leader and at least one follower store the entry at the same index/term and report commit. + +**Acceptance Scenarios**: + +1. **Given** a 3-node cluster, **When** a leader is elected, **Then** at least two nodes acknowledge commit for the same index/term. +2. **Given** a committed entry on the leader, **When** one follower is stopped, **Then** the other follower still receives and persists the entry. + +--- + +### User Story 3 - Failure and Recovery (Priority: P2) + +As an operator, I want a stopped follower to recover and catch up without losing committed data so that the cluster can heal after restarts. + +**Why this priority**: Ensures durability across restarts and supports rolling maintenance. + +**Independent Test**: Commit an entry, stop a follower, commit another entry, restart the follower; verify it restores state and applies all committed entries. + +**Acceptance Scenarios**: + +1. **Given** a follower stopped after entry N is committed, **When** the cluster commits entry N+1 while it is down, **Then** on restart the follower installs both entries in order. +2. **Given** divergent logs on restart, **When** leader sends AppendEntries, **Then** follower truncates/aligns to leader and preserves committed suffix. + +--- + +### Edge Cases + +- Leader crash immediately after commit but before followers apply. 
+- Network partition isolating a minority vs. majority; minority must not commit new entries. +- Log holes or conflicting terms on recovery must be reconciled to leader’s log. + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: The system MUST support single-node leader election and proposal handling without external coordination. +- **FR-002**: The system MUST replicate log entries to a majority in a 3-node cluster before marking them committed. +- **FR-003**: The system MUST persist log entries, hard state (term, vote), and conf state to durable storage so that restarts preserve committed progress. +- **FR-004**: The system MUST apply committed entries to the underlying storage engine in log order without gaps. +- **FR-005**: The system MUST prevent a node in a minority partition from committing new entries while isolated. +- **FR-006**: On restart, a node MUST reconcile its log with the leader (truncate/append) to match the committed log and reapply missing committed entries. +- **FR-007**: For this phase, operate a fixed 3-node membership (no dynamic add/remove), but architecture must allow future extension to scale out safely. + +### Key Entities + +- **Peer**: A Raft node with ID, region scope, in-memory state machine, and access to durable Raft storage. +- **Raft Log Entry**: Indexed record containing term and opaque command bytes; persisted and replicated. +- **Hard State**: Term, vote, commit index persisted to ensure safety across restarts. +- **Conf State**: Voter set defining the quorum for replication. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: Single-node bootstraps and accepts a proposal within 2 seconds, committing it and persisting the entry. +- **SC-002**: In a 3-node cluster, a committed entry is present on at least two nodes within 3 seconds of proposal. 
+- **SC-003**: After a follower restart, all previously committed entries are restored and applied in order within 5 seconds of rejoining a healthy leader. +- **SC-004**: During a minority partition, isolated nodes do not advance commit index or apply uncommitted entries. diff --git a/flaredb/specs/002-raft-features/tasks.md b/flaredb/specs/002-raft-features/tasks.md new file mode 100644 index 0000000..bec8e33 --- /dev/null +++ b/flaredb/specs/002-raft-features/tasks.md @@ -0,0 +1,128 @@ +--- +description: "Task list for Raft Core Replication" +--- + +# Tasks: Raft Core Replication + +**Input**: Design documents from `/specs/002-raft-features/` +**Prerequisites**: plan.md (required), spec.md (required for user stories), research.md, data-model.md, contracts/ + +**Tests**: Required per constitution; include unit/integration tests for Raft storage, proposal/commit, replication, and recovery. + +**Organization**: Tasks are grouped by user story to enable independent implementation and testing. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (e.g., US1, US2, US3) +- Include exact file paths in descriptions + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Ensure tooling and layout are ready for Raft feature work. + +- [X] T001 Verify Raft proto service definition matches contract in `rdb-proto/src/raft_server.proto` +- [X] T002 Ensure Raft gRPC server/client wiring is enabled in `rdb-server/src/main.rs` and `rdb-server/src/raft_service.rs` + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Durable Raft storage primitives required by all stories. 
+ +- [X] T003 Implement complete Raft storage persistence (log/hard state/conf state read/write) in `rdb-server/src/raft_storage.rs` +- [X] T004 Add unit tests for Raft storage persistence (log append, load, truncate) in `rdb-server/src/raft_storage.rs` +- [X] T005 Ensure Peer ready loop persists entries and hard state before apply in `rdb-server/src/peer.rs` + +**Checkpoint**: Raft storage durability verified. + +--- + +## Phase 3: User Story 1 - Single-Node Raft Baseline (Priority: P1) + +**Goal**: Single node can self-elect, propose, commit, and apply entries to storage. + +**Independent Test**: Run unit/integration tests that start one peer, campaign, propose a command, and verify commit/apply and durable log. + +### Tests +- [X] T006 [US1] Add single-node campaign/propose/apply test in `rdb-server/src/peer.rs` (cfg(test)) or `rdb-server/tests/test_single_node.rs` + +### Implementation +- [X] T007 [US1] Implement Peer campaign/propose handling with log apply in `rdb-server/src/peer.rs` +- [X] T008 [US1] Expose a simple propose entry point (e.g., CLI or helper) for single-node testing in `rdb-server/src/main.rs` +- [X] T009 [US1] Validate single-node flow passes tests and persists entries (run `cargo test -p rdb-server -- single_node`) + +**Checkpoint**: Single-node Raft end-to-end verified. + +--- + +## Phase 4: User Story 2 - Multi-Node Replication (Priority: P1) + +**Goal**: 3-node cluster replicates entries to a majority; leader/follower paths wired via gRPC. + +**Independent Test**: Integration harness spins up 3 nodes, elects leader, proposes entry, asserts commit on at least 2 nodes. 
+ +### Tests +- [X] T010 [US2] Create 3-node integration test harness in `rdb-server/tests/test_replication.rs` to validate majority commit + +### Implementation +- [X] T011 [US2] Wire RaftService transport send/receive to dispatch messages to peers in `rdb-server/src/raft_service.rs` +- [X] T012 [P] [US2] Implement peer registry/peer manager to track remote addresses and send Raft messages in `rdb-server/src/peer_manager.rs` +- [X] T013 [US2] Update server startup to create/join fixed 3-node cluster with configured peers in `rdb-server/src/main.rs` +- [X] T014 [US2] Ensure ready loop sends outbound messages produced by RawNode in `rdb-server/src/peer.rs` +- [X] T015 [US2] Verify majority replication via integration harness (run `cargo test -p rdb-server -- test_replication`) + +**Checkpoint**: Majority replication validated on 3 nodes. + +--- + +## Phase 5: User Story 3 - Failure and Recovery (Priority: P2) + +**Goal**: Followers can restart and catch up without losing committed entries; isolation prevents commits. + +**Independent Test**: Integration test stops a follower, commits entry while down, restarts follower, and verifies log reconciliation and apply. 
+ +### Tests +- [X] T016 [US3] Add follower restart/catch-up integration test in `rdb-server/tests/test_recovery.rs` +- [X] T016 [US3] Add follower restart/catch-up integration test in `rdb-server/tests/test_recovery.rs` (in progress; currently ignored in `test_replication.rs`) + +### Implementation +- [X] T017 [US3] Implement startup recovery: load HardState/ConfState/log and reconcile via AppendEntries in `rdb-server/src/peer.rs` +- [X] T018 [US3] Handle log truncate/append on conflict and apply committed entries after recovery in `rdb-server/src/peer.rs` +- [X] T019 [US3] Add isolation guard: prevent commit advancement on minority partition detection (e.g., via quorum checks) in `rdb-server/src/peer.rs` +- [X] T020 [US3] Validate recovery/integration tests pass (run `cargo test -p rdb-server -- test_recovery`) + +**Checkpoint**: Recovery and partition safety validated. + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Hardening and operability. + +- [X] T021 Add structured Raft logging (term/index/apply/commit) in `rdb-server` with slog +- [X] T022 Add quickstart or script to launch 3-node cluster and run replication test in `scripts/verify-raft.sh` +- [X] T023 Run full workspace tests and format/lint (`cargo test`, `cargo fmt`, `cargo clippy`) + +--- + +## Dependencies & Execution Order + +- Foundational (Phase 2) blocks all Raft user stories. +- US1 must complete before US2/US3 (builds basic propose/apply). +- US2 should precede US3 (replication before recovery). +- Polish runs last. + +## Parallel Examples + +- T011 (transport wiring) and T012 (peer manager) can proceed in parallel once T003–T005 are done. +- US2 tests (T010) can be authored in parallel with transport implementation, then enabled once wiring lands. +- Logging and script polish (T021–T022) can run in parallel after core stories complete. + +## Implementation Strategy + +1. Complete Foundational (durable storage). +2. Deliver US1 (single-node MVP). +3. 
Deliver US2 (majority replication). +4. Deliver US3 (recovery/partition safety). +5. Polish (logging, scripts, fmt/clippy). diff --git a/flaredb/specs/003-kvs-consistency/checklists/requirements.md b/flaredb/specs/003-kvs-consistency/checklists/requirements.md new file mode 100644 index 0000000..ee9c125 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/checklists/requirements.md @@ -0,0 +1,34 @@ +# Specification Quality Checklist: Distributed KVS Consistency Modes + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2025-12-01 +**Feature**: specs/003-kvs-consistency/spec.md + +## Content Quality + +- [X] No implementation details (languages, frameworks, APIs) +- [X] Focused on user value and business needs +- [X] Written for non-technical stakeholders +- [X] All mandatory sections completed + +## Requirement Completeness + +- [X] No [NEEDS CLARIFICATION] markers remain +- [X] Requirements are testable and unambiguous +- [X] Success criteria are measurable +- [X] Success criteria are technology-agnostic (no implementation details) +- [X] All acceptance scenarios are defined +- [X] Edge cases are identified +- [X] Scope is clearly bounded +- [X] Dependencies and assumptions identified + +## Feature Readiness + +- [X] All functional requirements have clear acceptance criteria +- [X] User scenarios cover primary flows +- [X] Feature meets measurable outcomes defined in Success Criteria +- [X] No implementation details leak into specification + +## Notes + +- Items marked incomplete require spec updates before `/speckit.clarify` or `/speckit.plan` diff --git a/flaredb/specs/003-kvs-consistency/contracts/kv_cas.md b/flaredb/specs/003-kvs-consistency/contracts/kv_cas.md new file mode 100644 index 0000000..5a11081 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/contracts/kv_cas.md @@ -0,0 +1,29 @@ +# KvCas contracts (strong consistency) + +## CompareAndSwap +- **RPC**: `kvrpc.KvCas/CompareAndSwap` +- 
**Request**: + - `namespace: string` (empty => `default`) + - `key: bytes` + - `value: bytes` + - `expected_version: uint64` +- **Response**: + - `success: bool` + - `current_version: uint64` + - `new_version: uint64` +- **Semantics**: + - Allowed only for `strong` namespaces; returns `FailedPrecondition` otherwise or when not leader (redirect required). + - Proposes via Raft; state machine applies with LWW timestamp wrapper. + +## Get +- **RPC**: `kvrpc.KvCas/Get` +- **Request**: + - `namespace: string` (empty => `default`) + - `key: bytes` +- **Response**: + - `found: bool` + - `value: bytes` + - `version: uint64` +- **Semantics**: + - Allowed only for `strong` namespaces; returns `FailedPrecondition` if not leader. + - Reads versioned value (timestamp-prefixed) and returns decoded value plus version. diff --git a/flaredb/specs/003-kvs-consistency/contracts/kv_raw.md b/flaredb/specs/003-kvs-consistency/contracts/kv_raw.md new file mode 100644 index 0000000..f5ca4f9 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/contracts/kv_raw.md @@ -0,0 +1,25 @@ +# KvRaw contracts (eventual consistency) + +## RawPut +- **RPC**: `kvrpc.KvRaw/RawPut` +- **Request**: + - `namespace: string` (empty => `default`) + - `key: bytes` + - `value: bytes` +- **Response**: + - `success: bool` +- **Semantics**: + - Allowed only for namespaces in `eventual` mode; returns `FailedPrecondition` otherwise. + - Writes locally with LWW timestamp prefix and queues best-effort async replication via Raft when a leader is present. + +## RawGet +- **RPC**: `kvrpc.KvRaw/RawGet` +- **Request**: + - `namespace: string` (empty => `default`) + - `key: bytes` +- **Response**: + - `found: bool` + - `value: bytes` (empty if not found) +- **Semantics**: + - Allowed only for `eventual` namespaces; returns `FailedPrecondition` otherwise. + - Returns value decoded from LWW-encoded payload (drops the timestamp). 
diff --git a/flaredb/specs/003-kvs-consistency/contracts/raft_service.md b/flaredb/specs/003-kvs-consistency/contracts/raft_service.md new file mode 100644 index 0000000..546c815 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/contracts/raft_service.md @@ -0,0 +1,33 @@ +# RaftService contracts (namespace mode ops) + +## GetMode + +- **RPC**: `RaftService/GetMode` +- **Request**: `namespace: string` (empty => `default`) +- **Response**: `mode: string` (`"strong"` or `"eventual"`) + +## UpdateNamespaceMode + +- **RPC**: `RaftService/UpdateNamespaceMode` +- **Request**: + - `namespace: string` (required) + - `mode: string` (`"strong"` or `"eventual"`, required) +- **Response**: `mode` object + - `namespace: string` + - `id: uint32` + - `mode: string` + - `from_default: bool` (true if created implicitly) + +## ListNamespaceModes + +- **RPC**: `RaftService/ListNamespaceModes` +- **Request**: empty +- **Response**: `namespaces[]` + - `namespace: string` + - `id: uint32` + - `mode: string` + - `from_default: bool` + +### Error cases +- `InvalidArgument` when mode is not `"strong"` or `"eventual"` or namespace is empty for updates. +- `FailedPrecondition` if Raft messages are addressed to a different peer. 
diff --git a/flaredb/specs/003-kvs-consistency/data-model.md b/flaredb/specs/003-kvs-consistency/data-model.md new file mode 100644 index 0000000..d035af5 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/data-model.md @@ -0,0 +1,26 @@ +# Data Model: Namespace Consistency + +- Namespace + - id: u32 + - name: string + - mode: ConsistencyMode (strong | eventual) + - explicit: bool (true when user-configured; false when created implicitly) + +- NamespaceModeDiff + - namespace: string + - self_id: u32 + - other_id: u32 + - self_mode: ConsistencyMode + - other_mode: ConsistencyMode + +- ClusterConfig + - namespaces: [Namespace] + - default_mode: ConsistencyMode + +- ConsistencyMode + - values: strong | eventual + +- ConvergenceLag + - p50_ms: u64 + - p95_ms: u64 + - max_ms: u64 diff --git a/flaredb/specs/003-kvs-consistency/plan.md b/flaredb/specs/003-kvs-consistency/plan.md new file mode 100644 index 0000000..1ee608d --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/plan.md @@ -0,0 +1,76 @@ +# Implementation Plan: Distributed KVS Consistency Modes + +**Branch**: `003-kvs-consistency` | **Date**: 2025-12-01 | **Spec**: specs/003-kvs-consistency/spec.md +**Input**: Feature specification from `/specs/003-kvs-consistency/spec.md` + +**Note**: This template is filled in by the `/speckit.plan` command. See `.specify/templates/commands/plan.md` for the execution workflow. + +## Summary + +Deliver a deployable distributed KVS supporting strong consistency (quorum read/write) and eventual consistency (LWW default), with namespace-level mode selection, safe mode switching, convergence/recovery behavior, and observability. 
+ +## Technical Context + +**Language/Version**: Rust (stable, via Nix flake) +**Primary Dependencies**: raft-rs, tonic/prost gRPC, RocksDB, tokio +**Storage**: RocksDB for raft log/state and KV data +**Testing**: cargo test (unit/integration), extend rdb-server multi-node tests for namespace/mode behaviors +**Target Platform**: Linux server (Nix dev shell) +**Project Type**: Distributed server (rdb-server) with gRPC API/CLI +**Performance Goals**: Strong mode quorum commit p95 ~1–2s; eventual mode convergence within a few seconds under normal network; observable lag metrics +**Constraints**: Constitution (test-first, observability, compatibility); fixed membership scope for this phase; namespace-level mode config +**Scale/Scope**: Small cluster (3–5 nodes) dev target; multiple namespaces with per-namespace mode + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +- Test-First: Add/extend integration tests for strong/eventual modes, namespace config, convergence/recovery. +- Reliability & Coverage: Keep existing Raft tests green; new tests cover mode behaviors and failures. +- Simplicity & Readability: Reuse existing crates and current server structure; avoid bespoke protocols. +- Observability: Structured logs/metrics for mode, convergence lag, quorum status, config state. +- Versioning & Compatibility: Call out any gRPC/contract changes; fixed membership scope maintained. 
+ +## Project Structure + +### Documentation (this feature) + +```text +specs/003-kvs-consistency/ +├── plan.md +├── research.md +├── data-model.md +├── quickstart.md +├── contracts/ +└── tasks.md # via /speckit.tasks +``` + +### Source Code (repository root) + +```text +rdb-server/ + src/ + peer.rs + peer_manager.rs + raft_service.rs + config/ # add namespace/mode config handling + api/ # gRPC handlers (mode/config endpoints if needed) + tests/ + test_replication.rs (extend for mode/namespace cases) + +rdb-proto/ + src/*.proto # update if API exposes mode/config + +scripts/ + verify-raft.sh # update or add mode verification script +``` + +**Structure Decision**: Extend existing rdb-server layout with namespace/mode config, tests under rdb-server/tests, contracts under specs/003-kvs-consistency/contracts. + +## Complexity Tracking + +> **Fill ONLY if Constitution Check has violations that must be justified** + +| Violation | Why Needed | Simpler Alternative Rejected Because | +|-----------|------------|-------------------------------------| +| N/A | | | diff --git a/flaredb/specs/003-kvs-consistency/quickstart.md b/flaredb/specs/003-kvs-consistency/quickstart.md new file mode 100644 index 0000000..3183d20 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/quickstart.md @@ -0,0 +1,78 @@ +# Quickstart: Namespace Consistency Modes + +This guide shows how to operate namespace-level consistency (strong vs eventual) now that runtime mode updates are supported. 
+ +## Boot a local cluster + +```bash +# Start three nodes with explicit namespace modes (default=strong, logs=eventual) +cargo run -p rdb-server -- --store-id 1 --addr 127.0.0.1:50051 --namespace-mode logs=eventual +cargo run -p rdb-server -- --store-id 2 --addr 127.0.0.1:50052 --peer 1=127.0.0.1:50051 --namespace-mode logs=eventual +cargo run -p rdb-server -- --store-id 3 --addr 127.0.0.1:50053 --peer 1=127.0.0.1:50051 --namespace-mode logs=eventual +``` + +## Inspect current modes + +`RaftService/GetMode` (single namespace) and `RaftService/ListNamespaceModes` (all namespaces) expose the active configuration and whether a namespace was implicitly created from the default. + +```bash +# List all namespaces and their modes +grpcurl -plaintext 127.0.0.1:50051 raftpb.RaftService/ListNamespaceModes + +# Check a specific namespace +grpcurl -plaintext -d '{"namespace":"logs"}' 127.0.0.1:50051 raftpb.RaftService/GetMode +``` + +The response includes `from_default=true` when the namespace was auto-created using the default mode. + +## Update a namespace mode (rolling safe) + +Mode updates are applied in-memory and picked up immediately by peers; roll across nodes to avoid divergence. + +```bash +# Switch "logs" to strong consistency on node 1 +grpcurl -plaintext -d '{"namespace":"logs","mode":"strong"}' \ + 127.0.0.1:50051 raftpb.RaftService/UpdateNamespaceMode + +# Repeat on each node; verify all agree +grpcurl -plaintext 127.0.0.1:50051 raftpb.RaftService/ListNamespaceModes +grpcurl -plaintext 127.0.0.1:50052 raftpb.RaftService/ListNamespaceModes +grpcurl -plaintext 127.0.0.1:50053 raftpb.RaftService/ListNamespaceModes +``` + +If nodes return different modes for the same namespace, treat it as a mismatch and reapply the update on the outlier nodes. + +## Client usage (KV) + +Strong namespaces use CAS/read/write through the Raft leader; eventual namespaces accept `RawPut/RawGet` locally with LWW replication. 
+ +```bash +# Eventual write/read +grpcurl -plaintext -d '{"namespace":"logs","key":"a","value":"b"}' \ + 127.0.0.1:50051 kvrpc.KvRaw/RawPut +grpcurl -plaintext -d '{"namespace":"logs","key":"a"}' \ + 127.0.0.1:50052 kvrpc.KvRaw/RawGet + +# Strong write/read +grpcurl -plaintext -d '{"namespace":"default","key":"a","value":"b","expected_version":0}' \ + 127.0.0.1:50051 kvrpc.KvCas/CompareAndSwap +grpcurl -plaintext -d '{"namespace":"default","key":"a"}' \ + 127.0.0.1:50051 kvrpc.KvCas/Get +``` + +## Ops checklist + +- Use `ListNamespaceModes` to confirm all nodes share the same mode set before traffic. +- Apply mode updates namespace-by-namespace on each node (or automate via PD) until `from_default=false` everywhere for configured namespaces. +- Keep the default namespace as strong unless explicitly relaxed. + +## Verification + +Run the hardened verify script before committing: + +```bash +scripts/verify-raft.sh +# Expected: cargo fmt clean, all rdb-server tests pass (strong/eventual mode flows) +``` + +This executes `cargo fmt` and `cargo test -p rdb-server --tests` in the Nix shell with protobuf/libclang prepared. diff --git a/flaredb/specs/003-kvs-consistency/research.md b/flaredb/specs/003-kvs-consistency/research.md new file mode 100644 index 0000000..5be7db4 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/research.md @@ -0,0 +1,15 @@ +# Research: Distributed KVS Consistency Modes (003-kvs-consistency) + +## Decisions + +- **Consistency scope**: Namespace-level selection of strong or eventual consistency. + - *Rationale*: Different tenants/workloads can choose per requirement. + - *Alternatives considered*: Cluster-wide only (too rigid). + +- **Eventual consistency conflict resolution**: Default LWW (last-write-wins); allow alternative policies via config. + - *Rationale*: Simple baseline with deterministic resolution; extensible for advanced policies. + - *Alternatives considered*: Version vectors/CRDT as default (more complex to operate by default). 
+ +## Open Questions + +- None (resolved by spec clarifications). diff --git a/flaredb/specs/003-kvs-consistency/spec.md b/flaredb/specs/003-kvs-consistency/spec.md new file mode 100644 index 0000000..af96692 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/spec.md @@ -0,0 +1,88 @@ +# Feature Specification: Distributed KVS Consistency Modes + +**Feature Branch**: `003-kvs-consistency` +**Created**: 2025-12-01 +**Status**: Draft +**Input**: User description: "とりあえず分散KVSの部分を使えるようにし、強整合性モードと結果整合性モードを実用可能な状態に持っていくまでの仕様を考えてください。" + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - 強整合性クラスタを安全に稼働 (Priority: P1) + +SRE/オペレータは、固定メンバー(例: 3ノード)のKVSクラスタを強整合性モードで起動し、書き込み・読み出しが常に最新状態で返ることを保証したい。 + +**Why this priority**: 強整合性がS3メタデータやSNSイベントの正確さの土台になるため。 + +**Independent Test**: 少なくとも3ノード構成で、リーダー経由のPut/Getが直ちに反映し、ダウン直後もコミット済みデータが失われないことを検証。 + +**Acceptance Scenarios**: + +1. **Given** 3ノードが強整合性モードで起動済み、**When** リーダーにキーを書き込み、**Then** 即座に全ノードで最新値が読み出せる(リーダーからの再取得)。 +2. **Given** 1ノードを停止、**When** 残り2ノードで読み書き、**Then** コミットは継続しデータ欠損がない(クォーラム成立時のみコミット)。 + +--- + +### User Story 2 - 結果整合性モードで高スループット運用 (Priority: P1) + +オペレータは、イベント処理や一時的なスパイク負荷向けに結果整合性モードを選択し、高スループットな書き込みを許容しつつ、一定時間内に最終的に同期させたい。 + +**Why this priority**: 書き込み偏重ワークロードでの性能確保とコスト最適化のため。 + +**Independent Test**: 結果整合性モードで大量Put後、一定のタイムウィンドウ内に全ノードへ反映し、古い値が一定時間内に整合することを確認。 + +**Acceptance Scenarios**: + +1. **Given** 結果整合性モードでキーを書き込み、**When** 1秒以内に別ノードから読み出し、**Then** 必ずしも最新とは限らないが一定時間後(例: 数秒以内)に最新値へ収束する。 +2. **Given** ネットワーク分断後に復旧、**When** 再同期処理が走る、**Then** コンフリクトは定義済みポリシー(例: last-write-wins)で解決される。 + +--- + +### User Story 3 - モード切替と運用観測 (Priority: P2) + +オペレータは、環境やワークロードに応じて強整合性/結果整合性モードを設定単位で切り替え、状態監視と異常検知ができることを望む。 + +**Why this priority**: 運用現場での柔軟性と安全性の両立が必要なため。 + +**Independent Test**: モード設定変更後の再起動またはローリング適用で、設定が反映され、メトリクス/ログで確認できる。 + +**Acceptance Scenarios**: + +1. **Given** クラスタ設定を強整合性→結果整合性に変更、**When** ローリングで適用、**Then** 全ノードが新モードで稼働し、メトリクスにモードが反映される。 +2. 
**Given** モード不一致のノードが存在、**When** オペレータが状況を確認、**Then** 管理UI/CLI/ログで不一致を検知でき、是正手順が明示される。 + +### Edge Cases + +- メンバー数がクォーラムを下回った状態での書き込み要求(強整合性では拒否、結果整合性ではキューイング/部分反映)。 +- ネットワーク分断後の再結合時、双方が進んだログを持つ場合の解決順序。 +- モード切替途中に障害が発生した場合のリカバリ手順と一貫性確保。 +- データサイズやホットキー偏重時のスロットリング/バックプレッシャー挙動。 + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: システムは強整合性モードでクォーラム書き込み/読み出しを行い、コミット済みデータを即時参照可能にする。 +- **FR-002**: システムは結果整合性モードで書き込みを受け付け、定義された収束時間内に全ノードへ反映させる。 +- **FR-003**: モード設定は名前空間単位で指定でき、クラスタは複数モードを同居させられる。 +- **FR-004**: 結果整合性モードのコンフリクト解決はデフォルトで last-write-wins(LWW)を採用し、設定で他方式を選択できる。 +- **FR-005**: モード変更は安全な手順(ローリング適用または再起動)で反映され、途中失敗時はロールバック手段がある。 +- **FR-006**: 強整合性モードではクォーラム未達時に書き込みを拒否し、明示的なエラーを返す。 +- **FR-007**: 結果整合性モードではクォーラム未達時も書き込みを受け付け、後続の同期で補填し、未反映の可能性をクライアントに示せる。 +- **FR-008**: 再起動/障害復旧後、保存されたログ/スナップショットから整合した状態へ自動復元し、必要な再同期を実行する。 +- **FR-009**: モード別の観測指標(レイテンシ、未同期レプリカ数、収束時間、拒否率)をメトリクス/ログとして出力する。 +- **FR-010**: 運用者がモード状態や不一致を確認できるCLI/ログ/メトリクス情報を提供する。 + +### Key Entities + +- **ClusterConfig**: クラスタID、ノード一覧、レプリカ数、現在の整合性モード、適用ステータス。 +- **ConsistencyPolicy**: モード種別(強整合/結果整合)、コンフリクト解決ポリシー、収束目標時間、適用範囲(クラスタ/名前空間)。 +- **ReplicationState**: ノードごとのログ進行度、未同期エントリ数、最後の収束時刻、ヘルス状態。 + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: 強整合性モードでの書き込み→読み出しがクォーラム成立時に最新値を即時返し、可用ノードがクォーラム未満なら明示的に失敗を返すことが確認できる。 +- **SC-002**: 結果整合性モードでの書き込みは、許容する収束時間内(例: 数秒以内)に全レプリカへ反映し、反映遅延をメトリクスで観測できる。 +- **SC-003**: ネットワーク分断からの復旧時、コンフリクト解決ポリシーに従ってデータが一貫した状態に自動で収束することをテストで確認できる。 +- **SC-004**: モード変更操作が安全に完了し、変更後のモードと各ノードの適用状況をメトリクス/ログで確認できる。 diff --git a/flaredb/specs/003-kvs-consistency/tasks.md b/flaredb/specs/003-kvs-consistency/tasks.md new file mode 100644 index 0000000..bac1ee4 --- /dev/null +++ b/flaredb/specs/003-kvs-consistency/tasks.md @@ -0,0 +1,119 @@ +--- +description: "Task list for Distributed KVS Consistency Modes" +--- + +# Tasks: Distributed KVS Consistency Modes + +**Input**: Design documents from `/specs/003-kvs-consistency/` 
+**Prerequisites**: plan.md (required), spec.md (user stories), research.md, data-model.md, contracts/ + +**Tests**: Required per constitution; include unit/integration tests for mode behaviors (strong/eventual), namespace config, convergence/recovery. + +**Organization**: Tasks are grouped by user story to enable independent implementation and testing. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (e.g., US1, US2, US3) +- Include exact file paths in descriptions + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Prepare config and API surfaces for namespace-level consistency modes. + +- [X] T001 Create namespace/mode config schema and defaults in `rdb-server/src/config/mod.rs` +- [X] T002 Update gRPC proto (if needed) to expose namespace/mode config endpoints in `rdb-proto/src/raft_server.proto` +- [X] T003 Add config loading/validation for namespace modes in `rdb-server/src/main.rs` + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Core plumbing for mode-aware replication and observability hooks. + +- [X] T004 Implement mode flag propagation to peers (strong/eventual per namespace) in `rdb-server/src/peer.rs` +- [X] T005 Add LWW conflict resolution helper for eventual mode in `rdb-server/src/peer.rs` +- [X] T006 Emit mode/lag/quorum metrics and structured logs in `rdb-server/src/raft_service.rs` and `rdb-server/src/peer.rs` + +**Checkpoint**: Mode flags flow through storage/peers; metrics/log hooks in place. + +--- + +## Phase 3: User Story 1 - 強整合性クラスタを安全に稼働 (Priority: P1) + +**Goal**: Quorum read/write with immediate visibility; reject writes without quorum. 
+ +### Tests +- [X] T007 [US1] Add strong-mode integration test (quorum write/read, node failure) in `rdb-server/tests/test_consistency.rs` + +### Implementation +- [X] T008 [US1] Enforce quorum writes/reads for strong mode in `rdb-server/src/peer.rs` +- [X] T009 [US1] Return explicit errors on quorum deficit in strong mode in `rdb-server/src/raft_service.rs` + +**Checkpoint**: Strong mode test passes; quorum enforcement confirmed. + +--- + +## Phase 4: User Story 2 - 結果整合性モードで高スループット運用 (Priority: P1) + +**Goal**: Accept writes under partial availability; converge within target window using LWW. + +### Tests +- [X] T010 [US2] Add eventual-mode integration test (delayed read then convergence) in `rdb-server/tests/test_consistency.rs` +- [X] T011 [P] [US2] Add partition/recovery test with LWW resolution in `rdb-server/tests/test_consistency.rs` + +### Implementation +- [X] T012 [US2] Implement eventual-mode write acceptance with async replication in `rdb-server/src/peer.rs` +- [X] T013 [US2] Apply LWW conflict resolution on replay/sync in `rdb-server/src/peer.rs` +- [X] T014 [US2] Track and expose convergence lag metrics in `rdb-server/src/peer_manager.rs` + +**Checkpoint**: Eventual mode converges within target window; LWW conflicts resolved. + +--- + +## Phase 5: User Story 3 - モード切替と運用観測 (Priority: P2) + +**Goal**: Safe mode changes per namespace and clear observability/state reporting. 
+ +### Tests +- [X] T015 [US3] Add mode-switch test (namespace strong↔eventual, rolling apply) in `rdb-server/tests/test_consistency.rs` +- [X] T016 [US3] Add mismatch detection test for inconsistent mode configs in `rdb-server/tests/test_consistency.rs` + +### Implementation +- [X] T017 [US3] Support mode configuration updates per namespace (reload/rolling) in `rdb-server/src/config/mod.rs` +- [X] T018 [US3] Expose mode state and mismatches via logs/metrics/optional gRPC in `rdb-server/src/raft_service.rs` +- [X] T019 [US3] Provide operator-facing quickstart/CLI instructions for mode ops in `specs/003-kvs-consistency/quickstart.md` + +**Checkpoint**: Mode switches apply safely; operators can detect/report mismatches. + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Hardening, docs, and verification scripts. + +- [X] T020 Add contract/OpenAPI updates for mode/config endpoints in `specs/003-kvs-consistency/contracts/` +- [X] T021 Add data model definitions for ClusterConfig/ConsistencyPolicy/ReplicationState in `specs/003-kvs-consistency/data-model.md` +- [X] T022 Update verification script to cover mode tests in `scripts/verify-raft.sh` +- [X] T023 Run full workspace checks (`cargo fmt`, `cargo test -p rdb-server --tests`) and document results in `specs/003-kvs-consistency/quickstart.md` + +--- + +## Dependencies & Execution Order + +- Phase 2 (Foundational) blocks all user stories. +- US1 (strong) and US2 (eventual) can proceed after foundational; US3 (mode ops) depends on config plumbing from Phases 1–2. +- Tests in each story precede implementation tasks. + +## Parallel Examples + +- T010 and T011 can run in parallel after T006 (tests for eventual mode scenarios). +- T012–T014 can run in parallel once T004–T006 are done (separate code paths for eventual replication and metrics). +- T018 and T019 can run in parallel after mode config plumbing (T017). + +## Implementation Strategy + +1. Lay config/API plumbing (Phases 1–2). +2. 
Deliver strong mode (US1) and eventual mode (US2) with tests. +3. Add mode switching/observability (US3). +4. Polish: contracts, data model docs, verification script, full test sweep. diff --git a/flaredb/specs/004-multi-raft/checklists/requirements.md b/flaredb/specs/004-multi-raft/checklists/requirements.md new file mode 100644 index 0000000..c550945 --- /dev/null +++ b/flaredb/specs/004-multi-raft/checklists/requirements.md @@ -0,0 +1,34 @@ +# Specification Quality Checklist: Multi-Raft (Static → Split → Move) + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2024-XX-XX +**Feature**: specs/004-multi-raft/spec.md + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +- Checklist reviewed; no open issues identified. 
diff --git a/flaredb/specs/004-multi-raft/contracts/pd.md b/flaredb/specs/004-multi-raft/contracts/pd.md new file mode 100644 index 0000000..da103ab --- /dev/null +++ b/flaredb/specs/004-multi-raft/contracts/pd.md @@ -0,0 +1,36 @@ +# Contracts: PD / Placement RPCs (Multi-Raft) + +Source of truth: `rdb-proto/src/pdpb.proto` + +## Services + +- **Pd** + - `RegisterStore(RegisterStoreRequest) -> RegisterStoreResponse` + - `GetRegion(GetRegionRequest) -> GetRegionResponse` + - `ListRegions(ListRegionsRequest) -> ListRegionsResponse` + - `MoveRegion(MoveRegionRequest) -> MoveRegionResponse` + +## Messages (selected) + +- `Region`: + - `id: u64` + - `start_key: bytes` + - `end_key: bytes` (empty = infinity) + - `peers: repeated u64` (store IDs) + - `leader_id: u64` + +- `Store`: + - `id: u64` + - `addr: string` + +- `MoveRegionRequest`: + - `region_id: u64` + - `from_store: u64` + - `to_store: u64` + +## Behaviors / Expectations + +- `ListRegions` is used at bootstrap and periodic refresh to populate routing. +- `MoveRegion` directs a leader to add a replica on `to_store` (ConfChange Add) and, after catch-up, remove `from_store` (ConfChange Remove). Current implementation keeps source online; removal can be triggered separately. +- Region key ranges returned by PD must be non-overlapping; nodes validate and fail startup on overlap. +- Heartbeat: nodes periodically refresh routing via `ListRegions` (30s). A dedicated heartbeat RPC can replace this in a future phase. diff --git a/flaredb/specs/004-multi-raft/data-model.md b/flaredb/specs/004-multi-raft/data-model.md new file mode 100644 index 0000000..a9d8240 --- /dev/null +++ b/flaredb/specs/004-multi-raft/data-model.md @@ -0,0 +1,45 @@ +# Data Model: Multi-Raft (Static → Split → Move) + +## Entities + +- **Store** + - `id: u64` + - `addr: String` + - Holds multiple `Peer` instances (one per `Region` replica) and reports status to PD. 
+ +- **Region** + - `id: u64` + - `start_key: bytes` + - `end_key: bytes` (empty = infinity) + - `voters: Vec<u64>` (store IDs) + - `leader_id: u64` + - `approx_size_bytes: u64` + +- **Peer** + - `store_id: u64` + - `region_id: u64` + - `raft_state: HardState, ConfState` + - `pending_eventual: VecDeque<(ns_id, key, value, ts)>` + +- **Placement Metadata (PD)** + - `stores: [Store]` + - `regions: [Region]` + - `move_directives: [(region_id, from_store, to_store)]` + +## Relationships + +- Store 1..* Peer (per Region replica) +- Region 1..* Peer (across Stores) +- PD owns canonical Region→Store mapping and Move directives. + +## Lifecycle + +- **Bootstrap**: PD returns initial `regions` → Store creates Peers and persists meta. +- **Split**: Region exceeds threshold → Split command commits → two Region metas persisted → new Peer created. +- **Move**: PD issues `MoveRegion` → leader adds replica on target store (ConfChange Add) → replica catches up → old replica can be removed via ConfChange Remove. + +## Constraints + +- Region key ranges must be non-overlapping and sorted. +- Raft storage/logs are prefixed by `region_id` to avoid cross-region collisions. +- Quorum required for writes; ConfChange operations must preserve quorum at each step. 
diff --git a/flaredb/specs/004-multi-raft/plan.md b/flaredb/specs/004-multi-raft/plan.md new file mode 100644 index 0000000..e4e4c80 --- /dev/null +++ b/flaredb/specs/004-multi-raft/plan.md @@ -0,0 +1,62 @@ +# Implementation Plan: Multi-Raft (Static → Split → Move) + +**Branch**: `004-multi-raft` | **Date**: 2024-XX-XX | **Spec**: specs/004-multi-raft/spec.md +**Input**: Feature specification from `/specs/004-multi-raft/spec.md` + +## Summary +- Goal: Rust/Tonic/RocksDBベースのRaft実装をMulti-Raftへ拡張し、PD配布メタに従う静的複数Region起動、閾値Split、ConfChangeによるRegion移動までを扱う。 +- Approach: StoreコンテナでRegionID→Peerを管理、Raft/KVのルータをRegion対応にリファクタ。Splitは閾値検知→Splitコマンド合意→メタ更新→新Peer登録。MoveはPD指示に基づきConfChange(追加→キャッチアップ→削除)。 + +## Technical Context +- **Language/Version**: Rust stable (toolchain per repo) +- **Primary Dependencies**: tonic/prost (gRPC), raft-rs, RocksDB, tokio +- **Storage**: RocksDB(CF/キーにRegionIDプレフィックスで分離) +- **Testing**: cargo test(unit/integration)、Raft/KV多Regionのシナリオテスト +- **Target Platform**: Linux server (Nix flake環境) +- **Project Type**: backend/server (single workspace) +- **Performance Goals**: リーダー選出≤60s、Split適用≤60s、移動完了≤5分(成功率99%以上) +- **Constraints**: 憲法に従いテスト必須・gRPCエラーは構造化ログ・互換性影響を明示 +- **Scale/Scope**: Region数: 最低複数同時稼働、将来数千を想定(バッチ最適化は後フェーズ) + +## Constitution Check +- Test-First: 新機能ごとにユニット/インテグレーションテストを先行作成。 +- Reliability & Coverage: `cargo test` 必須、複数Region・Split・ConfChangeの経路にテストを追加。 +- Simplicity: まず静的Multi-Raft→Split→Moveを段階実装。バッチ化などは後続。 +- Observability: Raft/KV/PD連携で失敗時に理由をログ。 +- Versioning: Raft/PD RPC変更は契約として明示。 +→ 憲法違反なしで進行可能。 + +## Project Structure + +### Documentation (this feature) +```text +specs/004-multi-raft/ +├── plan.md # This file +├── research.md # Phase 0 +├── data-model.md # Phase 1 +├── quickstart.md # Phase 1 +├── contracts/ # Phase 1 +└── tasks.md # Phase 2 (via /speckit.tasks) +``` + +### Source Code (repository root) +```text +rdb-server/src/ +├── main.rs # entry +├── store.rs # (new) Store/Region registry & dispatch +├── peer.rs # 
Raft Peer (per Region) +├── peer_manager.rs # Raft message clients +├── raft_service.rs # gRPC service (region-aware dispatch) +├── service.rs # KV service (region routing) +├── raft_storage.rs # Raft storage (Region-prefixed keys) +├── merkle.rs # (existing) sync helpers +└── config/… # namespace/mode config + +rdb-proto/src/ # proto definitions +tests/ # integration (multi-region, split, move) +``` + +**Structure Decision**: 単一バックエンド構成。Store/PeerにRegion対応を追加し、既存rdb-server配下にstore.rs等を拡張する。 + +## Complexity Tracking +- 現時点で憲法違反なしのため記載不要。 diff --git a/flaredb/specs/004-multi-raft/quickstart.md b/flaredb/specs/004-multi-raft/quickstart.md new file mode 100644 index 0000000..b7ac595 --- /dev/null +++ b/flaredb/specs/004-multi-raft/quickstart.md @@ -0,0 +1,44 @@ +# Quickstart: Multi-Raft (Static → Split → Move) + +## Prerequisites +- Nix or Rust toolchain per repo. +- PD stub runs inline (tests use in-memory). + +## Run tests (recommended) +```bash +nix develop -c cargo test -q -p rdb-server --test test_multi_region +nix develop -c cargo test -q -p rdb-server --test test_split +nix develop -c cargo test -q -p rdb-server --test test_confchange +``` +Or full suite: +```bash +nix develop -c cargo test -q +``` + +## Manual smoke (single node, two regions) +1. Launch PD stub (or ensure `pdpb` gRPC reachable). +2. Start server: + ```bash + nix develop -c cargo run -p rdb-server -- --pd-endpoint http://127.0.0.1:50051 + ``` +3. Verify routing: + - Put key `b"a"` → Region1 + - Put key `b"z"` → Region2 + +## Trigger split (dev) +1. Run `test_split` or fill a region with writes. +2. Observe log: `ApplyCommand::Split` and new region registered. + +## Move (rebalance) flow (simplified) +1. Source store handles region; target store starts with PD meta. +2. PD issues `MoveRegion(region_id, from=src, to=dst)`. +3. Source adds replica on target (ConfChange Add); target catches up; source can later remove itself (ConfChange Remove). +4. 
Verify data on target: + ```bash + nix develop -c cargo test -q move_region_replica_carries_data -- --nocapture + ``` + +## Notes +- Key ranges must not overlap; nodes validate PD meta. +- Raft logs and hard-state are prefixed by `region_id` to isolate shards. +- Pending eventual writes are forwarded to leaders; local queue persists to disk to survive restart. diff --git a/flaredb/specs/004-multi-raft/spec.md b/flaredb/specs/004-multi-raft/spec.md new file mode 100644 index 0000000..1ea2c09 --- /dev/null +++ b/flaredb/specs/004-multi-raft/spec.md @@ -0,0 +1,208 @@ +# Feature Specification: [FEATURE NAME] + +**Feature Branch**: `[###-feature-name]` +**Created**: [DATE] +**Status**: Draft +**Input**: User description: "$ARGUMENTS" + +## User Scenarios & Testing *(mandatory)* + + + +### User Story 1 - [Brief Title] (Priority: P1) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently - e.g., "Can be fully tested by [specific action] and delivers [specific value]"] + +**Acceptance Scenarios**: + +1. **Given** [initial state], **When** [action], **Then** [expected outcome] +2. **Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +### User Story 2 - [Brief Title] (Priority: P2) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently] + +**Acceptance Scenarios**: + +1. **Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +### User Story 3 - [Brief Title] (Priority: P3) + +[Describe this user journey in plain language] + +**Why this priority**: [Explain the value and why it has this priority level] + +**Independent Test**: [Describe how this can be tested independently] + +**Acceptance Scenarios**: + +1. 
**Given** [initial state], **When** [action], **Then** [expected outcome] + +--- + +[Add more user stories as needed, each with an assigned priority] + +### Edge Cases + + + +- What happens when [boundary condition]? +- How does system handle [error scenario]? + +## Requirements *(mandatory)* + + + +### Functional Requirements + +- **FR-001**: System MUST [specific capability, e.g., "allow users to create accounts"] +- **FR-002**: System MUST [specific capability, e.g., "validate email addresses"] +- **FR-003**: Users MUST be able to [key interaction, e.g., "reset their password"] +- **FR-004**: System MUST [data requirement, e.g., "persist user preferences"] +- **FR-005**: System MUST [behavior, e.g., "log all security events"] + +*Example of marking unclear requirements:* + +- **FR-006**: System MUST authenticate users via [NEEDS CLARIFICATION: auth method not specified - email/password, SSO, OAuth?] +- **FR-007**: System MUST retain user data for [NEEDS CLARIFICATION: retention period not specified] + +### Key Entities *(include if feature involves data)* + +- **[Entity 1]**: [What it represents, key attributes without implementation] +- **[Entity 2]**: [What it represents, relationships to other entities] + +## Success Criteria *(mandatory)* + + + +### Measurable Outcomes + +- **SC-001**: [Measurable metric, e.g., "Users can complete account creation in under 2 minutes"] +- **SC-002**: [Measurable metric, e.g., "System handles 1000 concurrent users without degradation"] +- **SC-003**: [User satisfaction metric, e.g., "90% of users successfully complete primary task on first attempt"] +- **SC-004**: [Business metric, e.g., "Reduce support tickets related to [X] by 50%"] +# Feature Specification: Multi-Raft (Static → Split → Move) + +**Feature Branch**: `004-multi-raft` +**Created**: 2024-XX-XX +**Status**: Draft +**Input**: User description: "Phase 3くらいまでやる前提でお願いします。" + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - PD主導の複数Region起動 (Priority: 
P1) + +運用者として、起動時に外部設定を不要とし、PDが配布する初期Regionメタデータに従って複数Regionを自動起動させたい(各Regionが独立にリーダー選出・書き込みを行う)。 + +**Why this priority**: Multi-Raftの基盤となるため最重要。これがないと以降のSplitやMoveが成立しない。 +**Independent Test**: PDが返す初期Regionセット(例: 2Region)で起動し、両Regionでリーダー選出が成功し、別々のキー範囲に書き込み・読み出しできることを確認するE2Eテスト。 + +**Acceptance Scenarios**: + +1. **Given** PDが初期Regionメタ(例: Region1 `[start="", end="m")`, Region2 `[start="m", end=""]`)を返す **When** ノードを起動する **Then** 両Regionでリーダーが選出され、互いに干渉せずに書き込みできる。 +2. **Given** RaftService が region_id 付きメッセージを受信 **When** region_id に対応するPeerが存在する **Then** 正しいPeerに配送され、未登録ならエラーを返す。 + +--- + +### User Story 2 - Region Split のオンライン適用 (Priority: P1) + +運用者として、Regionサイズが閾値を超えたときに、ダウンタイムなしでSplitが実行され、新しいRegionが自動生成・登録されてほしい。 + +**Why this priority**: データ増加に伴うスケールアウトを可能にするため。 +**Independent Test**: 1 Region に大量書き込みを行い、閾値到達で Split が合意・適用され、2 Region に分割後も新旧両Regionで読み書きできることを確認。 + +**Acceptance Scenarios**: + +1. **Given** Region サイズが閾値(例: 96MB相当)に達した **When** リーダーが Split コマンドを提案・合意する **Then** 新Region が作成され、元Regionの EndKey が縮小される。 +2. **Given** Split 適用直後 **When** 分割後キー範囲に対し書き込みを行う **Then** それぞれの新旧Regionが正しく処理し、一貫性が崩れない。 + +--- + +### User Story 3 - Region 移動による負荷分散 (Priority: P2) + +運用者として、混雑しているStoreから空いているStoreへRegionを移動(レプリカ追加・除去)し、ディスク/CPU負荷を均衡化したい。 + +**Why this priority**: Phase 3でのリバランスを可能にし、スケールアウトの価値を引き出すため。 +**Independent Test**: PDが「Region X を Store A→B へ移動」指示を出し、ConfChangeでレプリカ追加→キャッチアップ→旧レプリカ除去が完了することを確認。 + +**Acceptance Scenarios**: + +1. **Given** PD が Store B へのレプリカ追加を指示 **When** リーダーが ConfChange を提案 **Then** 新レプリカが追加され、キャッチアップ後に投票権が付与される。 +2. 
**Given** 新レプリカがキャッチアップ **When** 旧レプリカを除去する ConfChange を適用 **Then** Region は新しい構成で継続し、クォーラムが維持される。 + +--- + +### Edge Cases + +- 未登録の region_id を含む Raft メッセージを受信した場合は安全に拒否し、ログに記録する。 +- Split 中にリーダーが交代した場合、二重Splitを防ぎ、コミット済みのSplitのみを適用する。 +- Region 移動中にネットワーク分断が発生した場合、クォーラム不足時は書き込みを拒否し、再結合後に再同期する。 +- PDが返す初期Regionメタにキー範囲の重複があった場合、起動時に検出してフェイルする。 + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: システムは PD が配布する初期Regionメタに基づき複数Regionを起動し、RegionID→Peerを Store で管理できなければならない。 +- **FR-002**: RaftService は受信メッセージの region_id に基づき適切な Peer に配送し、未登録Regionはエラーを返さなければならない。 +- **FR-003**: KvService は Key から Region を判定し、対応する Peer に提案して処理しなければならない。 +- **FR-004**: Raftログおよびハードステートは RegionID で名前空間分離され、異なる Region 間で衝突しないようにしなければならない。 +- **FR-005**: Region サイズが閾値を超えた場合、リーダーは Split コマンドを提案し、合意後に新Regionを Store に登録しなければならない。 +- **FR-006**: Split 適用時は元Regionのメタデータ (Start/EndKey) を更新し、新Regionのメタデータを生成する操作がアトミックでなければならない。 +- **FR-007**: Region の移動(レプリカ追加・除去)は Raft の ConfChange を用いて実施し、クォーラムを維持しながら完了しなければならない。 +- **FR-008**: PD は Region 配置のメタを保持し、移動/追加/除去の指示を発行し、ノードはそれを反映できなければならない。 +- **FR-009**: Region の状態 (リーダー/レプリカ/サイズ/キー範囲) は PD へハートビートで報告されなければならない。 + +### Key Entities *(include if feature involves data)* + +- **Store**: 物理ノード。RegionID→Peerの管理、Raftメッセージディスパッチ、PDへのハートビートを担う。 +- **Region**: キー範囲を持つ論理シャード。StartKey, EndKey, サイズ情報。 +- **Peer**: RegionごとのRaftレプリカ。リーダー選出・ログ複製を担当。 +- **Placement Metadata (PD)**: Region配置・サイズ・リーダー情報・バランス方針を保持するメタデータ。 + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: 2つ以上のRegionを起動した場合、各Regionでリーダー選出が60秒以内に完了する。 +- **SC-002**: Regionごとの書き込みが他Regionに混入せず、キー範囲外アクセスは100%拒否される。 +- **SC-003**: Split トリガー後、60秒以内に新Regionが登録され、分割後も書き込み成功率が99%以上を維持する。 +- **SC-004**: Region 移動(レプリカ追加→キャッチアップ→除去)が 5 分以内に完了し、移動中の書き込み成功率が99%以上を維持する。 + +## Clarifications + +### Session 2025-01-05 + +- Q: PDへの報告間隔と内容は? 
→ A: 30秒ごとにRegion一覧+approx_size+リーダー/ピア+ヘルスをPDへ報告 diff --git a/flaredb/specs/004-multi-raft/tasks.md b/flaredb/specs/004-multi-raft/tasks.md new file mode 100644 index 0000000..97bf644 --- /dev/null +++ b/flaredb/specs/004-multi-raft/tasks.md @@ -0,0 +1,125 @@ +--- +description: "Task list for Multi-Raft (Static -> Split -> Move)" +--- + +# Tasks: Multi-Raft (Static -> Split -> Move) + +**Input**: Design documents from `/specs/004-multi-raft/` +**Prerequisites**: plan.md (required), spec.md (user stories), research.md, data-model.md, contracts/ + +**Tests**: Required per constitution; include unit/integration tests for multi-region routing, split, confchange/move. + +**Organization**: Tasks are grouped by user story to enable independent implementation and testing. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (e.g., US1, US2, US3) +- Include exact file paths in descriptions + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Prepare store/container and region-aware routing foundations. + +- [X] T001 Add Store container skeleton managing RegionID->Peer map in `rdb-server/src/store.rs` +- [X] T002 Wire RaftService to dispatch by region_id via Store in `rdb-server/src/raft_service.rs` +- [X] T003 Add region-aware KV routing (Key->Region) stub in `rdb-server/src/service.rs` +- [X] T004 Region-prefixed Raft storage keys to isolate logs/hs/conf in `rdb-server/src/raft_storage.rs` +- [X] T005 Update main startup to init Store from PD initial region meta in `rdb-server/src/main.rs` + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: PD integration and routing validation. 
+ +- [X] T006 Add PD client call to fetch initial region metadata in `rdb-proto/src/pdpb.proto` and `rdb-server/src/main.rs` +- [X] T007 Add routing cache (Region range map) with PD heartbeat refresh in `rdb-server/src/service.rs` +- [X] T008 Add multi-region Raft message dispatch tests in `rdb-server/tests/test_multi_region.rs` +- [X] T009 Add KV routing tests for disjoint regions in `rdb-server/tests/test_multi_region.rs` + +**Checkpoint**: Multiple regions can start, elect leaders, and route KV without interference. + +--- + +## Phase 3: User Story 1 - PD主導の複数Region起動 (Priority: P1) + +**Goal**: Auto-start multiple regions from PD meta; independent read/write per region. + +### Tests +- [X] T010 [US1] Integration test: startup with PD returning 2 regions; both elect leaders and accept writes in `rdb-server/tests/test_multi_region.rs` + +### Implementation +- [X] T011 [US1] Store registers peers per PD region meta; validation for overlapping ranges in `rdb-server/src/store.rs` +- [X] T012 [US1] KV service uses region router from PD meta to propose to correct peer in `rdb-server/src/service.rs` +- [X] T013 [US1] Structured errors for unknown region/key-range in `rdb-server/src/service.rs` + +**Checkpoint**: Two+ regions operate independently with PD-provided meta. + +--- + +## Phase 4: User Story 2 - Region Split (Priority: P1) + +**Goal**: Detect size threshold and split online into two regions. 
+ +### Tests +- [X] T014 [US2] Split trigger test (approx size over threshold) in `rdb-server/tests/test_split.rs` +- [X] T015 [US2] Post-split routing test: keys before/after split_key go to correct regions in `rdb-server/tests/test_split.rs` + +### Implementation +- [X] T016 [US2] Approximate size measurement and threshold check in `rdb-server/src/store.rs` +- [X] T017 [US2] Define/apply Split raft command; update region meta atomically in `rdb-server/src/peer.rs` +- [X] T018 [US2] Create/register new peer for split region and update routing map in `rdb-server/src/store.rs` +- [X] T019 [US2] Persist updated region metadata (start/end keys) in `rdb-server/src/store.rs` + +**Checkpoint**: Region splits online; post-split read/write succeeds in both regions. + +--- + +## Phase 5: User Story 3 - Region Move (Priority: P2) + +**Goal**: Rebalance region replicas via ConfChange (add → catch-up → remove). + +### Tests +- [X] T020 [US3] ConfChange add/remove replica test across two stores in `rdb-server/tests/test_confchange.rs` +- [X] T021 [US3] Move scenario: PD directs move, data reachable after move in `rdb-server/tests/test_confchange.rs` + +### Implementation +- [X] T022 [US3] Implement ConfChange apply for add/remove node per region in `rdb-server/src/peer.rs` +- [X] T023 [US3] PD heartbeat reporting region list/size and apply PD move directives in `rdb-server/src/store.rs` +- [X] T024 [US3] Snapshot/fast catch-up path for new replica join in `rdb-server/src/peer.rs` + +**Checkpoint**: Region can move between stores without data loss; quorum maintained. + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Hardening, docs, and verification. 
+ +- [X] T025 Update contracts for PD/Region RPCs in `specs/004-multi-raft/contracts/` +- [X] T026 Update data-model for Region/Store/PlacementMeta in `specs/004-multi-raft/data-model.md` +- [X] T027 Quickstart covering multi-region start, split, move flows in `specs/004-multi-raft/quickstart.md` +- [X] T028 Verification script to run multi-region/split/move tests in `scripts/verify-multiraft.sh` +- [ ] T029 [P] Cleanup warnings, run `cargo fmt`, `cargo test -p rdb-server --tests` across workspace + +--- + +## Dependencies & Execution Order + +- Phase 1 → Phase 2 → US1 → US2 → US3 → Polish +- Split (US2) depends on routing in US1; Move (US3) depends on ConfChange plumbing. + +## Parallel Examples + +- T008 and T009 can run in parallel after T002/T003/T004 (multi-region dispatch + routing tests). +- T014 and T015 can run in parallel after routing map is in place (post-split tests). +- T020 and T021 can run in parallel once ConfChange scaffolding exists. + +## Implementation Strategy + +1) Lay Store/routing foundations (Phase 1–2). +2) Deliver US1 (PD-driven multi-region start). +3) Add Split path (US2). +4) Add ConfChange/move path (US3). +5) Polish docs/contracts/verify script. 
diff --git a/iam b/iam deleted file mode 160000 index 6267375..0000000 --- a/iam +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 626737592eac3968dd7c0a2dd7ec770f01dbeb68 diff --git a/iam/Cargo.toml b/iam/Cargo.toml new file mode 100644 index 0000000..197f353 --- /dev/null +++ b/iam/Cargo.toml @@ -0,0 +1,97 @@ +[workspace] +resolver = "2" +members = [ + "crates/iam-types", + "crates/iam-store", + "crates/iam-authn", + "crates/iam-authz", + "crates/iam-audit", + "crates/iam-api", + "crates/iam-server", + "crates/iam-client", +] + +[workspace.package] +version = "0.1.0" +edition = "2021" +license = "MIT OR Apache-2.0" +rust-version = "1.75" +authors = ["Cloud Platform Contributors"] +repository = "https://github.com/cloud/iam" + +[workspace.dependencies] +# Internal crates +iam-types = { path = "crates/iam-types" } +iam-store = { path = "crates/iam-store" } +iam-authn = { path = "crates/iam-authn" } +iam-authz = { path = "crates/iam-authz" } +iam-audit = { path = "crates/iam-audit" } +iam-api = { path = "crates/iam-api" } +iam-server = { path = "crates/iam-server" } +iam-client = { path = "crates/iam-client" } + +# External SDK dependencies +chainfire-client = { path = "../chainfire/chainfire-client" } +flaredb-client = { path = "../flaredb/crates/flaredb-client" } + +# Async runtime +tokio = { version = "1.40", features = ["full"] } +tokio-stream = "0.1" +futures = "0.3" +async-trait = "0.1" + +# gRPC +tonic = { version = "0.12", features = ["tls", "tls-roots"] } +tonic-build = "0.12" +tonic-health = "0.12" +prost = "0.13" +prost-types = "0.13" + +# Serialization +serde = { version = "=1.0.219", features = ["derive"] } +serde_json = "=1.0.140" + +# JWT/OIDC +jsonwebtoken = "9" +reqwest = { version = "0.12", features = ["json", "rustls-tls"], default-features = false } + +# Crypto +rand = "0.8" +base64 = "0.22" +sha2 = "0.10" +hmac = "0.12" + +# Utilities +thiserror = "1.0" +anyhow = "1.0" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = 
["env-filter"] } +bytes = "1.5" + +# Metrics +metrics = "0.23" +metrics-exporter-prometheus = "0.15" +dashmap = "6" +parking_lot = "0.12" +uuid = { version = "1.8", features = ["v4", "serde"] } +ipnetwork = "0.20" +glob-match = "0.2" +chrono = { version = "0.4", features = ["serde"] } + +# Configuration +toml = "0.8" +clap = { version = "4", features = ["derive"] } + +# Testing +tempfile = "3.10" + +[workspace.lints.rust] +unsafe_code = "deny" + +[workspace.lints.clippy] +all = "warn" + +[profile.release] +lto = true +opt-level = 3 +codegen-units = 1 diff --git a/iam/config/iam.example.toml b/iam/config/iam.example.toml new file mode 100644 index 0000000..0023f5d --- /dev/null +++ b/iam/config/iam.example.toml @@ -0,0 +1,48 @@ +# IAM Server Configuration Example +# +# Copy this file to iam.toml and adjust the settings. + +[server] +# Listen address +addr = "0.0.0.0:50051" + +# TLS configuration (optional) +# [server.tls] +# cert_file = "/etc/iam/certs/server.crt" +# key_file = "/etc/iam/certs/server.key" +# ca_file = "/etc/iam/certs/ca.crt" +# require_client_cert = false + +[store] +# Backend type: "memory", "chainfire", or "flaredb" +backend = "memory" + +# Chainfire backend configuration +# chainfire_endpoints = ["localhost:2379", "localhost:2380"] + +# FlareDB backend configuration +# flaredb_endpoint = "localhost:5432" +# flaredb_namespace = "iam" + +[authn] +# JWT/OIDC configuration (optional) +# [authn.jwt] +# jwks_url = "https://auth.example.com/.well-known/jwks.json" +# issuer = "https://auth.example.com" +# audience = "iam" +# jwks_cache_ttl_seconds = 3600 + +# Internal token configuration +[authn.internal_token] +# Signing key for internal tokens +# IMPORTANT: Change this to a secure random value in production! 
+signing_key = "change-this-to-a-secure-random-value-in-production" +issuer = "iam" +default_ttl_seconds = 3600 # 1 hour +max_ttl_seconds = 604800 # 7 days + +[logging] +# Log level: "trace", "debug", "info", "warn", "error" +level = "info" +# Log format: "text" or "json" +format = "text" diff --git a/iam/crates/iam-api/Cargo.toml b/iam/crates/iam-api/Cargo.toml new file mode 100644 index 0000000..6c16dbc --- /dev/null +++ b/iam/crates/iam-api/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "iam-api" +version.workspace = true +edition.workspace = true +description = "gRPC API for IAM" +license.workspace = true + +[dependencies] +iam-types = { path = "../iam-types" } +iam-store = { path = "../iam-store" } +iam-authn = { path = "../iam-authn" } +iam-authz = { path = "../iam-authz" } +iam-audit = { path = "../iam-audit" } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +async-trait = { workspace = true } +tonic = { workspace = true } +prost = { workspace = true } +base64 = { workspace = true } +sha2 = { workspace = true } +uuid = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["full", "test-util"] } + +[build-dependencies] +tonic-build = { workspace = true } diff --git a/iam/crates/iam-api/build.rs b/iam/crates/iam-api/build.rs new file mode 100644 index 0000000..c67fcc6 --- /dev/null +++ b/iam/crates/iam-api/build.rs @@ -0,0 +1,11 @@ +fn main() -> Result<(), Box> { + let proto = "../../proto/iam.proto"; + + tonic_build::configure() + .build_server(true) + .build_client(true) + .compile(&[proto], &["../../proto"])?; + + println!("cargo:rerun-if-changed=../../proto/iam.proto"); + Ok(()) +} diff --git a/iam/crates/iam-api/src/conversions.rs b/iam/crates/iam-api/src/conversions.rs new file mode 100644 index 0000000..b9bd18a --- /dev/null +++ b/iam/crates/iam-api/src/conversions.rs @@ -0,0 +1,412 @@ +//! 
Proto <-> types conversions +//! +//! Converts between protobuf types and IAM domain types. + +use iam_types::{ + Condition as TypesCondition, ConditionExpr as TypesConditionExpr, + Permission as TypesPermission, PolicyBinding as TypesBinding, Principal as TypesPrincipal, + PrincipalKind as TypesPrincipalKind, PrincipalRef as TypesPrincipalRef, Role as TypesRole, + Scope as TypesScope, +}; + +use crate::proto::{ + self, condition_expr, scope, Condition, ConditionExpr, Permission, PolicyBinding, Principal, + PrincipalKind, PrincipalRef, Role, Scope, +}; + +// ============================================================================ +// PrincipalKind conversions +// ============================================================================ + +/// Convert types PrincipalKind to proto enum value +pub fn principal_kind_to_proto(kind: &TypesPrincipalKind) -> i32 { + match kind { + TypesPrincipalKind::User => PrincipalKind::User as i32, + TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32, + TypesPrincipalKind::Group => PrincipalKind::Group as i32, + } +} + +/// Convert proto enum value to types PrincipalKind +pub fn proto_to_principal_kind(value: i32) -> Result { + match PrincipalKind::try_from(value) { + Ok(PrincipalKind::User) => Ok(TypesPrincipalKind::User), + Ok(PrincipalKind::ServiceAccount) => Ok(TypesPrincipalKind::ServiceAccount), + Ok(PrincipalKind::Group) => Ok(TypesPrincipalKind::Group), + _ => Err("invalid principal kind"), + } +} + +// ============================================================================ +// PrincipalRef conversions +// ============================================================================ + +impl From for PrincipalRef { + fn from(r: TypesPrincipalRef) -> Self { + PrincipalRef { + kind: principal_kind_to_proto(&r.kind), + id: r.id, + } + } +} + +/// Convert proto PrincipalRef to types PrincipalRef +pub fn proto_to_principal_ref(r: &PrincipalRef) -> Result { + let kind = 
proto_to_principal_kind(r.kind)?; + Ok(TypesPrincipalRef::new(kind, &r.id)) +} + +// ============================================================================ +// Principal conversions +// ============================================================================ + +impl From for Principal { + fn from(p: TypesPrincipal) -> Self { + Principal { + id: p.id, + kind: principal_kind_to_proto(&p.kind), + name: p.name, + org_id: p.org_id, + project_id: p.project_id, + email: p.email, + oidc_sub: p.oidc_sub, + node_id: p.node_id, + metadata: p.metadata, + created_at: p.created_at, + updated_at: p.updated_at, + enabled: p.enabled, + } + } +} + +impl From for TypesPrincipal { + fn from(p: Principal) -> Self { + TypesPrincipal { + id: p.id, + kind: proto_to_principal_kind(p.kind).unwrap_or(TypesPrincipalKind::User), + name: p.name, + org_id: p.org_id, + project_id: p.project_id, + email: p.email, + oidc_sub: p.oidc_sub, + node_id: p.node_id, + metadata: p.metadata, + created_at: p.created_at, + updated_at: p.updated_at, + enabled: p.enabled, + } + } +} + +// ============================================================================ +// Scope conversions +// ============================================================================ + +impl From for Scope { + fn from(s: TypesScope) -> Self { + Scope { + scope: Some(match s { + TypesScope::System => scope::Scope::System(true), + TypesScope::Org { id } => scope::Scope::Org(proto::OrgScope { id }), + TypesScope::Project { id, org_id } => { + scope::Scope::Project(proto::ProjectScope { id, org_id }) + } + TypesScope::Resource { + id, + project_id, + org_id, + } => scope::Scope::Resource(proto::ResourceScope { + id, + project_id, + org_id, + }), + }), + } + } +} + +impl From for TypesScope { + fn from(s: Scope) -> Self { + match s.scope { + Some(scope::Scope::System(true)) => TypesScope::System, + Some(scope::Scope::Org(org)) => TypesScope::org(org.id), + Some(scope::Scope::Project(proj)) => TypesScope::project(proj.id, 
proj.org_id), + Some(scope::Scope::Resource(res)) => { + TypesScope::resource(res.id, res.project_id, res.org_id) + } + _ => TypesScope::System, + } + } +} + +// ============================================================================ +// Permission conversions +// ============================================================================ + +impl From for Permission { + fn from(p: TypesPermission) -> Self { + Permission { + action: p.action, + resource_pattern: p.resource_pattern, + condition: p.condition.map(|c| c.into()), + } + } +} + +impl From for TypesPermission { + fn from(p: Permission) -> Self { + TypesPermission { + action: p.action, + resource_pattern: p.resource_pattern, + condition: p.condition.map(|c| c.into()), + } + } +} + +// ============================================================================ +// Role conversions +// ============================================================================ + +impl From for Role { + fn from(r: TypesRole) -> Self { + Role { + name: r.name, + display_name: r.display_name, + description: r.description, + scope: Some(r.scope.into()), + permissions: r.permissions.into_iter().map(|p| p.into()).collect(), + builtin: r.builtin, + created_at: r.created_at, + updated_at: r.updated_at, + } + } +} + +impl From for TypesRole { + fn from(r: Role) -> Self { + TypesRole { + name: r.name, + display_name: r.display_name, + description: r.description, + scope: r + .scope + .unwrap_or_else(|| Scope { + scope: Some(scope::Scope::System(true)), + }) + .into(), + permissions: r.permissions.into_iter().map(Into::into).collect(), + builtin: r.builtin, + created_at: r.created_at, + updated_at: r.updated_at, + } + } +} + +// ============================================================================ +// PolicyBinding conversions +// ============================================================================ + +impl From for PolicyBinding { + fn from(b: TypesBinding) -> Self { + PolicyBinding { + id: b.id, + principal: 
Some(b.principal_ref.into()), + role: b.role_ref, + scope: Some(b.scope.into()), + condition: b.condition.map(|c| c.into()), + created_at: b.created_at, + updated_at: b.updated_at, + created_by: b.created_by, + expires_at: b.expires_at, + enabled: b.enabled, + } + } +} + +impl From for TypesBinding { + fn from(b: PolicyBinding) -> Self { + TypesBinding { + id: b.id, + principal_ref: proto_to_principal_ref(&b.principal.unwrap_or_else(|| PrincipalRef { + kind: PrincipalKind::User as i32, + id: String::new(), + })) + .unwrap_or_else(|_| TypesPrincipalRef::user("")), + role_ref: b.role, + scope: b.scope.unwrap_or_else(|| Scope { + scope: Some(scope::Scope::System(true)), + }).into(), + condition: b.condition.map(|c| c.into()), + created_at: b.created_at, + updated_at: b.updated_at, + created_by: b.created_by, + expires_at: b.expires_at, + enabled: b.enabled, + } + } +} + +// ============================================================================ +// Condition conversions +// ============================================================================ + +impl From for Condition { + fn from(c: TypesCondition) -> Self { + Condition { + expression: Some(c.expression.into()), + } + } +} + +impl From for TypesCondition { + fn from(c: Condition) -> Self { + TypesCondition { + expression: c + .expression + .map(|e| e.into()) + .unwrap_or(TypesConditionExpr::And(vec![])), + } + } +} + +impl From for ConditionExpr { + fn from(e: TypesConditionExpr) -> Self { + let expr = match e { + TypesConditionExpr::StringEquals { key, value } => { + condition_expr::Expr::StringEquals(proto::StringEqualsExpr { key, value }) + } + TypesConditionExpr::StringNotEquals { key, value } => { + condition_expr::Expr::StringNotEquals(proto::StringNotEqualsExpr { key, value }) + } + TypesConditionExpr::StringLike { key, pattern } => { + condition_expr::Expr::StringLike(proto::StringLikeExpr { key, pattern }) + } + TypesConditionExpr::StringNotLike { key, pattern } => { + 
condition_expr::Expr::StringNotLike(proto::StringNotLikeExpr { key, pattern }) + } + TypesConditionExpr::NumericEquals { key, value } => { + condition_expr::Expr::NumericEquals(proto::NumericEqualsExpr { key, value }) + } + TypesConditionExpr::NumericLessThan { key, value } => { + condition_expr::Expr::NumericLessThan(proto::NumericLessThanExpr { key, value }) + } + TypesConditionExpr::NumericLessThanEquals { key, value } => { + condition_expr::Expr::NumericLessThan(proto::NumericLessThanExpr { key, value }) + } + TypesConditionExpr::NumericGreaterThan { key, value } => { + condition_expr::Expr::NumericGreaterThan(proto::NumericGreaterThanExpr { + key, + value, + }) + } + TypesConditionExpr::NumericGreaterThanEquals { key, value } => { + condition_expr::Expr::NumericGreaterThan(proto::NumericGreaterThanExpr { + key, + value, + }) + } + TypesConditionExpr::IpAddress { key, cidr } => { + condition_expr::Expr::IpAddress(proto::IpAddressExpr { key, cidr }) + } + TypesConditionExpr::NotIpAddress { key, cidr } => { + condition_expr::Expr::NotIpAddress(proto::NotIpAddressExpr { key, cidr }) + } + TypesConditionExpr::TimeBetween { start, end } => { + condition_expr::Expr::TimeBetween(proto::TimeBetweenExpr { start, end }) + } + TypesConditionExpr::Exists { key } => { + condition_expr::Expr::Exists(proto::ExistsExpr { key }) + } + TypesConditionExpr::StringEqualsAny { key, values } => { + condition_expr::Expr::StringEqualsAny(proto::StringEqualsAnyExpr { key, values }) + } + TypesConditionExpr::Bool { key, value } => { + condition_expr::Expr::BoolExpr(proto::BoolExpr { key, value }) + } + TypesConditionExpr::And(exprs) => condition_expr::Expr::AndExpr(proto::AndExpr { + expressions: exprs.into_iter().map(|e| e.into()).collect(), + }), + TypesConditionExpr::Or(exprs) => condition_expr::Expr::OrExpr(proto::OrExpr { + expressions: exprs.into_iter().map(|e| e.into()).collect(), + }), + TypesConditionExpr::Not(expr) => { + condition_expr::Expr::NotExpr(Box::new(proto::NotExpr { 
+ expression: Some(Box::new((*expr).into())), + })) + } + }; + ConditionExpr { expr: Some(expr) } + } +} + +impl From for TypesConditionExpr { + fn from(e: ConditionExpr) -> Self { + match e.expr { + Some(condition_expr::Expr::StringEquals(e)) => TypesConditionExpr::StringEquals { + key: e.key, + value: e.value, + }, + Some(condition_expr::Expr::StringNotEquals(e)) => TypesConditionExpr::StringNotEquals { + key: e.key, + value: e.value, + }, + Some(condition_expr::Expr::StringLike(e)) => TypesConditionExpr::StringLike { + key: e.key, + pattern: e.pattern, + }, + Some(condition_expr::Expr::StringNotLike(e)) => TypesConditionExpr::StringNotLike { + key: e.key, + pattern: e.pattern, + }, + Some(condition_expr::Expr::NumericEquals(e)) => TypesConditionExpr::NumericEquals { + key: e.key, + value: e.value, + }, + Some(condition_expr::Expr::NumericLessThan(e)) => TypesConditionExpr::NumericLessThan { + key: e.key, + value: e.value, + }, + Some(condition_expr::Expr::NumericGreaterThan(e)) => { + TypesConditionExpr::NumericGreaterThan { + key: e.key, + value: e.value, + } + } + Some(condition_expr::Expr::IpAddress(e)) => TypesConditionExpr::IpAddress { + key: e.key, + cidr: e.cidr, + }, + Some(condition_expr::Expr::NotIpAddress(e)) => TypesConditionExpr::NotIpAddress { + key: e.key, + cidr: e.cidr, + }, + Some(condition_expr::Expr::TimeBetween(e)) => TypesConditionExpr::TimeBetween { + start: e.start, + end: e.end, + }, + Some(condition_expr::Expr::Exists(e)) => TypesConditionExpr::Exists { key: e.key }, + Some(condition_expr::Expr::StringEqualsAny(e)) => TypesConditionExpr::StringEqualsAny { + key: e.key, + values: e.values, + }, + Some(condition_expr::Expr::BoolExpr(e)) => TypesConditionExpr::Bool { + key: e.key, + value: e.value, + }, + Some(condition_expr::Expr::AndExpr(e)) => { + TypesConditionExpr::And(e.expressions.into_iter().map(|x| x.into()).collect()) + } + Some(condition_expr::Expr::OrExpr(e)) => { + TypesConditionExpr::Or(e.expressions.into_iter().map(|x| 
x.into()).collect()) + } + Some(condition_expr::Expr::NotExpr(e)) => { + let inner = e + .expression + .map(|x| (*x).into()) + .unwrap_or(TypesConditionExpr::And(vec![])); + TypesConditionExpr::Not(Box::new(inner)) + } + None => TypesConditionExpr::And(vec![]), + } + } +} diff --git a/iam/crates/iam-api/src/generated/mod.rs b/iam/crates/iam-api/src/generated/mod.rs new file mode 100644 index 0000000..5618b75 --- /dev/null +++ b/iam/crates/iam-api/src/generated/mod.rs @@ -0,0 +1,10 @@ +//! Generated proto types +//! +//! These types mirror the proto definitions but are manually implemented +//! since protoc is not available in the build environment. + +pub mod iam { + pub mod v1 { + tonic::include_proto!("iam.v1"); + } +} diff --git a/iam/crates/iam-api/src/iam_service.rs b/iam/crates/iam-api/src/iam_service.rs new file mode 100644 index 0000000..ca52e3c --- /dev/null +++ b/iam/crates/iam-api/src/iam_service.rs @@ -0,0 +1,918 @@ +//! IAM Authorization gRPC service implementation + +use std::net::IpAddr; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +use tonic::{Request, Response, Status}; + +use iam_audit::{AuditEvent, AuditLogger}; +use iam_authz::{AuthzContext, AuthzRequest as InternalAuthzRequest, PolicyEvaluator}; +use iam_store::{BindingStore, PrincipalStore, RoleStore}; +use iam_types::{ + Condition, Error as TypesError, IamError, PolicyBinding, Principal, + PrincipalKind as TypesPrincipalKind, PrincipalRef, Resource, Role, Scope, StorageError, +}; +use tracing::warn; +use uuid::Uuid; + +use crate::proto::{ + self, iam_admin_server::IamAdmin, iam_authz_server::IamAuthz, AuthorizeRequest, + AuthorizeResponse, BatchAuthorizeRequest, BatchAuthorizeResponse, CreateBindingRequest, + CreatePrincipalRequest, CreateRoleRequest, DeleteBindingRequest, DeleteBindingResponse, + DeletePrincipalRequest, DeletePrincipalResponse, DeleteRoleRequest, DeleteRoleResponse, + GetBindingRequest, GetPrincipalRequest, GetRoleRequest, ListBindingsRequest, + 
ListBindingsResponse, ListPrincipalsRequest, ListPrincipalsResponse, ListRolesRequest, + ListRolesResponse, PrincipalKind, UpdateBindingRequest, UpdatePrincipalRequest, + UpdateRoleRequest, +}; + +/// IAM Authorization service implementation +pub struct IamAuthzService { + evaluator: Arc, + principal_store: Arc, + audit_logger: Option>, +} + +impl IamAuthzService { + /// Create a new authorization service without audit logging + pub fn new(evaluator: Arc, principal_store: Arc) -> Self { + Self { + evaluator, + principal_store, + audit_logger: None, + } + } + + /// Create a new authorization service with audit logging + pub fn with_audit( + evaluator: Arc, + principal_store: Arc, + audit_logger: Arc, + ) -> Self { + Self { + evaluator, + principal_store, + audit_logger: Some(audit_logger), + } + } + + /// Log an audit event (non-blocking) + async fn log_event(&self, event: AuditEvent) { + if let Some(logger) = &self.audit_logger { + if let Err(e) = logger.log(event).await { + warn!("Failed to log audit event: {}", e); + } + } + } +} + +#[tonic::async_trait] +impl IamAuthz for IamAuthzService { + async fn authorize( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + // Get principal ref from request + let principal_ref = req + .principal + .as_ref() + .ok_or_else(|| Status::invalid_argument("principal is required"))?; + + let principal_kind = match PrincipalKind::try_from(principal_ref.kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }; + + let types_principal_ref = PrincipalRef::new(principal_kind.clone(), &principal_ref.id); + + // Get principal from store + let principal = self + .principal_store + .get(&types_principal_ref) + .await + .map_err(|e| Status::internal(format!("Failed to get principal: {}", e)))? 
+ .ok_or_else(|| Status::not_found("Principal not found"))?; + + // Build resource + let resource_ref = req + .resource + .as_ref() + .ok_or_else(|| Status::invalid_argument("resource is required"))?; + + let mut resource = Resource::new( + &resource_ref.kind, + &resource_ref.id, + &resource_ref.org_id, + &resource_ref.project_id, + ); + + if let Some(owner) = &resource_ref.owner_id { + resource = resource.with_owner(owner); + } + if let Some(node) = &resource_ref.node_id { + resource = resource.with_node(node); + } + if let Some(region) = &resource_ref.region { + resource = resource.with_region(region); + } + for (k, v) in &resource_ref.tags { + resource = resource.with_tag(k, v); + } + + // Build context + let mut context = AuthzContext::new(); + + if let Some(ctx) = &req.context { + if !ctx.source_ip.is_empty() { + if let Ok(ip) = ctx.source_ip.parse::() { + context = context.with_source_ip(ip); + } + } + if ctx.timestamp > 0 { + context = context.with_timestamp(ctx.timestamp); + } + if !ctx.http_method.is_empty() { + context = context.with_http_method(&ctx.http_method); + } + if !ctx.request_path.is_empty() { + context = context.with_request_path(&ctx.request_path); + } + for (k, v) in &ctx.metadata { + context = context.with_metadata(k, v); + } + } + + // Build internal request + let internal_req = + InternalAuthzRequest::new(principal, &req.action, resource).with_context(context); + + // Evaluate + let decision = self + .evaluator + .evaluate(&internal_req) + .await + .map_err(|e| Status::internal(format!("Authorization error: {}", e)))?; + + // Determine scope for audit logging + let audit_scope = Scope::project(&resource_ref.project_id, &resource_ref.org_id); + + let response = match decision { + iam_authz::AuthzDecision::Allow => { + // Log allowed event + let event = AuditEvent::authz_allowed( + &types_principal_ref.id, + &req.action, + &resource_ref.kind, + &resource_ref.id, + audit_scope.clone(), + ); + self.log_event(event).await; + + AuthorizeResponse 
{ + allowed: true, + reason: String::new(), + matched_binding: String::new(), // TODO: track in evaluator + matched_role: String::new(), + } + } + iam_authz::AuthzDecision::Deny { reason } => { + // Log denied event + let event = AuditEvent::authz_denied( + &types_principal_ref.id, + &req.action, + &resource_ref.kind, + &resource_ref.id, + audit_scope, + &reason, + ); + self.log_event(event).await; + + AuthorizeResponse { + allowed: false, + reason, + matched_binding: String::new(), + matched_role: String::new(), + } + } + }; + + Ok(Response::new(response)) + } + + async fn batch_authorize( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let mut responses = Vec::with_capacity(req.requests.len()); + + for auth_req in req.requests { + let result = self + .authorize(Request::new(auth_req)) + .await + .map(|r| r.into_inner()) + .unwrap_or_else(|e| AuthorizeResponse { + allowed: false, + reason: e.message().to_string(), + matched_binding: String::new(), + matched_role: String::new(), + }); + responses.push(result); + } + + Ok(Response::new(BatchAuthorizeResponse { responses })) + } +} + +/// IAM Admin service implementation +pub struct IamAdminService { + principal_store: Arc, + role_store: Arc, + binding_store: Arc, +} + +impl IamAdminService { + /// Create a new admin service + pub fn new( + principal_store: Arc, + role_store: Arc, + binding_store: Arc, + ) -> Self { + Self { + principal_store, + role_store, + binding_store, + } + } +} + +fn now_ts() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() +} + +fn map_error(err: TypesError) -> Status { + match err { + TypesError::Iam(IamError::PrincipalNotFound(msg)) + | TypesError::Iam(IamError::RoleNotFound(msg)) + | TypesError::Iam(IamError::BindingNotFound(msg)) => Status::not_found(msg), + TypesError::Iam(IamError::PrincipalAlreadyExists(msg)) + | TypesError::Iam(IamError::RoleAlreadyExists(msg)) + | 
TypesError::Iam(IamError::BindingAlreadyExists(msg)) => Status::already_exists(msg), + TypesError::Iam(IamError::CannotModifyBuiltinRole(msg)) => Status::failed_precondition(msg), + TypesError::Storage(StorageError::CasConflict { expected, actual }) => Status::aborted( + format!("CAS conflict (expected {}, actual {})", expected, actual), + ), + TypesError::Storage(StorageError::Connection(msg)) => Status::unavailable(msg), + TypesError::Storage(StorageError::Backend(msg)) => Status::unavailable(msg), + TypesError::Storage(StorageError::Timeout) => Status::deadline_exceeded("storage timeout"), + other => Status::internal(other.to_string()), + } +} + +#[tonic::async_trait] +impl IamAdmin for IamAdminService { + async fn create_principal( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let kind = match PrincipalKind::try_from(req.kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }; + + let mut principal = match kind { + TypesPrincipalKind::User => Principal::new_user(&req.id, &req.name), + TypesPrincipalKind::ServiceAccount => { + let project_id = req.project_id.clone().unwrap_or_default(); + Principal::new_service_account(&req.id, &req.name, project_id) + } + TypesPrincipalKind::Group => Principal::new_group(&req.id, &req.name), + }; + + principal.org_id = req.org_id.clone(); + principal.project_id = req.project_id.clone(); + principal.email = req.email.clone(); + principal.metadata = req.metadata.clone(); + let now = now_ts(); + principal.created_at = now; + principal.updated_at = now; + + self.principal_store + .create(&principal) + .await + .map_err(map_error)?; + + Ok(Response::new(proto::Principal::from(principal))) + } + + async fn get_principal( + &self, + request: Request, + ) -> Result, Status> { + let 
principal_ref = request + .into_inner() + .principal + .ok_or_else(|| Status::invalid_argument("principal is required"))?; + + let principal_ref = PrincipalRef::new( + match PrincipalKind::try_from(principal_ref.kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }, + &principal_ref.id, + ); + + let principal = self + .principal_store + .get(&principal_ref) + .await + .map_err(map_error)? + .ok_or_else(|| Status::not_found("principal not found"))?; + + Ok(Response::new(proto::Principal::from(principal))) + } + + async fn update_principal( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let principal_ref = req + .principal + .ok_or_else(|| Status::invalid_argument("principal is required"))?; + + let principal_ref = PrincipalRef::new( + match PrincipalKind::try_from(principal_ref.kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }, + &principal_ref.id, + ); + + let (mut principal, version) = self + .principal_store + .get_with_version(&principal_ref) + .await + .map_err(map_error)? 
+ .ok_or_else(|| Status::not_found("principal not found"))?; + + if let Some(name) = req.name { + principal.name = name; + } + if let Some(email) = req.email { + principal.email = Some(email); + } + if !req.metadata.is_empty() { + principal.metadata = req.metadata; + } + if let Some(enabled) = req.enabled { + principal.enabled = enabled; + } + principal.updated_at = now_ts(); + + self.principal_store + .update(&principal, version) + .await + .map_err(map_error)?; + + Ok(Response::new(proto::Principal::from(principal))) + } + + async fn delete_principal( + &self, + request: Request, + ) -> Result, Status> { + let principal_ref = request + .into_inner() + .principal + .ok_or_else(|| Status::invalid_argument("principal is required"))?; + + let principal_ref = PrincipalRef::new( + match PrincipalKind::try_from(principal_ref.kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }, + &principal_ref.id, + ); + + let deleted = self + .principal_store + .delete(&principal_ref) + .await + .map_err(map_error)?; + + Ok(Response::new(DeletePrincipalResponse { deleted })) + } + + async fn list_principals( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let mut principals = if let Some(kind) = req.kind { + let kind = match PrincipalKind::try_from(kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }; + self.principal_store + .list_by_kind(&kind) + .await + .map_err(map_error)? + } else if let Some(org) = req.org_id { + self.principal_store + .list_by_org(&org) + .await + .map_err(map_error)? 
+ } else if let Some(project) = req.project_id { + self.principal_store + .list_by_project(&project) + .await + .map_err(map_error)? + } else { + let mut all = Vec::new(); + for kind in [ + TypesPrincipalKind::User, + TypesPrincipalKind::ServiceAccount, + TypesPrincipalKind::Group, + ] { + all.extend( + self.principal_store + .list_by_kind(&kind) + .await + .map_err(map_error)?, + ); + } + all + }; + + if req.page_size > 0 && principals.len() as i32 > req.page_size { + principals.truncate(req.page_size as usize); + } + + let principals = principals.into_iter().map(proto::Principal::from).collect(); + + Ok(Response::new(ListPrincipalsResponse { + principals, + next_page_token: String::new(), + })) + } + + async fn create_role( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + let scope = req + .scope + .ok_or_else(|| Status::invalid_argument("scope is required"))? + .into(); + + let permissions = req.permissions.into_iter().map(Into::into).collect(); + + let mut role = Role::new(&req.name, scope, permissions); + role.display_name = req.display_name; + role.description = req.description; + let now = now_ts(); + role.created_at = now; + role.updated_at = now; + + self.role_store.create(&role).await.map_err(map_error)?; + Ok(Response::new(proto::Role::from(role))) + } + + async fn get_role( + &self, + request: Request, + ) -> Result, Status> { + let name = request.into_inner().name; + let role = self + .role_store + .get(&name) + .await + .map_err(map_error)? + .ok_or_else(|| Status::not_found("role not found"))?; + Ok(Response::new(proto::Role::from(role))) + } + + async fn update_role( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let (mut role, version) = self + .role_store + .get_with_version(&req.name) + .await + .map_err(map_error)? 
+ .ok_or_else(|| Status::not_found("role not found"))?; + + if let Some(name) = req.display_name { + role.display_name = name; + } + if let Some(desc) = req.description { + role.description = desc; + } + if !req.permissions.is_empty() { + role.permissions = req.permissions.into_iter().map(Into::into).collect(); + } + role.updated_at = now_ts(); + + self.role_store + .update(&role, version) + .await + .map_err(map_error)?; + + Ok(Response::new(proto::Role::from(role))) + } + + async fn delete_role( + &self, + request: Request, + ) -> Result, Status> { + let deleted = self + .role_store + .delete(&request.into_inner().name) + .await + .map_err(map_error)?; + + Ok(Response::new(DeleteRoleResponse { deleted })) + } + + async fn list_roles( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let mut roles = if let Some(scope) = req.scope { + let scope: Scope = scope.into(); + self.role_store + .list_by_scope(&scope) + .await + .map_err(map_error)? + } else { + self.role_store.list().await.map_err(map_error)? 
+ }; + + if !req.include_builtin { + roles.retain(|r| !r.builtin); + } + + if req.page_size > 0 && roles.len() as i32 > req.page_size { + roles.truncate(req.page_size as usize); + } + + let roles = roles.into_iter().map(proto::Role::from).collect(); + Ok(Response::new(ListRolesResponse { + roles, + next_page_token: String::new(), + })) + } + + async fn create_binding( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + let principal_ref = req + .principal + .ok_or_else(|| Status::invalid_argument("principal is required"))?; + + let principal_ref = PrincipalRef::new( + match PrincipalKind::try_from(principal_ref.kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }, + &principal_ref.id, + ); + + let scope: Scope = req + .scope + .ok_or_else(|| Status::invalid_argument("scope is required"))? + .into(); + + let mut binding = + PolicyBinding::new(Uuid::new_v4().to_string(), principal_ref, req.role, scope); + binding.condition = req.condition.map(Into::into); + binding.expires_at = req.expires_at; + binding.created_at = now_ts(); + binding.updated_at = binding.created_at; + binding.created_by = "iam-admin".into(); + + self.binding_store + .create(&binding) + .await + .map_err(map_error)?; + + Ok(Response::new(proto::PolicyBinding::from(binding))) + } + + async fn get_binding( + &self, + request: Request, + ) -> Result, Status> { + let id = request.into_inner().id; + let binding = self + .binding_store + .get_by_id(&id) + .await + .map_err(map_error)? 
+ .ok_or_else(|| Status::not_found("binding not found"))?; + + Ok(Response::new(proto::PolicyBinding::from(binding))) + } + + async fn update_binding( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + let (mut binding, version) = self + .binding_store + .get_by_id_with_version(&req.id) + .await + .map_err(map_error)? + .ok_or_else(|| Status::not_found("binding not found"))?; + + if let Some(condition) = req.condition { + binding.condition = Some(condition.into()); + } + if let Some(exp) = req.expires_at { + binding.expires_at = Some(exp); + } + if let Some(enabled) = req.enabled { + binding.enabled = enabled; + } + binding.updated_at = now_ts(); + + self.binding_store + .update(&binding, version) + .await + .map_err(map_error)?; + + Ok(Response::new(proto::PolicyBinding::from(binding))) + } + + async fn delete_binding( + &self, + request: Request, + ) -> Result, Status> { + let id = request.into_inner().id; + let binding = self + .binding_store + .get_by_id(&id) + .await + .map_err(map_error)? + .ok_or_else(|| Status::not_found("binding not found"))?; + + let deleted = self + .binding_store + .delete(&binding.scope, &binding.principal_ref, &binding.id) + .await + .map_err(map_error)?; + + Ok(Response::new(DeleteBindingResponse { deleted })) + } + + async fn list_bindings( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + let mut bindings = if let Some(principal) = req.principal { + let principal_ref = PrincipalRef::new( + match PrincipalKind::try_from(principal.kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }, + &principal.id, + ); + self.binding_store + .list_by_principal(&principal_ref) + .await + .map_err(map_error)? 
+ } else if let Some(role) = req.role { + self.binding_store + .list_by_role(&role) + .await + .map_err(map_error)? + } else if let Some(scope) = req.scope { + let scope: Scope = scope.into(); + self.binding_store + .list_by_scope(&scope) + .await + .map_err(map_error)? + } else { + self.binding_store.list_all().await.map_err(map_error)? + }; + + if !req.include_disabled { + bindings.retain(|b| b.enabled); + } + + if req.page_size > 0 && bindings.len() as i32 > req.page_size { + bindings.truncate(req.page_size as usize); + } + + let bindings = bindings + .into_iter() + .map(proto::PolicyBinding::from) + .collect(); + + Ok(Response::new(ListBindingsResponse { + bindings, + next_page_token: String::new(), + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::proto::scope; + use iam_authz::PolicyCache; + use iam_store::Backend; + + fn admin_service() -> ( + IamAdminService, + Arc, + Arc, + Arc, + ) { + let backend = Arc::new(Backend::memory()); + let principal_store = Arc::new(PrincipalStore::new(backend.clone())); + let role_store = Arc::new(RoleStore::new(backend.clone())); + let binding_store = Arc::new(BindingStore::new(backend)); + ( + IamAdminService::new( + principal_store.clone(), + role_store.clone(), + binding_store.clone(), + ), + principal_store, + role_store, + binding_store, + ) + } + + fn test_stores() -> (Arc, Arc, Arc) { + let backend = Arc::new(Backend::memory()); + let principal_store = Arc::new(PrincipalStore::new(backend.clone())); + let role_store = Arc::new(RoleStore::new(backend.clone())); + let binding_store = Arc::new(BindingStore::new(backend)); + (principal_store, role_store, binding_store) + } + + #[tokio::test] + async fn test_authorize_no_principal() { + let (principal_store, role_store, binding_store) = test_stores(); + let cache = Arc::new(PolicyCache::default_config()); + let evaluator = Arc::new(PolicyEvaluator::new(binding_store, role_store, cache)); + + let service = IamAuthzService::new(evaluator, 
principal_store); + + let req = AuthorizeRequest { + principal: None, + action: "compute:instances:create".into(), + resource: Some(proto::ResourceRef { + kind: "instance".into(), + id: "vm-1".into(), + org_id: "org-1".into(), + project_id: "proj-1".into(), + owner_id: None, + node_id: None, + region: None, + tags: Default::default(), + }), + context: None, + }; + + let result = service.authorize(Request::new(req)).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_admin_crud_binding_flow() { + let (service, principal_store, role_store, _binding_store) = admin_service(); + + // Create principal + let principal_req = CreatePrincipalRequest { + id: "alice".into(), + kind: PrincipalKind::User as i32, + name: "Alice".into(), + org_id: None, + project_id: None, + email: None, + metadata: Default::default(), + }; + service + .create_principal(Request::new(principal_req)) + .await + .unwrap(); + + // Create role + let role_req = CreateRoleRequest { + name: "ProjectViewer".into(), + display_name: "Viewer".into(), + description: "read-only".into(), + scope: Some(proto::Scope { + scope: Some(proto::scope::Scope::Project(proto::ProjectScope { + id: "proj-1".into(), + org_id: "org-1".into(), + })), + }), + permissions: vec![proto::Permission { + action: "*:read".into(), + resource_pattern: "project/${project}/*".into(), + condition: None, + }], + }; + service.create_role(Request::new(role_req)).await.unwrap(); + + // Create binding + let binding_req = CreateBindingRequest { + principal: Some(proto::PrincipalRef { + kind: PrincipalKind::User as i32, + id: "alice".into(), + }), + role: "roles/ProjectViewer".into(), + scope: Some(proto::Scope { + scope: Some(proto::scope::Scope::Project(proto::ProjectScope { + id: "proj-1".into(), + org_id: "org-1".into(), + })), + }), + condition: None, + expires_at: None, + }; + let binding_resp = service + .create_binding(Request::new(binding_req)) + .await + .unwrap() + .into_inner(); + assert_eq!(binding_resp.role, 
"roles/ProjectViewer"); + + // List bindings filtered by principal + let list_resp = service + .list_bindings(Request::new(ListBindingsRequest { + principal: Some(proto::PrincipalRef { + kind: PrincipalKind::User as i32, + id: "alice".into(), + }), + role: None, + scope: None, + include_disabled: false, + page_size: 0, + page_token: String::new(), + })) + .await + .unwrap() + .into_inner(); + assert_eq!(list_resp.bindings.len(), 1); + + // Delete binding + let del_resp = service + .delete_binding(Request::new(DeleteBindingRequest { + id: binding_resp.id, + })) + .await + .unwrap() + .into_inner(); + assert!(del_resp.deleted); + + // Principal still exists + assert!(principal_store + .get(&PrincipalRef::user("alice")) + .await + .unwrap() + .is_some()); + // Role still exists + assert!(role_store.get("ProjectViewer").await.unwrap().is_some()); + } +} diff --git a/iam/crates/iam-api/src/lib.rs b/iam/crates/iam-api/src/lib.rs new file mode 100644 index 0000000..d6273d4 --- /dev/null +++ b/iam/crates/iam-api/src/lib.rs @@ -0,0 +1,19 @@ +//! gRPC API for IAM +//! +//! Provides gRPC service implementations for authentication, authorization, and management. + +#![recursion_limit = "2048"] + +pub mod conversions; +pub mod generated; +pub mod iam_service; +pub mod token_service; + +/// Re-export proto types +pub mod proto { + pub use crate::generated::iam::v1::*; +} + +pub use generated::iam::v1::{iam_admin_server, iam_authz_server, iam_token_server}; +pub use iam_service::{IamAdminService, IamAuthzService}; +pub use token_service::IamTokenService; diff --git a/iam/crates/iam-api/src/token_service.rs b/iam/crates/iam-api/src/token_service.rs new file mode 100644 index 0000000..dea1c71 --- /dev/null +++ b/iam/crates/iam-api/src/token_service.rs @@ -0,0 +1,430 @@ +//! 
IAM Token gRPC service implementation + +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine}; +use sha2::{Digest, Sha256}; +use tonic::{Request, Response, Status}; + +use iam_authn::InternalTokenService; +use iam_store::{PrincipalStore, TokenStore}; +use iam_types::{ + InternalTokenClaims, Principal, PrincipalKind as TypesPrincipalKind, PrincipalRef, + Scope as TypesScope, TokenMetadata, TokenType, +}; + +use crate::proto::{ + self, iam_token_server::IamToken, scope, IssueTokenRequest, IssueTokenResponse, PrincipalKind, + RefreshTokenRequest, RefreshTokenResponse, RevokeTokenRequest, RevokeTokenResponse, Scope, + ValidateTokenRequest, ValidateTokenResponse, +}; + +/// IAM Token service implementation +pub struct IamTokenService { + token_service: Arc, + principal_store: Arc, + token_store: Arc, +} + +impl IamTokenService { + /// Create a new token service + pub fn new( + token_service: Arc, + principal_store: Arc, + token_store: Arc, + ) -> Self { + Self { + token_service, + principal_store, + token_store, + } + } + + fn convert_scope(proto_scope: &Option) -> TypesScope { + match proto_scope { + Some(s) => match &s.scope { + Some(scope::Scope::System(true)) => TypesScope::System, + Some(scope::Scope::Org(org)) => TypesScope::org(&org.id), + Some(scope::Scope::Project(proj)) => { + TypesScope::project(&proj.id, &proj.org_id) + } + Some(scope::Scope::Resource(res)) => { + TypesScope::resource(&res.id, &res.project_id, &res.org_id) + } + _ => TypesScope::System, + }, + None => TypesScope::System, + } + } + + fn convert_scope_to_proto(scope: &TypesScope) -> Scope { + Scope { + scope: Some(match scope { + TypesScope::System => scope::Scope::System(true), + TypesScope::Org { id } => { + scope::Scope::Org(proto::OrgScope { id: id.clone() }) + } + TypesScope::Project { id, org_id } => { + scope::Scope::Project(proto::ProjectScope { + id: id.clone(), + org_id: org_id.clone(), + }) + } + 
TypesScope::Resource { + id, + project_id, + org_id, + } => scope::Scope::Resource(proto::ResourceScope { + id: id.clone(), + project_id: project_id.clone(), + org_id: org_id.clone(), + }), + }), + } + } + + fn compute_token_id(token: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(token.as_bytes()); + let digest = hasher.finalize(); + URL_SAFE_NO_PAD.encode(digest) + } + + fn now_ts() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + } + + async fn persist_metadata( + &self, + claims: &InternalTokenClaims, + token: &str, + ) -> Result { + let token_id = Self::compute_token_id(token); + let mut meta = TokenMetadata::new( + &token_id, + &claims.principal_id, + TokenType::Access, + claims.iat, + claims.exp, + ); + if let Err(e) = self.token_store.put(&meta).await { + return Err(Status::internal(format!( + "failed to persist token metadata: {}", + e + ))); + } + // ensure we mirror revocation flags if claims already mark them + if meta.revoked { + return Ok(token_id); + } + Ok(token_id) + } +} + +#[tonic::async_trait] +impl IamToken for IamTokenService { + async fn issue_token( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + // Get principal kind + let principal_kind = match PrincipalKind::try_from(req.principal_kind) { + Ok(PrincipalKind::User) => TypesPrincipalKind::User, + Ok(PrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(PrincipalKind::Group) => TypesPrincipalKind::Group, + _ => return Err(Status::invalid_argument("invalid principal kind")), + }; + + // Get principal from store + let principal_ref = PrincipalRef::new(principal_kind, &req.principal_id); + let principal = self + .principal_store + .get(&principal_ref) + .await + .map_err(|e| Status::internal(format!("Failed to get principal: {}", e)))? 
+ .ok_or_else(|| Status::not_found("Principal not found"))?; + + // Convert scope + let scope = Self::convert_scope(&req.scope); + + // Determine TTL + let ttl = if req.ttl_seconds > 0 { + Some(Duration::from_secs(req.ttl_seconds)) + } else { + None + }; + + // Issue token + let issued = self + .token_service + .issue(&principal, req.roles.clone(), scope, ttl) + .await + .map_err(|e| Status::internal(format!("Failed to issue token: {}", e)))?; + + // Persist metadata for revocation tracking + self.persist_metadata(&issued.claims, &issued.token).await?; + + Ok(Response::new(IssueTokenResponse { + token: issued.token, + expires_at: issued.expires_at, + session_id: issued.claims.session_id, + })) + } + + async fn validate_token( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + match self.token_service.verify(&req.token).await { + Ok(claims) => { + // Check revocation list + let token_id = Self::compute_token_id(&req.token); + if let Some((meta, _)) = self + .token_store + .get(&claims.principal_id, &token_id) + .await + .map_err(|e| { + Status::internal(format!("Failed to read token metadata: {}", e)) + })? 
+ { + if meta.revoked { + return Ok(Response::new(ValidateTokenResponse { + valid: false, + claims: None, + reason: "token revoked".into(), + })); + } + } + + let proto_claims = crate::proto::InternalTokenClaims { + principal_id: claims.principal_id.clone(), + principal_kind: match claims.principal_kind { + TypesPrincipalKind::User => PrincipalKind::User as i32, + TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32, + TypesPrincipalKind::Group => PrincipalKind::Group as i32, + }, + principal_name: claims.principal_name.clone(), + roles: claims.roles.clone(), + scope: Some(Self::convert_scope_to_proto(&claims.scope)), + org_id: claims.org_id.clone(), + project_id: claims.project_id.clone(), + node_id: claims.node_id.clone(), + iat: claims.iat, + exp: claims.exp, + session_id: claims.session_id.clone(), + auth_method: claims.auth_method.to_string(), + }; + + Ok(Response::new(ValidateTokenResponse { + valid: true, + claims: Some(proto_claims), + reason: String::new(), + })) + } + Err(e) => Ok(Response::new(ValidateTokenResponse { + valid: false, + claims: None, + reason: e.to_string(), + })), + } + } + + async fn revoke_token( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + let claims = self + .token_service + .verify(&req.token) + .await + .map_err(|e| Status::invalid_argument(format!("Invalid token: {}", e)))?; + + let token_id = Self::compute_token_id(&req.token); + let now = Self::now_ts(); + + let existing = self + .token_store + .get(&claims.principal_id, &token_id) + .await + .map_err(|e| Status::internal(format!("Failed to read token metadata: {}", e)))?; + + // Try revoking existing metadata; if it does not exist, create a revoked record + let revoked = if let Some((meta, _)) = existing { + if meta.revoked { + true + } else { + self.token_store + .revoke(&claims.principal_id, &token_id, &req.reason, now) + .await + .map_err(|e| Status::internal(format!("Failed to revoke token: {}", e)))? 
+ } + } else { + let mut meta = TokenMetadata::new( + &token_id, + &claims.principal_id, + TokenType::Access, + claims.iat, + claims.exp, + ); + meta.revoke(now, &req.reason); + self.token_store + .put(&meta) + .await + .map_err(|e| Status::internal(format!("Failed to write revocation: {}", e)))?; + true + }; + + Ok(Response::new(RevokeTokenResponse { success: revoked })) + } + + async fn refresh_token( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + // Validate current token + let claims = self + .token_service + .verify(&req.token) + .await + .map_err(|e| Status::unauthenticated(format!("Invalid token: {}", e)))?; + + // Get principal + let principal_kind = claims.principal_kind.clone(); + let principal_ref = PrincipalRef::new(principal_kind, &claims.principal_id); + let principal = self + .principal_store + .get(&principal_ref) + .await + .map_err(|e| Status::internal(format!("Failed to get principal: {}", e)))? + .ok_or_else(|| Status::not_found("Principal not found"))?; + + // Determine new TTL + let ttl = if req.ttl_seconds > 0 { + Some(Duration::from_secs(req.ttl_seconds)) + } else { + None + }; + + // Issue new token + let issued = self + .token_service + .issue(&principal, claims.roles.clone(), claims.scope.clone(), ttl) + .await + .map_err(|e| Status::internal(format!("Failed to issue token: {}", e)))?; + + self.persist_metadata(&issued.claims, &issued.token).await?; + + Ok(Response::new(RefreshTokenResponse { + token: issued.token, + expires_at: issued.expires_at, + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use iam_authn::{InternalTokenConfig, SigningKey}; + use iam_store::{Backend, TokenStore}; + + fn test_setup() -> ( + Arc, + Arc, + Arc, + ) { + let backend = Arc::new(Backend::memory()); + let principal_store = Arc::new(PrincipalStore::new(backend.clone())); + let token_store = Arc::new(TokenStore::new(backend)); + + let signing_key = SigningKey::generate("test-key"); + let config = 
InternalTokenConfig::new(signing_key, "iam-test"); + let token_service = Arc::new(InternalTokenService::new(config)); + + (token_service, principal_store, token_store) + } + + #[tokio::test] + async fn test_issue_token_principal_not_found() { + let (token_service, principal_store, token_store) = test_setup(); + let service = IamTokenService::new(token_service, principal_store, token_store); + + let req = IssueTokenRequest { + principal_id: "nonexistent".into(), + principal_kind: PrincipalKind::User as i32, + roles: vec![], + scope: None, + ttl_seconds: 3600, + }; + + let result = service.issue_token(Request::new(req)).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_revoke_and_validate_blocklist() { + let (token_service, principal_store, token_store) = test_setup(); + let service = + IamTokenService::new(token_service, principal_store.clone(), token_store.clone()); + + // create principal + let principal = Principal::new_user("alice", "Alice"); + principal_store.create(&principal).await.unwrap(); + + // issue token + let issue_resp = service + .issue_token(Request::new(IssueTokenRequest { + principal_id: "alice".into(), + principal_kind: PrincipalKind::User as i32, + roles: vec!["roles/ProjectAdmin".into()], + scope: None, + ttl_seconds: 3600, + })) + .await + .unwrap() + .into_inner(); + + // validate OK + let valid_resp = service + .validate_token(Request::new(ValidateTokenRequest { + token: issue_resp.token.clone(), + })) + .await + .unwrap() + .into_inner(); + assert!(valid_resp.valid); + + // revoke + let revoke_resp = service + .revoke_token(Request::new(RevokeTokenRequest { + token: issue_resp.token.clone(), + reason: "test".into(), + })) + .await + .unwrap() + .into_inner(); + assert!(revoke_resp.success); + + // validate should be rejected + let invalid_resp = service + .validate_token(Request::new(ValidateTokenRequest { + token: issue_resp.token.clone(), + })) + .await + .unwrap() + .into_inner(); + assert!(!invalid_resp.valid); + 
assert_eq!(invalid_resp.reason, "token revoked"); + } +} diff --git a/iam/crates/iam-api/tests/tenant_path_integration.rs b/iam/crates/iam-api/tests/tenant_path_integration.rs new file mode 100644 index 0000000..db3b5a6 --- /dev/null +++ b/iam/crates/iam-api/tests/tenant_path_integration.rs @@ -0,0 +1,778 @@ +//! Integration tests for Tenant Path (User → Org → Project) with RBAC enforcement +//! +//! This test suite validates the E2E flow of IAM tenant setup and authorization: +//! 1. User creation and organization assignment +//! 2. Project creation scoped to organizations +//! 3. Role-based access control (RBAC) enforcement +//! 4. Cross-tenant isolation (users can't access other tenants' resources) +//! 5. Hierarchical permission evaluation + +use std::sync::Arc; + +use iam_api::iam_service::{IamAdminService, IamAuthzService}; +use iam_authz::{AuthzDecision, AuthzRequest, PolicyCache, PolicyEvaluator}; +use iam_store::{Backend, BindingStore, PrincipalStore, RoleStore}; +use iam_types::{Permission, PolicyBinding, Principal, PrincipalRef, Resource, Role, Scope}; + +/// Test helper: Create all required stores and services +fn setup_services() -> ( + IamAdminService, + IamAuthzService, + Arc, + Arc, + Arc, + Arc, +) { + let backend = Arc::new(Backend::memory()); + let principal_store = Arc::new(PrincipalStore::new(backend.clone())); + let role_store = Arc::new(RoleStore::new(backend.clone())); + let binding_store = Arc::new(BindingStore::new(backend)); + + let admin_service = IamAdminService::new( + principal_store.clone(), + role_store.clone(), + binding_store.clone(), + ); + + let cache = Arc::new(PolicyCache::default_config()); + let evaluator = Arc::new( + PolicyEvaluator::new(binding_store.clone(), role_store.clone(), cache).with_config( + iam_authz::PolicyEvaluatorConfig { + use_cache: false, + max_bindings: 1000, + debug: false, + }, + ), + ); + + let authz_service = IamAuthzService::new(evaluator.clone(), principal_store.clone()); + + ( + admin_service, + 
authz_service, + principal_store, + role_store, + binding_store, + evaluator, + ) +} + +/// Test Scenario 1: Complete tenant setup flow +/// +/// Validates: +/// - User creation +/// - Organization scope assignment +/// - Project creation within organization +/// - Role binding at org and project levels +/// - Authorization checks for created resources +#[tokio::test] +async fn test_tenant_setup_flow() { + let ( + _admin_service, + _authz_service, + principal_store, + role_store, + binding_store, + evaluator, + ) = setup_services(); + + // Step 1: Create User Alice + let mut alice = Principal::new_user("alice", "Alice Smith"); + alice.email = Some("alice@example.com".to_string()); + alice.org_id = Some("acme-corp".to_string()); + principal_store.create(&alice).await.unwrap(); + + // Step 2: Create a custom OrgAdmin role with proper resource patterns + // Resource path format: org/{org_id}/project/{project_id}/{kind}/{id} + let org_admin_role = Role::new( + "OrgAdmin", + Scope::org("*"), + vec![ + Permission::new("*", "org/acme-corp/*"), + ], + ) + .with_display_name("Organization Administrator") + .with_description("Full access to organization resources"); + role_store.create(&org_admin_role).await.unwrap(); + + // Step 3: Create OrgAdmin role binding for Alice at Org scope + let org_scope = Scope::org("acme-corp"); + let alice_org_binding = PolicyBinding::new( + "binding-alice-org-admin", + PrincipalRef::user("alice"), + "roles/OrgAdmin", + org_scope.clone(), + ); + binding_store.create(&alice_org_binding).await.unwrap(); + + // Step 4: Verify Alice can access org-level resources + let org_resource = Resource::new("organization", "acme-corp", "acme-corp", "acme-corp"); + + let request = AuthzRequest::new(alice.clone(), "org:manage", org_resource); + let decision = evaluator.evaluate(&request).await.unwrap(); + + assert!( + decision.is_allowed(), + "Alice should be allowed to manage org resources as OrgAdmin" + ); + + // Step 5: Verify Alice can access project 
resources (OrgAdmin includes projects) + let project_resource = Resource::new("project", "project-alpha", "acme-corp", "project-alpha"); + + let request = AuthzRequest::new(alice.clone(), "project:read", project_resource); + let decision = evaluator.evaluate(&request).await.unwrap(); + + assert!( + decision.is_allowed(), + "Alice (OrgAdmin) should be able to read projects in her org" + ); + + // Step 6: Create a compute instance in the project + let instance = Resource::new("instance", "vm-001", "acme-corp", "project-alpha"); + + let request = AuthzRequest::new(alice.clone(), "compute:instances:create", instance); + let decision = evaluator.evaluate(&request).await.unwrap(); + + assert!( + decision.is_allowed(), + "Alice (OrgAdmin) should be able to create instances in org projects" + ); +} + +/// Test Scenario 2: Cross-tenant isolation +/// +/// Validates: +/// - Two users in different organizations +/// - Each user has full access to their own org +/// - Users cannot access resources in other organizations +/// - Proper denial reasons are returned +#[tokio::test] +async fn test_cross_tenant_denial() { + let ( + _admin_service, + _authz_service, + principal_store, + role_store, + binding_store, + evaluator, + ) = setup_services(); + + // Create custom org admin roles with proper patterns for each org + let org1_admin_role = Role::new( + "Org1Admin", + Scope::org("org-1"), + vec![Permission::new("*", "org/org-1/*")], + ); + role_store.create(&org1_admin_role).await.unwrap(); + + let org2_admin_role = Role::new( + "Org2Admin", + Scope::org("org-2"), + vec![Permission::new("*", "org/org-2/*")], + ); + role_store.create(&org2_admin_role).await.unwrap(); + + // Setup User A (Alice) with Org1 + let mut alice = Principal::new_user("alice", "Alice"); + alice.org_id = Some("org-1".to_string()); + principal_store.create(&alice).await.unwrap(); + + let alice_binding = PolicyBinding::new( + "alice-org1-admin", + PrincipalRef::user("alice"), + "roles/Org1Admin", + 
Scope::org("org-1"), + ); + binding_store.create(&alice_binding).await.unwrap(); + + // Setup User B (Bob) with Org2 + let mut bob = Principal::new_user("bob", "Bob"); + bob.org_id = Some("org-2".to_string()); + principal_store.create(&bob).await.unwrap(); + + let bob_binding = PolicyBinding::new( + "bob-org2-admin", + PrincipalRef::user("bob"), + "roles/Org2Admin", + Scope::org("org-2"), + ); + binding_store.create(&bob_binding).await.unwrap(); + + // Create resources in Org1 / Project1 + let org1_project1_instance = Resource::new("instance", "vm-alice-1", "org-1", "project-1"); + + // Test 1: Alice CAN access Org1 resources + let request = AuthzRequest::new( + alice.clone(), + "compute:instances:create", + org1_project1_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "Alice should access her own org resources" + ); + + // Test 2: Bob CANNOT access Org1 resources (cross-tenant denial) + let request = AuthzRequest::new( + bob.clone(), + "compute:instances:create", + org1_project1_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_denied(), + "Bob should NOT access Alice's org resources" + ); + + // Verify denial reason mentions no matching policy + if let AuthzDecision::Deny { reason } = decision { + assert!( + reason.contains("No") || reason.contains("not found") || reason.contains("binding"), + "Denial reason should indicate lack of permissions: {}", + reason + ); + } + + // Create resources in Org2 / Project2 + let org2_project2_instance = Resource::new("instance", "vm-bob-1", "org-2", "project-2"); + + // Test 3: Bob CAN access Org2 resources + let request = AuthzRequest::new( + bob.clone(), + "compute:instances:create", + org2_project2_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + + assert!( + decision.is_allowed(), + "Bob should access his own org resources" + ); + + // Test 4: Alice 
CANNOT access Org2 resources (cross-tenant denial) + let request = AuthzRequest::new(alice, "compute:instances:create", org2_project2_instance); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_denied(), + "Alice should NOT access Bob's org resources" + ); +} + +/// Test Scenario 3: RBAC enforcement at project level +/// +/// Validates: +/// - ProjectAdmin has full access within project +/// - ProjectMember has limited access (own resources + read-only) +/// - Users without roles are denied access +/// - Role inheritance and permission evaluation +#[tokio::test] +async fn test_rbac_project_scope() { + let ( + _admin_service, + _authz_service, + principal_store, + role_store, + binding_store, + evaluator, + ) = setup_services(); + + let org_id = "acme-corp"; + let project_id = "project-delta"; + let project_scope = Scope::project(project_id, org_id); + + // Create custom roles with proper patterns + // ProjectAdmin - full access to project resources + let project_admin_role = Role::new( + "ProjectAdmin", + Scope::project("*", "*"), + vec![Permission::new("*", &format!("org/{}/project/{}/*", org_id, project_id))], + ); + role_store.create(&project_admin_role).await.unwrap(); + + // ProjectMember - read access + manage own resources + let project_member_role = Role::new( + "ProjectMember", + Scope::project("*", "*"), + vec![ + // Full access to own resources (with owner condition) + Permission::new( + "compute:instances:*", + &format!("org/{}/project/{}/instance/*", org_id, project_id), + ) + .with_condition(iam_types::Condition::string_equals( + "resource.owner", + "${principal.id}", + )), + // Read access to all project resources + Permission::new("*:*:read", &format!("org/{}/project/{}/*", org_id, project_id)), + Permission::new("*:*:list", &format!("org/{}/project/{}/*", org_id, project_id)), + ], + ); + role_store.create(&project_member_role).await.unwrap(); + + // Create three users in the same org/project + let mut 
admin_user = Principal::new_user("admin-user", "Project Admin"); + admin_user.org_id = Some(org_id.to_string()); + admin_user.project_id = Some(project_id.to_string()); + principal_store.create(&admin_user).await.unwrap(); + + let mut member_user = Principal::new_user("member-user", "Project Member"); + member_user.org_id = Some(org_id.to_string()); + member_user.project_id = Some(project_id.to_string()); + principal_store.create(&member_user).await.unwrap(); + + let mut guest_user = Principal::new_user("guest-user", "Guest User"); + guest_user.org_id = Some(org_id.to_string()); + principal_store.create(&guest_user).await.unwrap(); + + // Assign ProjectAdmin role to admin_user + let admin_binding = PolicyBinding::new( + "admin-project-admin", + PrincipalRef::user("admin-user"), + "roles/ProjectAdmin", + project_scope.clone(), + ); + binding_store.create(&admin_binding).await.unwrap(); + + // Assign ProjectMember role to member_user + let member_binding = PolicyBinding::new( + "member-project-member", + PrincipalRef::user("member-user"), + "roles/ProjectMember", + project_scope.clone(), + ); + binding_store.create(&member_binding).await.unwrap(); + + // Note: guest_user has no role binding (should be denied) + + // Create test resources + let admin_instance = Resource::new("instance", "vm-admin-1", org_id, project_id) + .with_owner("admin-user"); + let member_instance = Resource::new("instance", "vm-member-1", org_id, project_id) + .with_owner("member-user"); + let shared_volume = Resource::new("volume", "vol-shared", org_id, project_id); + + // Test 1: ProjectAdmin can create instances + let request = AuthzRequest::new( + admin_user.clone(), + "compute:instances:create", + admin_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "ProjectAdmin should create instances" + ); + + // Test 2: ProjectAdmin can delete any instance in project + let request = AuthzRequest::new( + admin_user.clone(), + 
"compute:instances:delete", + member_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "ProjectAdmin should delete any project instance" + ); + + // Test 3: ProjectMember can read instances (builtin permission) + let request = AuthzRequest::new( + member_user.clone(), + "compute:instances:read", + admin_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "ProjectMember should read project instances" + ); + + // Test 4: ProjectMember can list instances (builtin permission) + let request = AuthzRequest::new( + member_user.clone(), + "compute:instances:list", + shared_volume.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "ProjectMember should list project resources" + ); + + // Test 5: ProjectMember can manage their own instances (owner condition) + let request = AuthzRequest::new( + member_user.clone(), + "compute:instances:create", + member_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "ProjectMember should create own instances" + ); + + // Test 6: ProjectMember CANNOT delete others' instances (owner condition fails) + let request = AuthzRequest::new( + member_user.clone(), + "compute:instances:delete", + admin_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_denied(), + "ProjectMember should NOT delete others' instances" + ); + + // Test 7: Guest user (no role) is denied all access + let request = AuthzRequest::new( + guest_user.clone(), + "compute:instances:read", + shared_volume.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!(decision.is_denied(), "Guest should be denied without roles"); + + let request = AuthzRequest::new( + guest_user.clone(), + "compute:instances:create", + 
Resource::new("instance", "vm-guest", org_id, project_id).with_owner("guest-user"), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_denied(), + "Guest should NOT create instances without role" + ); +} + +/// Test Scenario 4: Hierarchical scope inheritance +/// +/// Validates: +/// - System scope permissions apply to all orgs/projects +/// - Org scope permissions apply to all projects within org +/// - Project scope permissions are isolated to that project +#[tokio::test] +async fn test_hierarchical_scope_inheritance() { + let ( + _admin_service, + _authz_service, + principal_store, + role_store, + binding_store, + evaluator, + ) = setup_services(); + + // Create custom roles + // SystemAdmin - full access to everything + let sys_admin_role = Role::new("SystemAdmin", Scope::System, vec![Permission::wildcard()]); + role_store.create(&sys_admin_role).await.unwrap(); + + // Org1Admin - full access to org-1 resources + let org1_admin_role = Role::new( + "Org1Admin", + Scope::org("org-1"), + vec![Permission::new("*", "org/org-1/*")], + ); + role_store.create(&org1_admin_role).await.unwrap(); + + // Create a system admin + let sys_admin = Principal::new_user("sysadmin", "System Administrator"); + principal_store.create(&sys_admin).await.unwrap(); + + let sys_admin_binding = PolicyBinding::new( + "sysadmin-system", + PrincipalRef::user("sysadmin"), + "roles/SystemAdmin", + Scope::System, + ); + binding_store.create(&sys_admin_binding).await.unwrap(); + + // Create org admin for org-1 only + let org_admin = Principal::new_user("orgadmin", "Org Admin"); + principal_store.create(&org_admin).await.unwrap(); + + let org_admin_binding = PolicyBinding::new( + "orgadmin-org1", + PrincipalRef::user("orgadmin"), + "roles/Org1Admin", + Scope::org("org-1"), + ); + binding_store.create(&org_admin_binding).await.unwrap(); + + // Test resources in different orgs/projects + let org1_proj1_resource = Resource::new("instance", "vm-1", "org-1", 
"proj-1"); + let org1_proj2_resource = Resource::new("instance", "vm-2", "org-1", "proj-2"); + let org2_proj1_resource = Resource::new("instance", "vm-3", "org-2", "proj-1"); + + // Test 1: SystemAdmin can access resources in ANY org/project + for resource in [ + &org1_proj1_resource, + &org1_proj2_resource, + &org2_proj1_resource, + ] { + let request = AuthzRequest::new( + sys_admin.clone(), + "compute:instances:delete", + resource.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "SystemAdmin should access all resources" + ); + } + + // Test 2: OrgAdmin can access resources in org-1 projects + for resource in [&org1_proj1_resource, &org1_proj2_resource] { + let request = AuthzRequest::new( + org_admin.clone(), + "compute:instances:delete", + resource.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "OrgAdmin should access all projects in their org" + ); + } + + // Test 3: OrgAdmin CANNOT access resources in org-2 + let request = AuthzRequest::new( + org_admin.clone(), + "compute:instances:delete", + org2_proj1_resource, + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_denied(), + "OrgAdmin should NOT access other orgs" + ); +} + +/// Test Scenario 5: Custom role with fine-grained permissions +/// +/// Validates: +/// - Creation of custom roles with specific permissions +/// - Permission pattern matching (action and resource patterns) +/// - Custom role assignment and evaluation +#[tokio::test] +async fn test_custom_role_fine_grained_permissions() { + let ( + _admin_service, + _authz_service, + principal_store, + role_store, + binding_store, + evaluator, + ) = setup_services(); + + let org_id = "tech-corp"; + let project_id = "backend-services"; + + // Create a custom role: "StorageOperator" - can manage volumes but not instances + let storage_operator_role = Role::new( + "StorageOperator", + 
Scope::project("*", "*"), + vec![ + Permission::new("storage:volumes:*", &format!("org/{}/project/{}/*", org_id, project_id)), + Permission::new("storage:snapshots:*", &format!("org/{}/project/{}/*", org_id, project_id)), + Permission::new("storage:*:read", &format!("org/{}/project/{}/*", org_id, project_id)), + Permission::new("storage:*:list", &format!("org/{}/project/{}/*", org_id, project_id)), + ], + ) + .with_display_name("Storage Operator") + .with_description("Can manage storage resources but not compute"); + + role_store.create(&storage_operator_role).await.unwrap(); + + // Create a user and assign the custom role + let storage_user = Principal::new_user("storage-ops", "Storage Operator User"); + principal_store.create(&storage_user).await.unwrap(); + + let storage_binding = PolicyBinding::new( + "storage-ops-binding", + PrincipalRef::user("storage-ops"), + "roles/StorageOperator", + Scope::project(project_id, org_id), + ); + binding_store.create(&storage_binding).await.unwrap(); + + // Create test resources + let volume = Resource::new("volume", "vol-001", org_id, project_id); + let snapshot = Resource::new("snapshot", "snap-001", org_id, project_id); + let instance = Resource::new("instance", "vm-001", org_id, project_id); + + // Test 1: Storage operator CAN manage volumes + let request = AuthzRequest::new( + storage_user.clone(), + "storage:volumes:create", + volume.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "StorageOperator should create volumes" + ); + + let request = AuthzRequest::new(storage_user.clone(), "storage:volumes:delete", volume); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "StorageOperator should delete volumes" + ); + + // Test 2: Storage operator CAN manage snapshots + let request = AuthzRequest::new( + storage_user.clone(), + "storage:snapshots:create", + snapshot.clone(), + ); + let decision = 
evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "StorageOperator should create snapshots" + ); + + // Test 3: Storage operator CAN read instances (read permission granted) + let request = AuthzRequest::new( + storage_user.clone(), + "storage:instances:read", + instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_allowed(), + "StorageOperator should read instances" + ); + + // Test 4: Storage operator CANNOT create/delete instances (no permission) + let request = AuthzRequest::new( + storage_user.clone(), + "compute:instances:create", + instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_denied(), + "StorageOperator should NOT create instances" + ); + + let request = AuthzRequest::new( + storage_user.clone(), + "compute:instances:delete", + instance, + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!( + decision.is_denied(), + "StorageOperator should NOT delete instances" + ); +} + +/// Test Scenario 6: Multiple role bindings (role aggregation) +/// +/// Validates: +/// - A principal can have multiple role bindings +/// - Permissions from all roles are aggregated +/// - Most permissive role wins +#[tokio::test] +async fn test_multiple_role_bindings() { + let ( + _admin_service, + _authz_service, + principal_store, + role_store, + binding_store, + evaluator, + ) = setup_services(); + + let org_id = "multi-role-org"; + let project1 = "project-1"; + let project2 = "project-2"; + + // Create custom roles + // ReadOnly for project-1 - read/list/get operations only + let readonly_role = Role::new( + "ReadOnly", + Scope::project("*", "*"), + vec![ + Permission::new("*:*:read", &format!("org/{}/project/{}/*", org_id, project1)), + Permission::new("*:*:list", &format!("org/{}/project/{}/*", org_id, project1)), + Permission::new("*:*:get", &format!("org/{}/project/{}/*", org_id, project1)), + ], + ); + 
role_store.create(&readonly_role).await.unwrap(); + + // ProjectAdmin for project-2 + let project_admin_role = Role::new( + "ProjectAdmin", + Scope::project("*", "*"), + vec![Permission::new("*", &format!("org/{}/project/{}/*", org_id, project2))], + ); + role_store.create(&project_admin_role).await.unwrap(); + + // Create a user + let user = Principal::new_user("multi-role-user", "Multi Role User"); + principal_store.create(&user).await.unwrap(); + + // Assign ReadOnly role in project-1 + let readonly_binding = PolicyBinding::new( + "readonly-proj1", + PrincipalRef::user("multi-role-user"), + "roles/ReadOnly", + Scope::project(project1, org_id), + ); + binding_store.create(&readonly_binding).await.unwrap(); + + // Assign ProjectAdmin role in project-2 + let admin_binding = PolicyBinding::new( + "admin-proj2", + PrincipalRef::user("multi-role-user"), + "roles/ProjectAdmin", + Scope::project(project2, org_id), + ); + binding_store.create(&admin_binding).await.unwrap(); + + // Resources in different projects + let proj1_instance = Resource::new("instance", "vm-1", org_id, project1); + let proj2_instance = Resource::new("instance", "vm-2", org_id, project2); + + // Test 1: User can only READ in project-1 (ReadOnly role) + let request = AuthzRequest::new( + user.clone(), + "compute:instances:read", + proj1_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!(decision.is_allowed(), "Should read in project-1"); + + let request = AuthzRequest::new( + user.clone(), + "compute:instances:delete", + proj1_instance, + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!(decision.is_denied(), "Should NOT delete in project-1"); + + // Test 2: User can do ANYTHING in project-2 (ProjectAdmin role) + let request = AuthzRequest::new( + user.clone(), + "compute:instances:read", + proj2_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!(decision.is_allowed(), "Should read in 
project-2"); + + let request = AuthzRequest::new( + user.clone(), + "compute:instances:delete", + proj2_instance.clone(), + ); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!(decision.is_allowed(), "Should delete in project-2"); + + let request = AuthzRequest::new(user.clone(), "compute:instances:create", proj2_instance); + let decision = evaluator.evaluate(&request).await.unwrap(); + assert!(decision.is_allowed(), "Should create in project-2"); +} diff --git a/iam/crates/iam-audit/Cargo.toml b/iam/crates/iam-audit/Cargo.toml new file mode 100644 index 0000000..794b80f --- /dev/null +++ b/iam/crates/iam-audit/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "iam-audit" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +authors.workspace = true +repository.workspace = true +description = "Audit logging for IAM authentication and authorization events" + +[lints] +workspace = true + +[dependencies] +iam-types = { workspace = true } + +# Async +tokio = { workspace = true, features = ["sync", "time", "fs"] } +async-trait = { workspace = true } + +# Serialization +serde = { workspace = true } +serde_json = { workspace = true } + +# Utilities +tracing = { workspace = true } +chrono = { workspace = true } +uuid = { workspace = true } +thiserror = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +tokio = { workspace = true, features = ["full", "test-util"] } diff --git a/iam/crates/iam-audit/src/event.rs b/iam/crates/iam-audit/src/event.rs new file mode 100644 index 0000000..854f0cc --- /dev/null +++ b/iam/crates/iam-audit/src/event.rs @@ -0,0 +1,467 @@ +//! Audit event types +//! +//! Defines structured audit events for IAM operations. 
+ +use std::collections::HashMap; +use std::net::IpAddr; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use iam_types::Scope; + +/// Audit event +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AuditEvent { + /// Unique event ID + pub id: String, + + /// Event timestamp + pub timestamp: DateTime, + + /// Event kind + pub kind: AuditEventKind, + + /// Principal ID (if known) + pub principal_id: Option, + + /// Source IP address + pub source_ip: Option, + + /// Request ID (for correlation) + pub request_id: Option, + + /// Session ID + pub session_id: Option, + + /// Additional metadata + pub metadata: HashMap, +} + +/// Kind of audit event +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum AuditEventKind { + /// Authentication event + Authn(AuthnEventData), + + /// Authorization event + Authz(AuthzEventData), + + /// Policy change event + Policy(PolicyEventData), + + /// Token event (issue, revoke, refresh) + Token(TokenEventData), + + /// Administrative action + Admin(AdminEventData), +} + +/// Authentication event data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AuthnEventData { + /// Whether authentication succeeded + pub success: bool, + + /// Authentication method used (jwt, mtls, api_key, etc.) 
+ pub method: String, + + /// Failure reason (if failed) + pub failure_reason: Option, + + /// Token issuer (if JWT) + pub issuer: Option, + + /// Client certificate subject (if mTLS) + pub cert_subject: Option, +} + +/// Authorization event data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AuthzEventData { + /// Whether authorization was granted + pub allowed: bool, + + /// Action being authorized + pub action: String, + + /// Resource being accessed + pub resource_kind: String, + pub resource_id: String, + + /// Scope of the request + pub scope: Scope, + + /// Matching policy ID (if allowed) + pub policy_id: Option, + + /// Denial reason (if denied) + pub denial_reason: Option, + + /// Roles evaluated + pub roles_evaluated: Vec, +} + +/// Policy change event data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyEventData { + /// Type of change + pub change_type: PolicyChangeType, + + /// Policy ID + pub policy_id: String, + + /// Policy name + pub policy_name: Option, + + /// Scope affected + pub scope: Scope, + + /// Actor who made the change + pub actor_id: String, +} + +/// Type of policy change +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PolicyChangeType { + Created, + Updated, + Deleted, + Attached, + Detached, +} + +/// Token event data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TokenEventData { + /// Type of token event + pub event_type: TokenEventType, + + /// Token ID or session ID + pub token_id: Option, + + /// Token TTL (if issued) + pub ttl_seconds: Option, + + /// Scope of the token + pub scope: Option, +} + +/// Type of token event +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TokenEventType { + Issued, + Refreshed, + Revoked, + Expired, +} + +/// Administrative action event data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AdminEventData { + /// Type of admin action + pub action: 
String, + + /// Target entity type + pub target_type: String, + + /// Target entity ID + pub target_id: String, + + /// Details of the action + pub details: Option, +} + +impl AuditEvent { + /// Create a new audit event + pub fn new(kind: AuditEventKind) -> Self { + Self { + id: Uuid::new_v4().to_string(), + timestamp: Utc::now(), + kind, + principal_id: None, + source_ip: None, + request_id: None, + session_id: None, + metadata: HashMap::new(), + } + } + + /// Create an authentication success event + pub fn authn_success(principal_id: &str, method: &str) -> Self { + Self::new(AuditEventKind::Authn(AuthnEventData { + success: true, + method: method.to_string(), + failure_reason: None, + issuer: None, + cert_subject: None, + })) + .with_principal(principal_id) + } + + /// Create an authentication failure event + pub fn authn_failure(method: &str, reason: &str) -> Self { + Self::new(AuditEventKind::Authn(AuthnEventData { + success: false, + method: method.to_string(), + failure_reason: Some(reason.to_string()), + issuer: None, + cert_subject: None, + })) + } + + /// Create an authorization allowed event + pub fn authz_allowed( + principal_id: &str, + action: &str, + resource_kind: &str, + resource_id: &str, + scope: Scope, + ) -> Self { + Self::new(AuditEventKind::Authz(AuthzEventData { + allowed: true, + action: action.to_string(), + resource_kind: resource_kind.to_string(), + resource_id: resource_id.to_string(), + scope, + policy_id: None, + denial_reason: None, + roles_evaluated: Vec::new(), + })) + .with_principal(principal_id) + } + + /// Create an authorization denied event + pub fn authz_denied( + principal_id: &str, + action: &str, + resource_kind: &str, + resource_id: &str, + scope: Scope, + reason: &str, + ) -> Self { + Self::new(AuditEventKind::Authz(AuthzEventData { + allowed: false, + action: action.to_string(), + resource_kind: resource_kind.to_string(), + resource_id: resource_id.to_string(), + scope, + policy_id: None, + denial_reason: 
Some(reason.to_string()), + roles_evaluated: Vec::new(), + })) + .with_principal(principal_id) + } + + /// Create a token issued event + pub fn token_issued(principal_id: &str, token_id: &str, ttl_seconds: u64, scope: Scope) -> Self { + Self::new(AuditEventKind::Token(TokenEventData { + event_type: TokenEventType::Issued, + token_id: Some(token_id.to_string()), + ttl_seconds: Some(ttl_seconds), + scope: Some(scope), + })) + .with_principal(principal_id) + } + + /// Create a token revoked event + pub fn token_revoked(principal_id: &str, token_id: &str) -> Self { + Self::new(AuditEventKind::Token(TokenEventData { + event_type: TokenEventType::Revoked, + token_id: Some(token_id.to_string()), + ttl_seconds: None, + scope: None, + })) + .with_principal(principal_id) + } + + /// Create a policy created event + pub fn policy_created(actor_id: &str, policy_id: &str, policy_name: &str, scope: Scope) -> Self { + Self::new(AuditEventKind::Policy(PolicyEventData { + change_type: PolicyChangeType::Created, + policy_id: policy_id.to_string(), + policy_name: Some(policy_name.to_string()), + scope, + actor_id: actor_id.to_string(), + })) + .with_principal(actor_id) + } + + /// Create a policy deleted event + pub fn policy_deleted(actor_id: &str, policy_id: &str, scope: Scope) -> Self { + Self::new(AuditEventKind::Policy(PolicyEventData { + change_type: PolicyChangeType::Deleted, + policy_id: policy_id.to_string(), + policy_name: None, + scope, + actor_id: actor_id.to_string(), + })) + .with_principal(actor_id) + } + + /// Set principal ID + pub fn with_principal(mut self, principal_id: &str) -> Self { + self.principal_id = Some(principal_id.to_string()); + self + } + + /// Set source IP + pub fn with_source_ip(mut self, ip: IpAddr) -> Self { + self.source_ip = Some(ip); + self + } + + /// Set request ID + pub fn with_request_id(mut self, request_id: &str) -> Self { + self.request_id = Some(request_id.to_string()); + self + } + + /// Set session ID + pub fn with_session_id(mut 
self, session_id: &str) -> Self { + self.session_id = Some(session_id.to_string()); + self + } + + /// Add metadata + pub fn with_metadata(mut self, key: &str, value: &str) -> Self { + self.metadata.insert(key.to_string(), value.to_string()); + self + } + + /// Serialize to JSON + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + } + + /// Serialize to pretty JSON + pub fn to_json_pretty(&self) -> Result { + serde_json::to_string_pretty(self) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::net::Ipv4Addr; + + #[test] + fn test_authn_success_event() { + let event = AuditEvent::authn_success("alice", "jwt") + .with_source_ip(IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1))) + .with_request_id("req-123"); + + assert_eq!(event.principal_id, Some("alice".to_string())); + assert!(matches!(event.kind, AuditEventKind::Authn(ref data) if data.success)); + + let json = event.to_json().unwrap(); + assert!(json.contains("alice")); + assert!(json.contains("jwt")); + } + + #[test] + fn test_authn_failure_event() { + let event = AuditEvent::authn_failure("jwt", "Token expired"); + + assert!(event.principal_id.is_none()); + match &event.kind { + AuditEventKind::Authn(data) => { + assert!(!data.success); + assert_eq!(data.failure_reason, Some("Token expired".to_string())); + } + _ => panic!("Expected Authn event"), + } + } + + #[test] + fn test_authz_allowed_event() { + let event = AuditEvent::authz_allowed( + "alice", + "read", + "instance", + "vm-1", + Scope::project("proj-1", "org-1"), + ); + + assert_eq!(event.principal_id, Some("alice".to_string())); + match &event.kind { + AuditEventKind::Authz(data) => { + assert!(data.allowed); + assert_eq!(data.action, "read"); + } + _ => panic!("Expected Authz event"), + } + } + + #[test] + fn test_authz_denied_event() { + let event = AuditEvent::authz_denied( + "bob", + "delete", + "instance", + "vm-1", + Scope::project("proj-1", "org-1"), + "No matching policy", + ); + + match &event.kind { + 
AuditEventKind::Authz(data) => { + assert!(!data.allowed); + assert_eq!(data.denial_reason, Some("No matching policy".to_string())); + } + _ => panic!("Expected Authz event"), + } + } + + #[test] + fn test_token_issued_event() { + let event = AuditEvent::token_issued( + "alice", + "session-123", + 3600, + Scope::System, + ); + + match &event.kind { + AuditEventKind::Token(data) => { + assert!(matches!(data.event_type, TokenEventType::Issued)); + assert_eq!(data.ttl_seconds, Some(3600)); + } + _ => panic!("Expected Token event"), + } + } + + #[test] + fn test_policy_created_event() { + let event = AuditEvent::policy_created( + "admin", + "policy-1", + "AllowReadInstances", + Scope::org("org-1"), + ); + + match &event.kind { + AuditEventKind::Policy(data) => { + assert!(matches!(data.change_type, PolicyChangeType::Created)); + assert_eq!(data.policy_id, "policy-1"); + } + _ => panic!("Expected Policy event"), + } + } + + #[test] + fn test_event_serialization() { + let event = AuditEvent::authn_success("alice", "jwt") + .with_metadata("user_agent", "curl/7.68.0"); + + let json = event.to_json().unwrap(); + let parsed: AuditEvent = serde_json::from_str(&json).unwrap(); + + assert_eq!(parsed.principal_id, event.principal_id); + assert_eq!(parsed.metadata.get("user_agent"), Some(&"curl/7.68.0".to_string())); + } +} diff --git a/iam/crates/iam-audit/src/lib.rs b/iam/crates/iam-audit/src/lib.rs new file mode 100644 index 0000000..2de7fab --- /dev/null +++ b/iam/crates/iam-audit/src/lib.rs @@ -0,0 +1,32 @@ +//! Audit logging for IAM +//! +//! Provides structured audit logging for authentication and authorization events. +//! +//! ## Features +//! +//! - Multiple event types (authentication, authorization, policy changes) +//! - Pluggable sinks (file, memory, custom) +//! - Async-first design +//! - Structured JSON output +//! +//! ## Usage +//! +//! ```ignore +//! use iam_audit::{AuditLogger, AuditEvent, AuditEventKind, FileSink}; +//! +//! 
let sink = FileSink::new("/var/log/iam-audit.jsonl").await?; +//! let logger = AuditLogger::new(sink); +//! +//! logger.log(AuditEvent::authn_success("alice", "jwt")).await?; +//! ``` + +mod event; +mod logger; +mod sink; + +pub use event::{ + AdminEventData, AuditEvent, AuditEventKind, AuthnEventData, AuthzEventData, PolicyChangeType, + PolicyEventData, TokenEventData, TokenEventType, +}; +pub use logger::{AuditLogger, AuditLoggerConfig, BufferedAuditLogger}; +pub use sink::{AuditError, AuditSink, FileSink, MemorySink, MultiSink, NullSink, Result}; diff --git a/iam/crates/iam-audit/src/logger.rs b/iam/crates/iam-audit/src/logger.rs new file mode 100644 index 0000000..5c19932 --- /dev/null +++ b/iam/crates/iam-audit/src/logger.rs @@ -0,0 +1,281 @@ +//! Audit logger +//! +//! Main entry point for audit logging. + +use std::sync::Arc; + +use tokio::sync::mpsc; +use tracing::{debug, error, warn}; + +use crate::event::AuditEvent; +use crate::sink::{AuditSink, Result}; + +/// Configuration for the audit logger +#[derive(Debug, Clone)] +pub struct AuditLoggerConfig { + /// Buffer size for async logging + pub buffer_size: usize, + + /// Whether to block when buffer is full + pub block_on_full: bool, + + /// Whether to log errors to tracing + pub log_errors: bool, +} + +impl Default for AuditLoggerConfig { + fn default() -> Self { + Self { + buffer_size: 1000, + block_on_full: false, + log_errors: true, + } + } +} + +/// Audit logger +/// +/// Handles audit event logging with async processing. 
+pub struct AuditLogger { + sink: Arc, + config: AuditLoggerConfig, +} + +impl AuditLogger { + /// Create a new audit logger with a sink + pub fn new(sink: impl AuditSink + 'static) -> Self { + Self { + sink: Arc::new(sink), + config: AuditLoggerConfig::default(), + } + } + + /// Create with configuration + pub fn with_config(sink: impl AuditSink + 'static, config: AuditLoggerConfig) -> Self { + Self { + sink: Arc::new(sink), + config, + } + } + + /// Create from an Arc'd sink + pub fn from_arc(sink: Arc) -> Self { + Self { + sink, + config: AuditLoggerConfig::default(), + } + } + + /// Log an audit event + pub async fn log(&self, event: AuditEvent) -> Result<()> { + if let Err(e) = self.sink.write(&event).await { + if self.config.log_errors { + error!( + event_id = %event.id, + error = %e, + "Failed to write audit event" + ); + } + return Err(e); + } + + debug!( + event_id = %event.id, + kind = ?std::mem::discriminant(&event.kind), + "Audit event logged" + ); + + Ok(()) + } + + /// Log an event without waiting for completion + /// + /// Returns immediately. Errors are logged via tracing. + pub fn log_async(&self, event: AuditEvent) { + let sink = self.sink.clone(); + let log_errors = self.config.log_errors; + + tokio::spawn(async move { + if let Err(e) = sink.write(&event).await { + if log_errors { + error!( + event_id = %event.id, + error = %e, + "Failed to write audit event (async)" + ); + } + } + }); + } + + /// Flush the sink + pub async fn flush(&self) -> Result<()> { + self.sink.flush().await + } + + /// Close the logger + pub async fn close(&self) -> Result<()> { + self.sink.close().await + } + + /// Get the underlying sink + pub fn sink(&self) -> &Arc { + &self.sink + } +} + +/// Buffered audit logger with async processing +/// +/// Uses a channel to buffer events and process them in the background. 
+pub struct BufferedAuditLogger { + tx: mpsc::Sender, + config: AuditLoggerConfig, +} + +impl BufferedAuditLogger { + /// Create a new buffered logger + /// + /// Spawns a background task to process events. + pub fn new(sink: impl AuditSink + 'static, config: AuditLoggerConfig) -> Self { + let (tx, rx) = mpsc::channel(config.buffer_size); + let sink = Arc::new(sink); + + // Spawn background processor + tokio::spawn(Self::process_events(rx, sink, config.log_errors)); + + Self { tx, config } + } + + /// Create with default config + pub fn with_defaults(sink: impl AuditSink + 'static) -> Self { + Self::new(sink, AuditLoggerConfig::default()) + } + + /// Log an audit event + /// + /// If buffer is full, behavior depends on config.block_on_full + pub async fn log(&self, event: AuditEvent) -> bool { + if self.config.block_on_full { + self.tx.send(event).await.is_ok() + } else { + match self.tx.try_send(event) { + Ok(()) => true, + Err(mpsc::error::TrySendError::Full(event)) => { + if self.config.log_errors { + warn!( + event_id = %event.id, + "Audit buffer full, event dropped" + ); + } + false + } + Err(mpsc::error::TrySendError::Closed(_)) => { + error!("Audit logger channel closed"); + false + } + } + } + } + + /// Process events from the channel + async fn process_events( + mut rx: mpsc::Receiver, + sink: Arc, + log_errors: bool, + ) { + while let Some(event) = rx.recv().await { + if let Err(e) = sink.write(&event).await { + if log_errors { + error!( + event_id = %event.id, + error = %e, + "Failed to write audit event" + ); + } + } + } + + // Channel closed, flush and close sink + if let Err(e) = sink.flush().await { + if log_errors { + error!(error = %e, "Failed to flush audit sink on close"); + } + } + + if let Err(e) = sink.close().await { + if log_errors { + error!(error = %e, "Failed to close audit sink"); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::sink::MemorySink; + + #[tokio::test] + async fn test_audit_logger() { + let sink = 
Arc::new(MemorySink::new(100)); + let logger = AuditLogger::from_arc(sink.clone()); + + let event = AuditEvent::authn_success("alice", "jwt"); + logger.log(event).await.unwrap(); + + assert_eq!(sink.count().await, 1); + } + + #[tokio::test] + async fn test_audit_logger_async() { + let sink = Arc::new(MemorySink::new(100)); + let logger = AuditLogger::from_arc(sink.clone()); + + for i in 0..10 { + let event = AuditEvent::authn_success(&format!("user-{}", i), "jwt"); + logger.log_async(event); + } + + // Give async tasks time to complete + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + assert_eq!(sink.count().await, 10); + } + + #[tokio::test] + async fn test_buffered_logger() { + let sink = Arc::new(MemorySink::new(100)); + let logger = BufferedAuditLogger::new( + MemorySinkWrapper(sink.clone()), + AuditLoggerConfig::default(), + ); + + for i in 0..10 { + let event = AuditEvent::authn_success(&format!("user-{}", i), "jwt"); + logger.log(event).await; + } + + // Give background task time to process + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + assert_eq!(sink.count().await, 10); + } + + // Wrapper to allow Arc to be used as AuditSink + struct MemorySinkWrapper(Arc); + + #[async_trait::async_trait] + impl AuditSink for MemorySinkWrapper { + async fn write(&self, event: &AuditEvent) -> Result<()> { + self.0.write(event).await + } + + async fn flush(&self) -> Result<()> { + self.0.flush().await + } + + async fn close(&self) -> Result<()> { + self.0.close().await + } + } +} diff --git a/iam/crates/iam-audit/src/sink.rs b/iam/crates/iam-audit/src/sink.rs new file mode 100644 index 0000000..ec0e14b --- /dev/null +++ b/iam/crates/iam-audit/src/sink.rs @@ -0,0 +1,305 @@ +//! Audit sinks +//! +//! Pluggable destinations for audit events. 
+ +use std::path::Path; +use std::sync::Arc; + +use async_trait::async_trait; +use tokio::fs::{File, OpenOptions}; +use tokio::io::AsyncWriteExt; +use tokio::sync::{Mutex, RwLock}; + +use crate::event::AuditEvent; + +/// Result type for audit operations +pub type Result = std::result::Result; + +/// Audit error +#[derive(Debug, thiserror::Error)] +pub enum AuditError { + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("Serialization error: {0}")] + Serialization(#[from] serde_json::Error), + + #[error("Sink error: {0}")] + Sink(String), +} + +/// Trait for audit event sinks +#[async_trait] +pub trait AuditSink: Send + Sync { + /// Write an audit event to the sink + async fn write(&self, event: &AuditEvent) -> Result<()>; + + /// Flush any buffered events + async fn flush(&self) -> Result<()>; + + /// Close the sink + async fn close(&self) -> Result<()>; +} + +/// File-based audit sink +/// +/// Writes audit events as JSON lines to a file. +pub struct FileSink { + file: Mutex, + path: String, +} + +impl FileSink { + /// Create a new file sink + pub async fn new(path: impl AsRef) -> Result { + let path_str = path.as_ref().to_string_lossy().to_string(); + let file = OpenOptions::new() + .create(true) + .append(true) + .open(&path) + .await?; + + Ok(Self { + file: Mutex::new(file), + path: path_str, + }) + } + + /// Get the file path + pub fn path(&self) -> &str { + &self.path + } +} + +#[async_trait] +impl AuditSink for FileSink { + async fn write(&self, event: &AuditEvent) -> Result<()> { + let json = event.to_json()?; + let line = format!("{}\n", json); + + let mut file = self.file.lock().await; + file.write_all(line.as_bytes()).await?; + + Ok(()) + } + + async fn flush(&self) -> Result<()> { + let mut file = self.file.lock().await; + file.flush().await?; + Ok(()) + } + + async fn close(&self) -> Result<()> { + self.flush().await + } +} + +/// In-memory audit sink (for testing) +pub struct MemorySink { + events: RwLock>, + max_events: usize, +} + 
+impl MemorySink { + /// Create a new memory sink + pub fn new(max_events: usize) -> Self { + Self { + events: RwLock::new(Vec::new()), + max_events, + } + } + + /// Create with default max events (10000) + pub fn default_capacity() -> Self { + Self::new(10000) + } + + /// Get all events + pub async fn events(&self) -> Vec { + self.events.read().await.clone() + } + + /// Get event count + pub async fn count(&self) -> usize { + self.events.read().await.len() + } + + /// Clear all events + pub async fn clear(&self) { + self.events.write().await.clear(); + } + + /// Find events by principal ID + pub async fn find_by_principal(&self, principal_id: &str) -> Vec { + self.events + .read() + .await + .iter() + .filter(|e| e.principal_id.as_deref() == Some(principal_id)) + .cloned() + .collect() + } +} + +#[async_trait] +impl AuditSink for MemorySink { + async fn write(&self, event: &AuditEvent) -> Result<()> { + let mut events = self.events.write().await; + + // If at capacity, remove oldest event + if events.len() >= self.max_events { + events.remove(0); + } + + events.push(event.clone()); + Ok(()) + } + + async fn flush(&self) -> Result<()> { + Ok(()) + } + + async fn close(&self) -> Result<()> { + Ok(()) + } +} + +/// Multi-sink that writes to multiple sinks +pub struct MultiSink { + sinks: Vec>, +} + +impl MultiSink { + /// Create a new multi-sink + pub fn new(sinks: Vec>) -> Self { + Self { sinks } + } + + /// Add a sink + pub fn add(&mut self, sink: Arc) { + self.sinks.push(sink); + } +} + +#[async_trait] +impl AuditSink for MultiSink { + async fn write(&self, event: &AuditEvent) -> Result<()> { + for sink in &self.sinks { + sink.write(event).await?; + } + Ok(()) + } + + async fn flush(&self) -> Result<()> { + for sink in &self.sinks { + sink.flush().await?; + } + Ok(()) + } + + async fn close(&self) -> Result<()> { + for sink in &self.sinks { + sink.close().await?; + } + Ok(()) + } +} + +/// Null sink that discards all events +pub struct NullSink; + +#[async_trait] 
+impl AuditSink for NullSink { + async fn write(&self, _event: &AuditEvent) -> Result<()> { + Ok(()) + } + + async fn flush(&self) -> Result<()> { + Ok(()) + } + + async fn close(&self) -> Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[tokio::test] + async fn test_file_sink() { + let dir = tempdir().unwrap(); + let path = dir.path().join("audit.jsonl"); + + let sink = FileSink::new(&path).await.unwrap(); + + let event = AuditEvent::authn_success("alice", "jwt"); + sink.write(&event).await.unwrap(); + sink.flush().await.unwrap(); + + // Read back and verify + let contents = tokio::fs::read_to_string(&path).await.unwrap(); + assert!(contents.contains("alice")); + assert!(contents.contains("jwt")); + } + + #[tokio::test] + async fn test_memory_sink() { + let sink = MemorySink::new(100); + + let event1 = AuditEvent::authn_success("alice", "jwt"); + let event2 = AuditEvent::authn_success("bob", "mtls"); + + sink.write(&event1).await.unwrap(); + sink.write(&event2).await.unwrap(); + + assert_eq!(sink.count().await, 2); + + let alice_events = sink.find_by_principal("alice").await; + assert_eq!(alice_events.len(), 1); + } + + #[tokio::test] + async fn test_memory_sink_capacity() { + let sink = MemorySink::new(2); + + for i in 0..5 { + let event = AuditEvent::authn_success(&format!("user-{}", i), "jwt"); + sink.write(&event).await.unwrap(); + } + + // Should only have last 2 events + assert_eq!(sink.count().await, 2); + + let events = sink.events().await; + assert_eq!(events[0].principal_id, Some("user-3".to_string())); + assert_eq!(events[1].principal_id, Some("user-4".to_string())); + } + + #[tokio::test] + async fn test_multi_sink() { + let sink1 = Arc::new(MemorySink::new(100)); + let sink2 = Arc::new(MemorySink::new(100)); + + let multi = MultiSink::new(vec![sink1.clone(), sink2.clone()]); + + let event = AuditEvent::authn_success("alice", "jwt"); + multi.write(&event).await.unwrap(); + + 
assert_eq!(sink1.count().await, 1); + assert_eq!(sink2.count().await, 1); + } + + #[tokio::test] + async fn test_null_sink() { + let sink = NullSink; + let event = AuditEvent::authn_success("alice", "jwt"); + + // Should not error + sink.write(&event).await.unwrap(); + sink.flush().await.unwrap(); + sink.close().await.unwrap(); + } +} diff --git a/iam/crates/iam-authn/Cargo.toml b/iam/crates/iam-authn/Cargo.toml new file mode 100644 index 0000000..56c3e47 --- /dev/null +++ b/iam/crates/iam-authn/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "iam-authn" +version.workspace = true +edition.workspace = true +description = "Authentication layer for IAM" +license.workspace = true + +[dependencies] +iam-types = { path = "../iam-types" } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +async-trait = { workspace = true } +jsonwebtoken = { workspace = true } +reqwest = { workspace = true } +base64 = { workspace = true } +rand = { workspace = true } +hmac = { workspace = true } +sha2 = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["full", "test-util"] } diff --git a/iam/crates/iam-authn/src/jwt.rs b/iam/crates/iam-authn/src/jwt.rs new file mode 100644 index 0000000..20e6eba --- /dev/null +++ b/iam/crates/iam-authn/src/jwt.rs @@ -0,0 +1,409 @@ +//! JWT/OIDC token verification +//! +//! Provides JWT signature verification with JWKS (JSON Web Key Set) support. 
+ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use base64::Engine; +use jsonwebtoken::{decode, decode_header, Algorithm, DecodingKey, Validation}; +use serde::Deserialize; +use tokio::sync::RwLock; + +use iam_types::{Error, IamError, JwtClaims, Result, TokenValidationError}; + +/// JWKS (JSON Web Key Set) structure +#[derive(Debug, Clone, Deserialize)] +pub struct Jwks { + pub keys: Vec, +} + +/// Individual JWK (JSON Web Key) +#[derive(Debug, Clone, Deserialize)] +pub struct Jwk { + pub kty: String, + pub kid: Option, + #[serde(rename = "use")] + pub key_use: Option, + pub alg: Option, + pub n: Option, // RSA modulus + pub e: Option, // RSA exponent + pub x: Option, // EC x coordinate + pub y: Option, // EC y coordinate + pub crv: Option, // EC curve +} + +impl Jwk { + /// Convert to jsonwebtoken DecodingKey + pub fn to_decoding_key(&self) -> std::result::Result { + match self.kty.as_str() { + "RSA" => { + let n = self + .n + .as_ref() + .ok_or_else(|| TokenValidationError::Malformed("RSA key missing n".into()))?; + let e = self + .e + .as_ref() + .ok_or_else(|| TokenValidationError::Malformed("RSA key missing e".into()))?; + DecodingKey::from_rsa_components(n, e) + .map_err(|e| TokenValidationError::Malformed(e.to_string())) + } + "EC" => { + let x = self + .x + .as_ref() + .ok_or_else(|| TokenValidationError::Malformed("EC key missing x".into()))?; + let y = self + .y + .as_ref() + .ok_or_else(|| TokenValidationError::Malformed("EC key missing y".into()))?; + DecodingKey::from_ec_components(x, y) + .map_err(|e| TokenValidationError::Malformed(e.to_string())) + } + other => Err(TokenValidationError::Malformed(format!( + "Unsupported key type: {}", + other + ))), + } + } + + /// Get the algorithm for this key + pub fn algorithm(&self) -> std::result::Result { + match self.alg.as_deref() { + Some("RS256") => Ok(Algorithm::RS256), + Some("RS384") => Ok(Algorithm::RS384), + Some("RS512") => Ok(Algorithm::RS512), + 
Some("ES256") => Ok(Algorithm::ES256), + Some("ES384") => Ok(Algorithm::ES384), + Some(other) => Err(TokenValidationError::Malformed(format!( + "Unsupported algorithm: {}", + other + ))), + None => { + // Infer from key type + match self.kty.as_str() { + "RSA" => Ok(Algorithm::RS256), + "EC" => match self.crv.as_deref() { + Some("P-256") => Ok(Algorithm::ES256), + Some("P-384") => Ok(Algorithm::ES384), + _ => Ok(Algorithm::ES256), + }, + _ => Err(TokenValidationError::Malformed( + "Cannot infer algorithm".into(), + )), + } + } + } + } +} + +/// Cached JWKS with expiration +struct CachedJwks { + jwks: Jwks, + fetched_at: Instant, +} + +/// Configuration for JWT verifier +#[derive(Debug, Clone)] +pub struct JwtVerifierConfig { + /// JWKS endpoint URL + pub jwks_url: String, + /// Expected issuer + pub issuer: String, + /// Expected audience + pub audience: String, + /// JWKS cache TTL + pub jwks_cache_ttl: Duration, + /// Allow clock skew (seconds) + pub clock_skew_seconds: u64, +} + +impl JwtVerifierConfig { + /// Create a new config + pub fn new( + jwks_url: impl Into, + issuer: impl Into, + audience: impl Into, + ) -> Self { + Self { + jwks_url: jwks_url.into(), + issuer: issuer.into(), + audience: audience.into(), + jwks_cache_ttl: Duration::from_secs(3600), // 1 hour default + clock_skew_seconds: 60, + } + } + + /// Set JWKS cache TTL + pub fn with_cache_ttl(mut self, ttl: Duration) -> Self { + self.jwks_cache_ttl = ttl; + self + } + + /// Set allowed clock skew + pub fn with_clock_skew(mut self, seconds: u64) -> Self { + self.clock_skew_seconds = seconds; + self + } +} + +/// JWT token verifier with JWKS support +pub struct JwtVerifier { + config: JwtVerifierConfig, + jwks_cache: RwLock>, + http_client: reqwest::Client, +} + +impl JwtVerifier { + /// Create a new JWT verifier + pub fn new(config: JwtVerifierConfig) -> Self { + Self { + config, + jwks_cache: RwLock::new(None), + http_client: reqwest::Client::new(), + } + } + + /// Verify a JWT token and return 
its claims + pub async fn verify(&self, token: &str) -> Result { + // 1. Decode header to get kid + let header = + decode_header(token).map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + // 2. Get the signing key + let jwks = self.get_jwks().await?; + let key = self.find_key(&jwks, header.kid.as_deref())?; + + // 3. Build validation + let mut validation = Validation::new( + key.algorithm() + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?, + ); + validation.set_issuer(&[&self.config.issuer]); + validation.set_audience(&[&self.config.audience]); + validation.leeway = self.config.clock_skew_seconds; + + // 4. Decode and verify + let decoding_key = key + .to_decoding_key() + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + let token_data = decode::(token, &decoding_key, &validation) + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + Ok(token_data.claims) + } + + /// Verify a token and validate specific claims + pub async fn verify_with_validation( + &self, + token: &str, + additional_validation: F, + ) -> Result + where + F: FnOnce(&JwtClaims) -> Result<()>, + { + let claims = self.verify(token).await?; + additional_validation(&claims)?; + Ok(claims) + } + + /// Get JWKS (from cache or fetch) + async fn get_jwks(&self) -> Result { + // Check cache + { + let cache = self.jwks_cache.read().await; + if let Some(cached) = cache.as_ref() { + if cached.fetched_at.elapsed() < self.config.jwks_cache_ttl { + return Ok(cached.jwks.clone()); + } + } + } + + // Fetch new JWKS + let jwks = self.fetch_jwks().await?; + + // Update cache + { + let mut cache = self.jwks_cache.write().await; + *cache = Some(CachedJwks { + jwks: jwks.clone(), + fetched_at: Instant::now(), + }); + } + + Ok(jwks) + } + + /// Fetch JWKS from the configured URL + async fn fetch_jwks(&self) -> Result { + let response = self + .http_client + .get(&self.config.jwks_url) + .send() + .await + .map_err(|e| Error::Internal(format!("Failed 
to fetch JWKS: {}", e)))?; + + if !response.status().is_success() { + return Err(Error::Internal(format!( + "JWKS fetch failed with status: {}", + response.status() + ))); + } + + let jwks: Jwks = response + .json() + .await + .map_err(|e| Error::Internal(format!("Failed to parse JWKS: {}", e)))?; + + Ok(jwks) + } + + /// Find a key in JWKS by kid + fn find_key<'a>(&self, jwks: &'a Jwks, kid: Option<&str>) -> Result<&'a Jwk> { + match kid { + Some(kid) => jwks + .keys + .iter() + .find(|k| k.kid.as_deref() == Some(kid)) + .ok_or_else(|| { + Error::Iam(IamError::InvalidToken(format!("Key not found: {}", kid))) + }), + None => { + // If no kid, use the first key (common for single-key JWKS) + jwks.keys + .first() + .ok_or_else(|| Error::Iam(IamError::InvalidToken("JWKS has no keys".into()))) + } + } + } + + /// Force refresh the JWKS cache + pub async fn refresh_jwks(&self) -> Result<()> { + let jwks = self.fetch_jwks().await?; + let mut cache = self.jwks_cache.write().await; + *cache = Some(CachedJwks { + jwks, + fetched_at: Instant::now(), + }); + Ok(()) + } +} + +/// Multi-issuer JWT verifier +pub struct MultiIssuerJwtVerifier { + verifiers: HashMap>, +} + +impl MultiIssuerJwtVerifier { + /// Create a new multi-issuer verifier + pub fn new() -> Self { + Self { + verifiers: HashMap::new(), + } + } + + /// Add a verifier for an issuer + pub fn add_issuer(&mut self, issuer: impl Into, verifier: JwtVerifier) { + self.verifiers.insert(issuer.into(), Arc::new(verifier)); + } + + /// Verify a token (automatically selects issuer from token) + pub async fn verify(&self, token: &str) -> Result { + // Extract issuer from token without verification + let issuer = extract_issuer_from_token(token)?; + + let verifier = self.verifiers.get(&issuer).ok_or_else(|| { + Error::Iam(IamError::InvalidToken(format!( + "Unknown issuer: {}", + issuer + ))) + })?; + + verifier.verify(token).await + } +} + +/// Extract issuer from JWT without verification (for routing purposes) +fn 
extract_issuer_from_token(token: &str) -> Result { + let parts: Vec<&str> = token.split('.').collect(); + if parts.len() != 3 { + return Err(Error::Iam(IamError::InvalidToken( + "Invalid JWT format".into(), + ))); + } + + // Decode payload (middle part) + let payload_json = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(parts[1]) + .map_err(|e| Error::Iam(IamError::InvalidToken(format!("Invalid base64: {}", e))))?; + + let claims: UnverifiedClaims = serde_json::from_slice(&payload_json) + .map_err(|e| Error::Iam(IamError::InvalidToken(format!("Invalid JSON: {}", e))))?; + + Ok(claims.iss) +} + +impl Default for MultiIssuerJwtVerifier { + fn default() -> Self { + Self::new() + } +} + +/// Minimal claims for issuer detection +#[derive(Debug, Deserialize)] +struct UnverifiedClaims { + iss: String, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_jwk_algorithm_inference() { + let rsa_key = Jwk { + kty: "RSA".into(), + kid: Some("key1".into()), + key_use: Some("sig".into()), + alg: None, + n: Some("abc".into()), + e: Some("AQAB".into()), + x: None, + y: None, + crv: None, + }; + + assert!(matches!(rsa_key.algorithm(), Ok(Algorithm::RS256))); + + let ec_key = Jwk { + kty: "EC".into(), + kid: Some("key2".into()), + key_use: Some("sig".into()), + alg: Some("ES256".into()), + n: None, + e: None, + x: Some("x".into()), + y: Some("y".into()), + crv: Some("P-256".into()), + }; + + assert!(matches!(ec_key.algorithm(), Ok(Algorithm::ES256))); + } + + #[test] + fn test_verifier_config() { + let config = JwtVerifierConfig::new( + "https://example.com/.well-known/jwks.json", + "https://example.com", + "my-api", + ) + .with_cache_ttl(Duration::from_secs(7200)) + .with_clock_skew(120); + + assert_eq!(config.jwks_cache_ttl, Duration::from_secs(7200)); + assert_eq!(config.clock_skew_seconds, 120); + } +} diff --git a/iam/crates/iam-authn/src/lib.rs b/iam/crates/iam-authn/src/lib.rs new file mode 100644 index 0000000..3287df2 --- /dev/null +++ 
b/iam/crates/iam-authn/src/lib.rs @@ -0,0 +1,22 @@ +//! Authentication layer for IAM +//! +//! Provides JWT/OIDC verification, mTLS authentication, and internal token management. + +pub mod jwt; +pub mod mtls; +pub mod provider; +pub mod token; + +pub use jwt::{Jwk, Jwks, JwtVerifier, JwtVerifierConfig, MultiIssuerJwtVerifier}; +pub use mtls::{ + CertificateInfo, MtlsAuthResult, MtlsVerifier, MtlsVerifierConfig, PrincipalMapping, + SubjectAltName, +}; +pub use provider::{ + extract_credentials_from_headers, AuthnCredentials, AuthnProvider, AuthnResult, + CombinedAuthProvider, +}; +pub use token::{ + InternalTokenConfig, InternalTokenService, IssuedToken, KeyRotationConfig, + KeyRotationManager, KeyRotationStats, KeyStatus, ManagedKey, SigningKey, TsoClient, +}; diff --git a/iam/crates/iam-authn/src/mtls.rs b/iam/crates/iam-authn/src/mtls.rs new file mode 100644 index 0000000..68dc7d5 --- /dev/null +++ b/iam/crates/iam-authn/src/mtls.rs @@ -0,0 +1,353 @@ +//! mTLS (Mutual TLS) authentication +//! +//! Provides certificate-based authentication for service-to-service communication. 
+ +use std::collections::HashMap; + +use iam_types::{Error, IamError, PrincipalKind, PrincipalRef, Result}; + +/// Certificate information extracted from mTLS connection +#[derive(Debug, Clone)] +pub struct CertificateInfo { + /// Common Name (CN) from the certificate subject + pub common_name: String, + /// Organization (O) from the certificate subject + pub organization: Option, + /// Organizational Unit (OU) from the certificate subject + pub organizational_unit: Option, + /// Serial number of the certificate + pub serial_number: String, + /// Certificate fingerprint (SHA-256) + pub fingerprint: String, + /// Not before timestamp (Unix seconds) + pub not_before: u64, + /// Not after timestamp (Unix seconds) + pub not_after: u64, + /// Subject Alternative Names (SANs) + pub sans: Vec, +} + +/// Subject Alternative Name types +#[derive(Debug, Clone)] +pub enum SubjectAltName { + /// DNS name + Dns(String), + /// URI + Uri(String), + /// IP address + Ip(String), + /// Email + Email(String), +} + +/// Configuration for mTLS verifier +#[derive(Debug, Clone)] +pub struct MtlsVerifierConfig { + /// Mapping from CN patterns to principal references + /// Pattern can use * as wildcard + pub cn_mappings: HashMap, + /// Required organization + pub required_org: Option, + /// Required organizational unit + pub required_ou: Option, + /// Whether to validate certificate expiration + pub validate_expiration: bool, +} + +/// Mapping configuration for a CN pattern +#[derive(Debug, Clone)] +pub struct PrincipalMapping { + /// Principal kind to assign + pub kind: PrincipalKind, + /// ID template (can use {cn}, {ou}, {o} placeholders) + pub id_template: String, + /// Optional node ID extraction from CN + pub node_id_from_cn: bool, +} + +impl Default for MtlsVerifierConfig { + fn default() -> Self { + Self { + cn_mappings: HashMap::new(), + required_org: None, + required_ou: None, + validate_expiration: true, + } + } +} + +impl MtlsVerifierConfig { + /// Create a new config + pub fn 
new() -> Self { + Self::default() + } + + /// Add a CN to principal mapping + pub fn add_mapping(mut self, cn_pattern: impl Into, mapping: PrincipalMapping) -> Self { + self.cn_mappings.insert(cn_pattern.into(), mapping); + self + } + + /// Set required organization + pub fn with_required_org(mut self, org: impl Into) -> Self { + self.required_org = Some(org.into()); + self + } + + /// Set required OU + pub fn with_required_ou(mut self, ou: impl Into) -> Self { + self.required_ou = Some(ou.into()); + self + } +} + +/// mTLS certificate verifier +pub struct MtlsVerifier { + config: MtlsVerifierConfig, +} + +impl MtlsVerifier { + /// Create a new mTLS verifier + pub fn new(config: MtlsVerifierConfig) -> Self { + Self { config } + } + + /// Create with default service account mapping + pub fn with_default_sa_mapping() -> Self { + let config = MtlsVerifierConfig::new() + .add_mapping( + "*.service.internal", + PrincipalMapping { + kind: PrincipalKind::ServiceAccount, + id_template: "{cn}".into(), + node_id_from_cn: false, + }, + ) + .add_mapping( + "node-*.compute.internal", + PrincipalMapping { + kind: PrincipalKind::ServiceAccount, + id_template: "compute-agent".into(), + node_id_from_cn: true, + }, + ); + + Self::new(config) + } + + /// Verify a certificate and return the authenticated principal + pub fn verify(&self, cert_info: &CertificateInfo) -> Result { + // 1. Validate expiration + if self.config.validate_expiration { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + + if now < cert_info.not_before { + return Err(Error::Iam(IamError::InvalidToken( + "Certificate not yet valid".into(), + ))); + } + + if now > cert_info.not_after { + return Err(Error::Iam(IamError::InvalidToken( + "Certificate expired".into(), + ))); + } + } + + // 2. 
Validate organization if required + if let Some(required_org) = &self.config.required_org { + match &cert_info.organization { + Some(org) if org == required_org => {} + Some(org) => { + return Err(Error::Iam(IamError::AuthnFailed(format!( + "Invalid organization: expected {}, got {}", + required_org, org + )))); + } + None => { + return Err(Error::Iam(IamError::AuthnFailed( + "Certificate missing organization".into(), + ))); + } + } + } + + // 3. Validate OU if required + if let Some(required_ou) = &self.config.required_ou { + match &cert_info.organizational_unit { + Some(ou) if ou == required_ou => {} + Some(ou) => { + return Err(Error::Iam(IamError::AuthnFailed(format!( + "Invalid organizational unit: expected {}, got {}", + required_ou, ou + )))); + } + None => { + return Err(Error::Iam(IamError::AuthnFailed( + "Certificate missing organizational unit".into(), + ))); + } + } + } + + // 4. Find matching CN pattern + let (_pattern, mapping) = self + .find_matching_pattern(&cert_info.common_name) + .ok_or_else(|| { + Error::Iam(IamError::AuthnFailed(format!( + "No mapping for CN: {}", + cert_info.common_name + ))) + })?; + + // 5. Build principal reference + let principal_id = self.expand_template(&mapping.id_template, cert_info); + + let principal_ref = PrincipalRef::new(mapping.kind.clone(), principal_id); + + // 6. 
Extract node ID if configured + let node_id = if mapping.node_id_from_cn { + Some(self.extract_node_id(&cert_info.common_name)) + } else { + None + }; + + Ok(MtlsAuthResult { + principal_ref, + node_id, + certificate_fingerprint: cert_info.fingerprint.clone(), + }) + } + + fn find_matching_pattern(&self, cn: &str) -> Option<(&String, &PrincipalMapping)> { + for (pattern, mapping) in &self.config.cn_mappings { + if self.matches_pattern(pattern, cn) { + return Some((pattern, mapping)); + } + } + None + } + + fn matches_pattern(&self, pattern: &str, value: &str) -> bool { + if pattern == "*" { + return true; + } + + if !pattern.contains('*') { + return pattern == value; + } + + // Simple glob matching + let parts: Vec<&str> = pattern.split('*').collect(); + if parts.len() == 2 { + let (prefix, suffix) = (parts[0], parts[1]); + return value.starts_with(prefix) && value.ends_with(suffix); + } + + // For more complex patterns, do exact match + pattern == value + } + + fn expand_template(&self, template: &str, cert_info: &CertificateInfo) -> String { + let mut result = template.to_string(); + result = result.replace("{cn}", &cert_info.common_name); + if let Some(o) = &cert_info.organization { + result = result.replace("{o}", o); + } + if let Some(ou) = &cert_info.organizational_unit { + result = result.replace("{ou}", ou); + } + result + } + + fn extract_node_id(&self, cn: &str) -> String { + // Extract node ID from CN like "node-abc123.compute.internal" + cn.split('.').next().unwrap_or(cn).to_string() + } +} + +/// Result of mTLS authentication +#[derive(Debug, Clone)] +pub struct MtlsAuthResult { + /// Authenticated principal reference + pub principal_ref: PrincipalRef, + /// Node ID (if applicable) + pub node_id: Option, + /// Certificate fingerprint + pub certificate_fingerprint: String, +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_cert_info() -> CertificateInfo { + CertificateInfo { + common_name: "compute-agent.service.internal".into(), + 
organization: Some("cloud-platform".into()), + organizational_unit: Some("compute".into()), + serial_number: "123456".into(), + fingerprint: "sha256:abc123".into(), + not_before: 0, + not_after: u64::MAX, + sans: vec![], + } + } + + #[test] + fn test_mtls_verification() { + let config = MtlsVerifierConfig::new() + .add_mapping( + "*.service.internal", + PrincipalMapping { + kind: PrincipalKind::ServiceAccount, + id_template: "{cn}".into(), + node_id_from_cn: false, + }, + ) + .with_required_org("cloud-platform"); + + let verifier = MtlsVerifier::new(config); + let cert = test_cert_info(); + + let result = verifier.verify(&cert).unwrap(); + assert_eq!(result.principal_ref.kind, PrincipalKind::ServiceAccount); + assert_eq!(result.principal_ref.id, "compute-agent.service.internal"); + } + + #[test] + fn test_node_id_extraction() { + let config = MtlsVerifierConfig::new().add_mapping( + "node-*.compute.internal", + PrincipalMapping { + kind: PrincipalKind::ServiceAccount, + id_template: "compute-agent".into(), + node_id_from_cn: true, + }, + ); + + let verifier = MtlsVerifier::new(config); + + let mut cert = test_cert_info(); + cert.common_name = "node-abc123.compute.internal".into(); + cert.organization = None; + + let result = verifier.verify(&cert).unwrap(); + assert_eq!(result.node_id, Some("node-abc123".into())); + } + + #[test] + fn test_pattern_matching() { + let verifier = MtlsVerifier::new(MtlsVerifierConfig::default()); + + assert!(verifier.matches_pattern("*.example.com", "foo.example.com")); + assert!(verifier.matches_pattern("*.example.com", "bar.example.com")); + assert!(!verifier.matches_pattern("*.example.com", "foo.other.com")); + assert!(verifier.matches_pattern("exact-match", "exact-match")); + assert!(!verifier.matches_pattern("exact-match", "other")); + assert!(verifier.matches_pattern("*", "anything")); + } +} diff --git a/iam/crates/iam-authn/src/provider.rs b/iam/crates/iam-authn/src/provider.rs new file mode 100644 index 0000000..d49cd0e --- 
/dev/null +++ b/iam/crates/iam-authn/src/provider.rs @@ -0,0 +1,245 @@ +//! Authentication provider trait and implementations +//! +//! Defines a unified interface for different authentication methods. + +use async_trait::async_trait; +use std::sync::Arc; + +use iam_types::{ + AuthMethod, Error, IamError, InternalTokenClaims, JwtClaims, PrincipalRef, Result, +}; + +use crate::jwt::JwtVerifier; +use crate::mtls::{CertificateInfo, MtlsVerifier}; +use crate::token::InternalTokenService; + +/// Result of authentication +#[derive(Debug, Clone)] +pub struct AuthnResult { + /// Authenticated principal reference + pub principal_ref: PrincipalRef, + /// Authentication method used + pub auth_method: AuthMethod, + /// Node ID (for service accounts) + pub node_id: Option, + /// Organization ID + pub org_id: Option, + /// Project ID + pub project_id: Option, + /// Groups (from JWT) + pub groups: Vec, + /// Original claims (if JWT) + pub jwt_claims: Option, + /// Internal token claims (if internal token) + pub internal_claims: Option, +} + +/// Authentication credentials +#[derive(Debug, Clone)] +pub enum AuthnCredentials { + /// Bearer token (JWT or internal token) + BearerToken(String), + /// mTLS certificate info + Certificate(CertificateInfo), + /// API key + ApiKey(String), +} + +/// Authentication provider trait +#[async_trait] +pub trait AuthnProvider: Send + Sync { + /// Authenticate using the provided credentials + async fn authenticate(&self, credentials: &AuthnCredentials) -> Result; +} + +/// Combined authentication provider supporting multiple methods +pub struct CombinedAuthProvider { + jwt_verifier: Option>, + mtls_verifier: Option>, + internal_token_service: Option>, +} + +impl CombinedAuthProvider { + /// Create a new combined auth provider + pub fn new() -> Self { + Self { + jwt_verifier: None, + mtls_verifier: None, + internal_token_service: None, + } + } + + /// Add JWT verifier + pub fn with_jwt(mut self, verifier: JwtVerifier) -> Self { + self.jwt_verifier 
= Some(Arc::new(verifier)); + self + } + + /// Add mTLS verifier + pub fn with_mtls(mut self, verifier: MtlsVerifier) -> Self { + self.mtls_verifier = Some(Arc::new(verifier)); + self + } + + /// Add internal token service + pub fn with_internal_token(mut self, service: InternalTokenService) -> Self { + self.internal_token_service = Some(Arc::new(service)); + self + } + + /// Authenticate a bearer token + async fn authenticate_bearer(&self, token: &str) -> Result { + // Try internal token first (faster, local verification) + if let Some(internal_service) = &self.internal_token_service { + if let Ok(claims) = internal_service.verify(token).await { + return Ok(AuthnResult { + principal_ref: PrincipalRef::new( + claims.principal_kind.clone(), + &claims.principal_id, + ), + auth_method: AuthMethod::Internal, + node_id: claims.node_id.clone(), + org_id: claims.org_id.clone(), + project_id: claims.project_id.clone(), + groups: vec![], + jwt_claims: None, + internal_claims: Some(claims), + }); + } + } + + // Try JWT verification + if let Some(jwt_verifier) = &self.jwt_verifier { + let claims = jwt_verifier.verify(token).await?; + + // Map JWT sub to principal + let principal_ref = PrincipalRef::user(&claims.sub); + + return Ok(AuthnResult { + principal_ref, + auth_method: AuthMethod::Jwt, + node_id: None, + org_id: claims.org_id.clone(), + project_id: claims.project_id.clone(), + groups: claims.groups.clone(), + jwt_claims: Some(claims), + internal_claims: None, + }); + } + + Err(Error::Iam(IamError::AuthnFailed( + "No authentication provider configured for bearer tokens".into(), + ))) + } + + /// Authenticate using certificate + fn authenticate_certificate(&self, cert_info: &CertificateInfo) -> Result { + let mtls_verifier = self.mtls_verifier.as_ref().ok_or_else(|| { + Error::Iam(IamError::AuthnFailed( + "mTLS authentication not configured".into(), + )) + })?; + + let result = mtls_verifier.verify(cert_info)?; + + Ok(AuthnResult { + principal_ref: result.principal_ref, 
+ auth_method: AuthMethod::Mtls, + node_id: result.node_id, + org_id: None, + project_id: None, + groups: vec![], + jwt_claims: None, + internal_claims: None, + }) + } + + /// Authenticate using API key + async fn authenticate_api_key(&self, _api_key: &str) -> Result { + // API key authentication would typically: + // 1. Look up the API key in the store + // 2. Verify it's valid and not expired + // 3. Return the associated principal + + // For now, this is a stub + Err(Error::Iam(IamError::AuthnFailed( + "API key authentication not yet implemented".into(), + ))) + } +} + +impl Default for CombinedAuthProvider { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl AuthnProvider for CombinedAuthProvider { + async fn authenticate(&self, credentials: &AuthnCredentials) -> Result { + match credentials { + AuthnCredentials::BearerToken(token) => self.authenticate_bearer(token).await, + AuthnCredentials::Certificate(cert_info) => self.authenticate_certificate(cert_info), + AuthnCredentials::ApiKey(key) => self.authenticate_api_key(key).await, + } + } +} + +/// Extract authentication credentials from HTTP headers +pub fn extract_credentials_from_headers( + authorization: Option<&str>, + cert_info: Option, +) -> Option { + // Check for mTLS first (if certificate is provided) + if let Some(cert) = cert_info { + return Some(AuthnCredentials::Certificate(cert)); + } + + // Check Authorization header + if let Some(auth_header) = authorization { + if let Some(token) = auth_header.strip_prefix("Bearer ") { + return Some(AuthnCredentials::BearerToken(token.to_string())); + } + if let Some(key) = auth_header.strip_prefix("ApiKey ") { + return Some(AuthnCredentials::ApiKey(key.to_string())); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_combined_provider_no_config() { + let provider = CombinedAuthProvider::new(); + + let result = provider + .authenticate(&AuthnCredentials::BearerToken("some-token".into())) + 
.await; + + assert!(result.is_err()); + } + + #[test] + fn test_extract_bearer_token() { + let creds = extract_credentials_from_headers(Some("Bearer abc123"), None); + + assert!(matches!( + creds, + Some(AuthnCredentials::BearerToken(t)) if t == "abc123" + )); + } + + #[test] + fn test_extract_api_key() { + let creds = extract_credentials_from_headers(Some("ApiKey secret-key"), None); + + assert!(matches!( + creds, + Some(AuthnCredentials::ApiKey(k)) if k == "secret-key" + )); + } +} diff --git a/iam/crates/iam-authn/src/token.rs b/iam/crates/iam-authn/src/token.rs new file mode 100644 index 0000000..f22adcd --- /dev/null +++ b/iam/crates/iam-authn/src/token.rs @@ -0,0 +1,856 @@ +//! Internal token service +//! +//! Issues and verifies internal tokens for service-to-service authentication. +//! Can optionally use TSO (Timestamp Oracle) for consistent timestamps. +//! +//! ## Key Rotation +//! +//! The service supports multiple signing keys for seamless rotation: +//! - New tokens are always signed with the "active" key +//! - Old tokens can be verified with "deprecated" keys during grace period +//! 
- "Retired" keys are no longer used for verification + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine}; +use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; + +use iam_types::{ + AuthMethod, Error, IamError, InternalTokenClaims, Principal, Result, Scope, +}; + +/// Key status for rotation management +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum KeyStatus { + /// Key is actively used for signing new tokens + Active, + /// Key is deprecated but still valid for verification (grace period) + Deprecated, + /// Key is retired and no longer valid + Retired, +} + +/// Managed signing key with status and timestamps +#[derive(Clone)] +pub struct ManagedKey { + /// The underlying signing key + pub key: SigningKey, + /// Key status + pub status: KeyStatus, + /// When the key was created + pub created_at: u64, + /// When the key was deprecated (if applicable) + pub deprecated_at: Option, + /// When the key was retired (if applicable) + pub retired_at: Option, +} + +/// Token signing key +#[derive(Clone)] +pub struct SigningKey { + /// Key ID + pub kid: String, + /// Secret key bytes + secret: Vec, +} + +impl SigningKey { + /// Create a new signing key + pub fn new(kid: impl Into, secret: impl Into>) -> Self { + Self { + kid: kid.into(), + secret: secret.into(), + } + } + + /// Generate a random signing key + pub fn generate(kid: impl Into) -> Self { + use rand::RngCore; + let mut secret = vec![0u8; 32]; + rand::thread_rng().fill_bytes(&mut secret); + Self::new(kid, secret) + } + + /// Sign data using HMAC-SHA256 + pub fn sign(&self, data: &[u8]) -> Vec { + use hmac::{Hmac, Mac}; + use sha2::Sha256; + + type HmacSha256 = Hmac; + + let mut mac = + HmacSha256::new_from_slice(&self.secret).expect("HMAC can take key of any size"); + mac.update(data); + mac.finalize().into_bytes().to_vec() + } + + /// Verify a 
signature + pub fn verify(&self, data: &[u8], signature: &[u8]) -> bool { + use hmac::{Hmac, Mac}; + use sha2::Sha256; + + type HmacSha256 = Hmac; + + let mut mac = + HmacSha256::new_from_slice(&self.secret).expect("HMAC can take key of any size"); + mac.update(data); + + mac.verify_slice(signature).is_ok() + } +} + +impl ManagedKey { + /// Create a new active managed key + pub fn new_active(key: SigningKey, created_at: u64) -> Self { + Self { + key, + status: KeyStatus::Active, + created_at, + deprecated_at: None, + retired_at: None, + } + } + + /// Deprecate this key + pub fn deprecate(&mut self, at: u64) { + if self.status == KeyStatus::Active { + self.status = KeyStatus::Deprecated; + self.deprecated_at = Some(at); + } + } + + /// Retire this key + pub fn retire(&mut self, at: u64) { + self.status = KeyStatus::Retired; + self.retired_at = Some(at); + } + + /// Check if key can be used for signing (must be Active) + pub fn can_sign(&self) -> bool { + self.status == KeyStatus::Active + } + + /// Check if key can be used for verification (Active or Deprecated) + pub fn can_verify(&self) -> bool { + matches!(self.status, KeyStatus::Active | KeyStatus::Deprecated) + } +} + +/// Configuration for key rotation +#[derive(Debug, Clone)] +pub struct KeyRotationConfig { + /// How long to keep deprecated keys valid for verification + pub grace_period: Duration, + /// How often to check for keys to retire + pub cleanup_interval: Duration, + /// Prefix for generated key IDs + pub key_id_prefix: String, +} + +impl Default for KeyRotationConfig { + fn default() -> Self { + Self { + grace_period: Duration::from_secs(86400 * 7), // 7 days + cleanup_interval: Duration::from_secs(3600), // 1 hour + key_id_prefix: "key".into(), + } + } +} + +/// Key rotation manager +/// +/// Manages the lifecycle of signing keys: +/// 1. Generate new keys +/// 2. Deprecate old keys (enter grace period) +/// 3. 
Retire deprecated keys after grace period +pub struct KeyRotationManager { + config: KeyRotationConfig, + /// All managed keys, keyed by kid + keys: RwLock>, + /// ID of the currently active key + active_key_id: RwLock>, +} + +impl KeyRotationManager { + /// Create a new key rotation manager + pub fn new(config: KeyRotationConfig) -> Self { + Self { + config, + keys: RwLock::new(HashMap::new()), + active_key_id: RwLock::new(None), + } + } + + /// Create with default config + pub fn with_defaults() -> Self { + Self::new(KeyRotationConfig::default()) + } + + /// Initialize with an existing key + pub async fn init_with_key(&self, key: SigningKey, created_at: u64) { + let kid = key.kid.clone(); + let managed = ManagedKey::new_active(key, created_at); + + let mut keys = self.keys.write().await; + keys.insert(kid.clone(), managed); + + let mut active_id = self.active_key_id.write().await; + *active_id = Some(kid); + } + + /// Generate and activate a new key, deprecating the current active key + pub async fn rotate(&self) -> Result { + let now = current_timestamp(); + let new_kid = self.generate_key_id(); + let new_key = SigningKey::generate(&new_kid); + let new_managed = ManagedKey::new_active(new_key, now); + + let mut keys = self.keys.write().await; + let mut active_id = self.active_key_id.write().await; + + // Deprecate the current active key + if let Some(old_kid) = active_id.as_ref() { + if let Some(old_key) = keys.get_mut(old_kid) { + old_key.deprecate(now); + } + } + + // Insert and activate new key + keys.insert(new_kid.clone(), new_managed); + *active_id = Some(new_kid.clone()); + + Ok(new_kid) + } + + /// Get the active signing key + pub async fn get_active_key(&self) -> Option { + let active_id = self.active_key_id.read().await; + let keys = self.keys.read().await; + + active_id + .as_ref() + .and_then(|kid| keys.get(kid)) + .filter(|k| k.can_sign()) + .map(|k| k.key.clone()) + } + + /// Get a key by ID for verification (if it can verify) + pub async fn 
get_key_for_verify(&self, kid: &str) -> Option { + let keys = self.keys.read().await; + keys.get(kid) + .filter(|k| k.can_verify()) + .map(|k| k.key.clone()) + } + + /// Get all keys that can be used for verification + pub async fn get_verification_keys(&self) -> Vec { + let keys = self.keys.read().await; + keys.values() + .filter(|k| k.can_verify()) + .map(|k| k.key.clone()) + .collect() + } + + /// Retire keys that have exceeded the grace period + pub async fn cleanup_expired(&self) -> usize { + let now = current_timestamp(); + let grace_secs = self.config.grace_period.as_secs(); + let mut retired = 0; + + let mut keys = self.keys.write().await; + for managed in keys.values_mut() { + if managed.status == KeyStatus::Deprecated { + if let Some(deprecated_at) = managed.deprecated_at { + // Use >= to allow immediate expiry when grace_period is 0 + if now >= deprecated_at + grace_secs { + managed.retire(now); + retired += 1; + } + } + } + } + + retired + } + + /// Remove retired keys from memory + pub async fn purge_retired(&self) -> usize { + let mut keys = self.keys.write().await; + let before = keys.len(); + keys.retain(|_, v| v.status != KeyStatus::Retired); + before - keys.len() + } + + /// Get key statistics + pub async fn stats(&self) -> KeyRotationStats { + let keys = self.keys.read().await; + let mut active = 0; + let mut deprecated = 0; + let mut retired = 0; + + for key in keys.values() { + match key.status { + KeyStatus::Active => active += 1, + KeyStatus::Deprecated => deprecated += 1, + KeyStatus::Retired => retired += 1, + } + } + + KeyRotationStats { + active, + deprecated, + retired, + total: keys.len(), + } + } + + /// Generate a unique key ID + fn generate_key_id(&self) -> String { + let timestamp = current_timestamp(); + let mut random = [0u8; 4]; + rand::RngCore::fill_bytes(&mut rand::thread_rng(), &mut random); + format!( + "{}-{}-{}", + self.config.key_id_prefix, + timestamp, + URL_SAFE_NO_PAD.encode(random) + ) + } +} + +/// Key rotation 
statistics +#[derive(Debug, Clone)] +pub struct KeyRotationStats { + pub active: usize, + pub deprecated: usize, + pub retired: usize, + pub total: usize, +} + +/// Get current Unix timestamp +fn current_timestamp() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() +} + +/// Configuration for internal token service +#[derive(Clone)] +pub struct InternalTokenConfig { + /// Signing keys (for rotation support) + pub signing_keys: Vec, + /// Default token TTL + pub default_ttl: Duration, + /// Maximum token TTL + pub max_ttl: Duration, + /// Token issuer identifier + pub issuer: String, +} + +impl InternalTokenConfig { + /// Create a new config with a single key + pub fn new(signing_key: SigningKey, issuer: impl Into) -> Self { + Self { + signing_keys: vec![signing_key], + default_ttl: Duration::from_secs(3600), // 1 hour + max_ttl: Duration::from_secs(86400 * 7), // 7 days + issuer: issuer.into(), + } + } + + /// Add a signing key (for rotation) + pub fn add_key(mut self, key: SigningKey) -> Self { + self.signing_keys.push(key); + self + } + + /// Set default TTL + pub fn with_default_ttl(mut self, ttl: Duration) -> Self { + self.default_ttl = ttl; + self + } + + /// Set maximum TTL + pub fn with_max_ttl(mut self, ttl: Duration) -> Self { + self.max_ttl = ttl; + self + } +} + +/// TSO client trait for timestamp generation +#[async_trait::async_trait] +pub trait TsoClient: Send + Sync { + /// Get current timestamp + async fn get_timestamp(&self) -> Result; +} + +/// Internal token service +pub struct InternalTokenService { + config: InternalTokenConfig, + tso_client: Option>, +} + +impl InternalTokenService { + /// Create a new internal token service + pub fn new(config: InternalTokenConfig) -> Self { + Self { + config, + tso_client: None, + } + } + + /// Create with TSO client for consistent timestamps + pub fn with_tso(mut self, client: Arc) -> Self { + self.tso_client = Some(client); + self + } + + /// Issue a new internal token + 
pub async fn issue( + &self, + principal: &Principal, + roles: Vec, + scope: Scope, + ttl: Option, + ) -> Result { + let ttl = ttl.unwrap_or(self.config.default_ttl); + if ttl > self.config.max_ttl { + return Err(Error::Iam(IamError::InvalidToken(format!( + "TTL exceeds maximum: {:?} > {:?}", + ttl, self.config.max_ttl + )))); + } + + let now = self.get_timestamp().await?; + let exp = now + ttl.as_secs(); + + let session_id = generate_session_id(); + + let claims = InternalTokenClaims::new( + &principal.id, + principal.kind.clone(), + &principal.name, + scope, + &session_id, + ) + .with_roles(roles) + .with_timestamps(now, exp) + .with_auth_method(AuthMethod::Internal); + + // Add optional fields + let claims = match &principal.org_id { + Some(org) => claims.with_org_id(org), + None => claims, + }; + + let claims = match &principal.project_id { + Some(proj) => claims.with_project_id(proj), + None => claims, + }; + + let claims = match &principal.node_id { + Some(node) => claims.with_node_id(node), + None => claims, + }; + + let token = self.encode_token(&claims)?; + + Ok(IssuedToken { + token, + claims, + expires_at: exp, + }) + } + + /// Verify an internal token + pub async fn verify(&self, token: &str) -> Result { + let claims = self.decode_token(token)?; + + // Check expiration + let now = self.get_timestamp().await?; + if claims.is_expired(now) { + return Err(Error::Iam(IamError::TokenExpired)); + } + + Ok(claims) + } + + /// Get current timestamp (from TSO or system time) + async fn get_timestamp(&self) -> Result { + match &self.tso_client { + Some(tso) => tso.get_timestamp().await, + None => Ok(SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs()), + } + } + + /// Encode claims into a signed token + fn encode_token(&self, claims: &InternalTokenClaims) -> Result { + let signing_key = self + .config + .signing_keys + .first() + .ok_or_else(|| Error::Internal("No signing key configured".into()))?; + + // Serialize claims + let claims_json = + 
serde_json::to_vec(claims).map_err(|e| Error::Serialization(e.to_string()))?; + + // Create header + let header = TokenHeader { + alg: "HS256".into(), + kid: signing_key.kid.clone(), + iss: self.config.issuer.clone(), + }; + let header_json = + serde_json::to_vec(&header).map_err(|e| Error::Serialization(e.to_string()))?; + + // Encode parts + let header_b64 = URL_SAFE_NO_PAD.encode(&header_json); + let claims_b64 = URL_SAFE_NO_PAD.encode(&claims_json); + + // Sign + let signing_input = format!("{}.{}", header_b64, claims_b64); + let signature = signing_key.sign(signing_input.as_bytes()); + let signature_b64 = URL_SAFE_NO_PAD.encode(&signature); + + Ok(format!("{}.{}.{}", header_b64, claims_b64, signature_b64)) + } + + /// Decode and verify a token + fn decode_token(&self, token: &str) -> Result { + let parts: Vec<&str> = token.split('.').collect(); + if parts.len() != 3 { + return Err(Error::Iam(IamError::InvalidToken( + "Invalid token format".into(), + ))); + } + + let header_json = URL_SAFE_NO_PAD + .decode(parts[0]) + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + let header: TokenHeader = serde_json::from_slice(&header_json) + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + // Find signing key + let signing_key = self + .config + .signing_keys + .iter() + .find(|k| k.kid == header.kid) + .ok_or_else(|| { + Error::Iam(IamError::InvalidToken(format!( + "Unknown key ID: {}", + header.kid + ))) + })?; + + // Verify signature + let signature = URL_SAFE_NO_PAD + .decode(parts[2]) + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + let signing_input = format!("{}.{}", parts[0], parts[1]); + if !signing_key.verify(signing_input.as_bytes(), &signature) { + return Err(Error::Iam(IamError::InvalidToken( + "Invalid signature".into(), + ))); + } + + // Decode claims + let claims_json = URL_SAFE_NO_PAD + .decode(parts[1]) + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + let claims: 
InternalTokenClaims = serde_json::from_slice(&claims_json) + .map_err(|e| Error::Iam(IamError::InvalidToken(e.to_string())))?; + + Ok(claims) + } +} + +/// Token header +#[derive(Debug, Serialize, Deserialize)] +struct TokenHeader { + alg: String, + kid: String, + iss: String, +} + +/// Result of issuing a token +#[derive(Debug, Clone)] +pub struct IssuedToken { + /// The token string + pub token: String, + /// The token claims + pub claims: InternalTokenClaims, + /// Expiration timestamp (Unix seconds) + pub expires_at: u64, +} + +/// Generate a random session ID +fn generate_session_id() -> String { + use rand::RngCore; + let mut bytes = [0u8; 16]; + rand::thread_rng().fill_bytes(&mut bytes); + URL_SAFE_NO_PAD.encode(bytes) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_config() -> InternalTokenConfig { + let key = SigningKey::generate("test-key-1"); + InternalTokenConfig::new(key, "iam-test") + } + + #[tokio::test] + async fn test_issue_and_verify() { + let service = InternalTokenService::new(test_config()); + + let principal = Principal::new_user("alice", "Alice Smith"); + let roles = vec!["roles/ProjectAdmin".into()]; + let scope = Scope::project("proj-1", "org-1"); + + let issued = service.issue(&principal, roles, scope, None).await.unwrap(); + + assert!(!issued.token.is_empty()); + assert_eq!(issued.claims.principal_id, "alice"); + + // Verify + let verified = service.verify(&issued.token).await.unwrap(); + assert_eq!(verified.principal_id, "alice"); + assert_eq!(verified.roles.len(), 1); + } + + #[tokio::test] + async fn test_invalid_signature() { + let service = InternalTokenService::new(test_config()); + + let principal = Principal::new_user("alice", "Alice"); + let issued = service + .issue(&principal, vec![], Scope::System, None) + .await + .unwrap(); + + // Tamper with token + let parts: Vec<&str> = issued.token.split('.').collect(); + let tampered = format!("{}.{}.invalid", parts[0], parts[1]); + + let result = 
service.verify(&tampered).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_ttl_limit() { + let config = test_config().with_max_ttl(Duration::from_secs(3600)); + let service = InternalTokenService::new(config); + + let principal = Principal::new_user("alice", "Alice"); + let result = service + .issue( + &principal, + vec![], + Scope::System, + Some(Duration::from_secs(86400)), // 24 hours - exceeds max + ) + .await; + + assert!(result.is_err()); + } + + #[test] + fn test_signing_key() { + let key = SigningKey::generate("test"); + let data = b"hello world"; + + let signature = key.sign(data); + assert!(key.verify(data, &signature)); + assert!(!key.verify(b"tampered", &signature)); + } + + // Key rotation tests + + #[tokio::test] + async fn test_key_rotation_init() { + let manager = KeyRotationManager::with_defaults(); + let key = SigningKey::generate("initial-key"); + let now = current_timestamp(); + + manager.init_with_key(key.clone(), now).await; + + let active = manager.get_active_key().await; + assert!(active.is_some()); + assert_eq!(active.unwrap().kid, "initial-key"); + + let stats = manager.stats().await; + assert_eq!(stats.active, 1); + assert_eq!(stats.deprecated, 0); + assert_eq!(stats.total, 1); + } + + #[tokio::test] + async fn test_key_rotation_rotate() { + let manager = KeyRotationManager::with_defaults(); + let key = SigningKey::generate("initial-key"); + let now = current_timestamp(); + + manager.init_with_key(key.clone(), now).await; + + // Rotate to new key + let new_kid = manager.rotate().await.unwrap(); + assert!(new_kid.starts_with("key-")); + + // Active key should be the new one + let active = manager.get_active_key().await; + assert!(active.is_some()); + assert_eq!(active.unwrap().kid, new_kid); + + // Old key should still be available for verification + let old_key = manager.get_key_for_verify("initial-key").await; + assert!(old_key.is_some()); + + let stats = manager.stats().await; + assert_eq!(stats.active, 1); + 
assert_eq!(stats.deprecated, 1); + assert_eq!(stats.total, 2); + } + + #[tokio::test] + async fn test_key_rotation_verify_with_old_key() { + let manager = KeyRotationManager::with_defaults(); + let key = SigningKey::generate("initial-key"); + let now = current_timestamp(); + + manager.init_with_key(key.clone(), now).await; + + // Sign some data with the active key + let data = b"test data"; + let signature = manager.get_active_key().await.unwrap().sign(data); + + // Rotate to new key + manager.rotate().await.unwrap(); + + // Old key should still be able to verify + let old_key = manager.get_key_for_verify("initial-key").await.unwrap(); + assert!(old_key.verify(data, &signature)); + } + + #[tokio::test] + async fn test_key_rotation_multiple_rotations() { + let manager = KeyRotationManager::with_defaults(); + let key = SigningKey::generate("key-0"); + let now = current_timestamp(); + + manager.init_with_key(key, now).await; + + // Rotate 3 times + for _ in 0..3 { + manager.rotate().await.unwrap(); + } + + let stats = manager.stats().await; + assert_eq!(stats.active, 1); + assert_eq!(stats.deprecated, 3); // key-0, key-1, key-2 are deprecated + assert_eq!(stats.total, 4); + + // All deprecated keys should still be verifiable + assert!(manager.get_key_for_verify("key-0").await.is_some()); + } + + #[tokio::test] + async fn test_key_status_transitions() { + let key = SigningKey::generate("test-key"); + let now = current_timestamp(); + let mut managed = ManagedKey::new_active(key, now); + + // Initially active + assert!(managed.can_sign()); + assert!(managed.can_verify()); + + // Deprecate + managed.deprecate(now); + assert!(!managed.can_sign()); + assert!(managed.can_verify()); + assert_eq!(managed.status, KeyStatus::Deprecated); + + // Retire + managed.retire(now); + assert!(!managed.can_sign()); + assert!(!managed.can_verify()); + assert_eq!(managed.status, KeyStatus::Retired); + } + + #[tokio::test] + async fn test_key_rotation_cleanup_expired() { + // Use very short 
grace period for testing + let config = KeyRotationConfig { + grace_period: Duration::from_secs(0), // immediate expiry + cleanup_interval: Duration::from_secs(60), + key_id_prefix: "test".into(), + }; + let manager = KeyRotationManager::new(config); + let key = SigningKey::generate("initial"); + let now = current_timestamp(); + + manager.init_with_key(key, now).await; + manager.rotate().await.unwrap(); + + // Wait a moment for grace period to elapse + tokio::time::sleep(Duration::from_millis(10)).await; + + let retired = manager.cleanup_expired().await; + assert_eq!(retired, 1); + + let stats = manager.stats().await; + assert_eq!(stats.deprecated, 0); + assert_eq!(stats.retired, 1); + } + + #[tokio::test] + async fn test_key_rotation_purge_retired() { + let config = KeyRotationConfig { + grace_period: Duration::from_secs(0), + cleanup_interval: Duration::from_secs(60), + key_id_prefix: "test".into(), + }; + let manager = KeyRotationManager::new(config); + let key = SigningKey::generate("initial"); + let now = current_timestamp(); + + manager.init_with_key(key, now).await; + manager.rotate().await.unwrap(); + + tokio::time::sleep(Duration::from_millis(10)).await; + manager.cleanup_expired().await; + + let purged = manager.purge_retired().await; + assert_eq!(purged, 1); + + let stats = manager.stats().await; + assert_eq!(stats.total, 1); + assert_eq!(stats.active, 1); + } + + #[tokio::test] + async fn test_get_verification_keys() { + let manager = KeyRotationManager::with_defaults(); + let key = SigningKey::generate("initial"); + let now = current_timestamp(); + + manager.init_with_key(key, now).await; + manager.rotate().await.unwrap(); + manager.rotate().await.unwrap(); + + let verification_keys = manager.get_verification_keys().await; + assert_eq!(verification_keys.len(), 3); + } +} diff --git a/iam/crates/iam-authz/Cargo.toml b/iam/crates/iam-authz/Cargo.toml new file mode 100644 index 0000000..f6cdbe9 --- /dev/null +++ b/iam/crates/iam-authz/Cargo.toml @@ -0,0 
+1,22 @@ +[package] +name = "iam-authz" +version.workspace = true +edition.workspace = true +description = "Authorization layer (PDP) for IAM" +license.workspace = true + +[dependencies] +iam-types = { path = "../iam-types" } +iam-store = { path = "../iam-store" } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +async-trait = { workspace = true } +dashmap = { workspace = true } +ipnetwork = { workspace = true } +glob-match = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["full", "test-util"] } diff --git a/iam/crates/iam-authz/src/cache.rs b/iam/crates/iam-authz/src/cache.rs new file mode 100644 index 0000000..9fcba2e --- /dev/null +++ b/iam/crates/iam-authz/src/cache.rs @@ -0,0 +1,490 @@ +//! Policy cache for authorization +//! +//! Caches policy bindings and roles to reduce storage lookups. +//! Supports both TTL-based expiration and event-driven invalidation. 
+ +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use dashmap::DashMap; +use tokio::sync::broadcast; + +use iam_types::{PolicyBinding, PrincipalRef, Role, Scope}; + +/// Cache invalidation event +#[derive(Debug, Clone)] +pub enum CacheInvalidation { + /// Invalidate bindings for a specific principal + Principal(String), + /// Invalidate a specific role + Role(String), + /// Invalidate all bindings within a scope + Scope(Scope), + /// Invalidate all cached data + All, +} + +/// Source of cache invalidation events (e.g., from Chainfire watch) +pub trait CacheInvalidationSource: Send + Sync { + /// Subscribe to invalidation events + fn subscribe(&self) -> broadcast::Receiver; +} + +/// Configuration for the policy cache +#[derive(Debug, Clone)] +pub struct PolicyCacheConfig { + /// TTL for cached bindings + pub binding_ttl: Duration, + /// TTL for cached roles + pub role_ttl: Duration, + /// Maximum number of binding entries + pub max_binding_entries: usize, + /// Maximum number of role entries + pub max_role_entries: usize, +} + +impl Default for PolicyCacheConfig { + fn default() -> Self { + Self { + binding_ttl: Duration::from_secs(300), // 5 minutes + role_ttl: Duration::from_secs(600), // 10 minutes + max_binding_entries: 10000, + max_role_entries: 1000, + } + } +} + +/// Cached bindings for a principal +struct CachedBindings { + bindings: Vec, + fetched_at: Instant, + /// Scopes covered by these bindings (for scope-based invalidation) + scopes: Vec, +} + +/// Cached role +struct CachedRole { + role: Role, + fetched_at: Instant, +} + +/// Policy cache +pub struct PolicyCache { + config: PolicyCacheConfig, + /// Bindings cache keyed by principal reference + bindings: DashMap, + /// Roles cache keyed by role name + roles: DashMap, +} + +impl PolicyCache { + /// Create a new policy cache + pub fn new(config: PolicyCacheConfig) -> Self { + Self { + config, + bindings: DashMap::new(), + roles: DashMap::new(), + } + } + + /// Create with default config + 
pub fn default_config() -> Self { + Self::new(PolicyCacheConfig::default()) + } + + /// Get cached bindings for a principal + pub fn get_bindings(&self, principal: &PrincipalRef) -> Option> { + let key = principal.to_string(); + self.bindings.get(&key).and_then(|entry| { + if entry.fetched_at.elapsed() < self.config.binding_ttl { + Some(entry.bindings.clone()) + } else { + None + } + }) + } + + /// Cache bindings for a principal + pub fn put_bindings(&self, principal: &PrincipalRef, bindings: Vec) { + // Evict if at capacity + if self.bindings.len() >= self.config.max_binding_entries { + self.evict_expired_bindings(); + } + + // Extract scopes from bindings for scope-based invalidation + let scopes: Vec = bindings.iter().map(|b| b.scope.clone()).collect(); + + let key = principal.to_string(); + self.bindings.insert( + key, + CachedBindings { + bindings, + fetched_at: Instant::now(), + scopes, + }, + ); + } + + /// Invalidate bindings for a principal + pub fn invalidate_bindings(&self, principal: &PrincipalRef) { + let key = principal.to_string(); + self.bindings.remove(&key); + } + + /// Invalidate all bindings + pub fn invalidate_all_bindings(&self) { + self.bindings.clear(); + } + + /// Invalidate bindings that cover a specific scope + pub fn invalidate_bindings_by_scope(&self, scope: &Scope) { + self.bindings.retain(|_, v| { + // Remove if any binding's scope matches or is a child of the given scope + !v.scopes.iter().any(|s| scope.contains(s)) + }); + } + + /// Handle an invalidation event + pub fn handle_invalidation(&self, event: &CacheInvalidation) { + match event { + CacheInvalidation::Principal(id) => { + // Try to parse as different principal types + // This is a simplified approach - in production you might want + // to include the kind in the event + self.bindings.remove(id); + } + CacheInvalidation::Role(name) => { + self.invalidate_role(name); + } + CacheInvalidation::Scope(scope) => { + self.invalidate_bindings_by_scope(scope); + } + 
CacheInvalidation::All => { + self.invalidate_all(); + } + } + } + + /// Get a cached role + pub fn get_role(&self, name: &str) -> Option { + self.roles.get(name).and_then(|entry| { + if entry.fetched_at.elapsed() < self.config.role_ttl { + Some(entry.role.clone()) + } else { + None + } + }) + } + + /// Cache a role + pub fn put_role(&self, role: Role) { + // Evict if at capacity + if self.roles.len() >= self.config.max_role_entries { + self.evict_expired_roles(); + } + + let name = role.name.clone(); + self.roles.insert( + name, + CachedRole { + role, + fetched_at: Instant::now(), + }, + ); + } + + /// Invalidate a cached role + pub fn invalidate_role(&self, name: &str) { + self.roles.remove(name); + } + + /// Invalidate all roles + pub fn invalidate_all_roles(&self) { + self.roles.clear(); + } + + /// Invalidate all cached data + pub fn invalidate_all(&self) { + self.bindings.clear(); + self.roles.clear(); + } + + /// Get cache statistics + pub fn stats(&self) -> CacheStats { + CacheStats { + binding_entries: self.bindings.len(), + role_entries: self.roles.len(), + } + } + + /// Evict expired binding entries + fn evict_expired_bindings(&self) { + let ttl = self.config.binding_ttl; + self.bindings.retain(|_, v| v.fetched_at.elapsed() < ttl); + } + + /// Evict expired role entries + fn evict_expired_roles(&self) { + let ttl = self.config.role_ttl; + self.roles.retain(|_, v| v.fetched_at.elapsed() < ttl); + } +} + +/// Start listening for invalidation events from an external source +/// +/// This function spawns a background task that listens for invalidation +/// events and applies them to the cache. 
+pub fn start_invalidation_listener( + cache: Arc, + source: Arc, +) -> tokio::task::JoinHandle<()> { + let mut rx = source.subscribe(); + tokio::spawn(async move { + loop { + match rx.recv().await { + Ok(event) => { + tracing::debug!("Received cache invalidation event: {:?}", event); + cache.handle_invalidation(&event); + } + Err(broadcast::error::RecvError::Closed) => { + tracing::info!("Cache invalidation source closed"); + break; + } + Err(broadcast::error::RecvError::Lagged(n)) => { + tracing::warn!("Lagged {} invalidation events, invalidating all", n); + cache.invalidate_all(); + } + } + } + }) +} + +/// A simple in-memory invalidation source for testing or local use +pub struct LocalInvalidationSource { + tx: broadcast::Sender, +} + +impl LocalInvalidationSource { + /// Create a new local invalidation source + pub fn new() -> Self { + let (tx, _) = broadcast::channel(256); + Self { tx } + } + + /// Send an invalidation event + pub fn invalidate(&self, event: CacheInvalidation) { + let _ = self.tx.send(event); + } + + /// Invalidate a specific principal + pub fn invalidate_principal(&self, principal_id: &str) { + self.invalidate(CacheInvalidation::Principal(principal_id.to_string())); + } + + /// Invalidate a specific role + pub fn invalidate_role(&self, role_name: &str) { + self.invalidate(CacheInvalidation::Role(role_name.to_string())); + } + + /// Invalidate a scope + pub fn invalidate_scope(&self, scope: Scope) { + self.invalidate(CacheInvalidation::Scope(scope)); + } + + /// Invalidate all + pub fn invalidate_all(&self) { + self.invalidate(CacheInvalidation::All); + } +} + +impl Default for LocalInvalidationSource { + fn default() -> Self { + Self::new() + } +} + +impl CacheInvalidationSource for LocalInvalidationSource { + fn subscribe(&self) -> broadcast::Receiver { + self.tx.subscribe() + } +} + +/// Cache statistics +#[derive(Debug, Clone)] +pub struct CacheStats { + pub binding_entries: usize, + pub role_entries: usize, +} + +#[cfg(test)] +mod tests 
{ + use super::*; + use iam_types::{Permission, Scope}; + + #[test] + fn test_binding_cache() { + let cache = PolicyCache::default_config(); + let principal = PrincipalRef::user("alice"); + + // Initially empty + assert!(cache.get_bindings(&principal).is_none()); + + // Add bindings + let bindings = vec![PolicyBinding::new( + "b1", + principal.clone(), + "roles/Admin", + Scope::System, + )]; + cache.put_bindings(&principal, bindings.clone()); + + // Should be cached + let cached = cache.get_bindings(&principal).unwrap(); + assert_eq!(cached.len(), 1); + + // Invalidate + cache.invalidate_bindings(&principal); + assert!(cache.get_bindings(&principal).is_none()); + } + + #[test] + fn test_role_cache() { + let cache = PolicyCache::default_config(); + + let role = Role::new("TestRole", Scope::System, vec![Permission::wildcard()]); + + // Initially empty + assert!(cache.get_role("TestRole").is_none()); + + // Add role + cache.put_role(role.clone()); + + // Should be cached + let cached = cache.get_role("TestRole").unwrap(); + assert_eq!(cached.name, "TestRole"); + + // Invalidate + cache.invalidate_role("TestRole"); + assert!(cache.get_role("TestRole").is_none()); + } + + #[test] + fn test_cache_stats() { + let cache = PolicyCache::default_config(); + + cache.put_bindings(&PrincipalRef::user("alice"), vec![]); + cache.put_role(Role::new("Role1", Scope::System, vec![])); + + let stats = cache.stats(); + assert_eq!(stats.binding_entries, 1); + assert_eq!(stats.role_entries, 1); + } + + #[test] + fn test_scope_invalidation() { + let cache = PolicyCache::default_config(); + + // Add bindings for different scopes + let alice = PrincipalRef::user("alice"); + let bob = PrincipalRef::user("bob"); + + let alice_bindings = vec![PolicyBinding::new( + "b1", + alice.clone(), + "roles/Admin", + Scope::org("org-1"), + )]; + let bob_bindings = vec![PolicyBinding::new( + "b2", + bob.clone(), + "roles/Viewer", + Scope::project("proj-1", "org-2"), + )]; + + cache.put_bindings(&alice, 
alice_bindings); + cache.put_bindings(&bob, bob_bindings); + + assert_eq!(cache.stats().binding_entries, 2); + + // Invalidate org-1 scope should remove alice's bindings + cache.invalidate_bindings_by_scope(&Scope::org("org-1")); + + // Alice's bindings should be gone + assert!(cache.get_bindings(&alice).is_none()); + // Bob's bindings should still be there + assert!(cache.get_bindings(&bob).is_some()); + } + + #[test] + fn test_handle_invalidation_event() { + let cache = PolicyCache::default_config(); + + let alice = PrincipalRef::user("alice"); + let bindings = vec![PolicyBinding::new( + "b1", + alice.clone(), + "roles/Admin", + Scope::System, + )]; + cache.put_bindings(&alice, bindings); + cache.put_role(Role::new("TestRole", Scope::System, vec![])); + + assert_eq!(cache.stats().binding_entries, 1); + assert_eq!(cache.stats().role_entries, 1); + + // Handle principal invalidation + cache.handle_invalidation(&CacheInvalidation::Principal(alice.to_string())); + assert!(cache.get_bindings(&alice).is_none()); + assert!(cache.get_role("TestRole").is_some()); + + // Handle role invalidation + cache.handle_invalidation(&CacheInvalidation::Role("TestRole".to_string())); + assert!(cache.get_role("TestRole").is_none()); + } + + #[test] + fn test_local_invalidation_source() { + let source = LocalInvalidationSource::new(); + let mut rx = source.subscribe(); + + source.invalidate_principal("alice"); + + // Should receive the event (non-blocking check) + let event = rx.try_recv(); + assert!(event.is_ok()); + match event.unwrap() { + CacheInvalidation::Principal(id) => assert_eq!(id, "alice"), + _ => panic!("Expected Principal invalidation"), + } + } + + #[tokio::test] + async fn test_invalidation_listener() { + let cache = Arc::new(PolicyCache::default_config()); + let source = Arc::new(LocalInvalidationSource::new()); + + // Add some data to cache + let alice = PrincipalRef::user("alice"); + cache.put_bindings(&alice, vec![]); + cache.put_role(Role::new("TestRole", 
Scope::System, vec![])); + + assert_eq!(cache.stats().binding_entries, 1); + assert_eq!(cache.stats().role_entries, 1); + + // Start listener + let _handle = start_invalidation_listener(cache.clone(), source.clone()); + + // Give the listener time to start + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + + // Send invalidation + source.invalidate_all(); + + // Give time for event to be processed + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // Cache should be empty + assert_eq!(cache.stats().binding_entries, 0); + assert_eq!(cache.stats().role_entries, 0); + } +} diff --git a/iam/crates/iam-authz/src/condition.rs b/iam/crates/iam-authz/src/condition.rs new file mode 100644 index 0000000..d2ff388 --- /dev/null +++ b/iam/crates/iam-authz/src/condition.rs @@ -0,0 +1,349 @@ +//! Condition evaluation for ABAC +//! +//! Evaluates condition expressions against the current authorization context. + +use std::net::IpAddr; +use std::str::FromStr; + +use ipnetwork::IpNetwork; + +use iam_types::{Condition, ConditionExpr, Error, IamError, Result}; + +use crate::context::VariableContext; + +/// Evaluate a condition against the given context +pub fn evaluate_condition(condition: &Condition, ctx: &VariableContext<'_>) -> Result { + evaluate_expr(&condition.expression, ctx) +} + +/// Evaluate a condition expression +fn evaluate_expr(expr: &ConditionExpr, ctx: &VariableContext<'_>) -> Result { + match expr { + ConditionExpr::StringEquals { key, value } => { + let actual = resolve_key(key, ctx); + let expected = ctx.substitute(value); + Ok(actual.as_deref() == Some(&expected)) + } + + ConditionExpr::StringNotEquals { key, value } => { + let actual = resolve_key(key, ctx); + let expected = ctx.substitute(value); + Ok(actual.as_deref() != Some(&expected)) + } + + ConditionExpr::StringLike { key, pattern } => { + let actual = require_key(key, ctx)?; + let pattern = ctx.substitute(pattern); + Ok(glob_match::glob_match(&pattern, &actual)) + } + + 
ConditionExpr::StringNotLike { key, pattern } => { + let actual = require_key(key, ctx)?; + let pattern = ctx.substitute(pattern); + Ok(!glob_match::glob_match(&pattern, &actual)) + } + + ConditionExpr::NumericEquals { key, value } => { + let actual = require_key(key, ctx)?; + let actual_num: i64 = actual.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as number for key '{}'", + actual, key + ))) + })?; + Ok(actual_num == *value) + } + + ConditionExpr::NumericLessThan { key, value } => { + let actual = require_key(key, ctx)?; + let actual_num: i64 = actual.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as number", + actual + ))) + })?; + Ok(actual_num < *value) + } + + ConditionExpr::NumericLessThanEquals { key, value } => { + let actual = require_key(key, ctx)?; + let actual_num: i64 = actual.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as number", + actual + ))) + })?; + Ok(actual_num <= *value) + } + + ConditionExpr::NumericGreaterThan { key, value } => { + let actual = require_key(key, ctx)?; + let actual_num: i64 = actual.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as number", + actual + ))) + })?; + Ok(actual_num > *value) + } + + ConditionExpr::NumericGreaterThanEquals { key, value } => { + let actual = require_key(key, ctx)?; + let actual_num: i64 = actual.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as number", + actual + ))) + })?; + Ok(actual_num >= *value) + } + + ConditionExpr::IpAddress { key, cidr } => { + let actual = require_key(key, ctx)?; + let ip: IpAddr = actual.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as IP address", + actual + ))) + })?; + let network: IpNetwork = cidr.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as 
CIDR", + cidr + ))) + })?; + Ok(network.contains(ip)) + } + + ConditionExpr::NotIpAddress { key, cidr } => { + let actual = require_key(key, ctx)?; + let ip: IpAddr = actual.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as IP address", + actual + ))) + })?; + let network: IpNetwork = cidr.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Cannot parse '{}' as CIDR", + cidr + ))) + })?; + Ok(!network.contains(ip)) + } + + ConditionExpr::TimeBetween { start, end } => { + let now = ctx.context.timestamp; + + // Parse start/end as either Unix timestamp or HH:MM format + let start_ts = parse_time(start, now)?; + let end_ts = parse_time(end, now)?; + + Ok(now >= start_ts && now <= end_ts) + } + + ConditionExpr::Exists { key } => Ok(resolve_key(key, ctx).is_some()), + + ConditionExpr::StringEqualsAny { key, values } => { + let actual = resolve_key(key, ctx); + match actual { + Some(actual_val) => { + let substituted: Vec = + values.iter().map(|v| ctx.substitute(v)).collect(); + Ok(substituted.contains(&actual_val)) + } + None => Ok(false), + } + } + + ConditionExpr::Bool { key, value } => { + let actual = require_key(key, ctx)?; + let actual_bool = matches!(actual.to_lowercase().as_str(), "true" | "1" | "yes"); + Ok(actual_bool == *value) + } + + ConditionExpr::And(exprs) => { + for e in exprs { + if !evaluate_expr(e, ctx)? { + return Ok(false); + } + } + Ok(true) + } + + ConditionExpr::Or(exprs) => { + for e in exprs { + if evaluate_expr(e, ctx)? 
{ + return Ok(true); + } + } + Ok(false) + } + + ConditionExpr::Not(inner) => Ok(!evaluate_expr(inner, ctx)?), + } +} + +/// Resolve a key to its value, returning Option +fn resolve_key(key: &str, ctx: &VariableContext<'_>) -> Option { + ctx.resolve(key) +} + +/// Resolve a key, returning an error if not found +fn require_key(key: &str, ctx: &VariableContext<'_>) -> Result { + ctx.resolve(key).ok_or_else(|| { + Error::Iam(IamError::InvalidCondition(format!( + "Key not found: {}", + key + ))) + }) +} + +/// Parse time string to Unix timestamp +fn parse_time(time_str: &str, reference_time: u64) -> Result { + // Try parsing as Unix timestamp + if let Ok(ts) = time_str.parse::() { + return Ok(ts); + } + + // Try parsing as HH:MM (relative to current day) + if let Some((hours, minutes)) = time_str.split_once(':') { + let hours: u64 = hours.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Invalid time format: {}", + time_str + ))) + })?; + let minutes: u64 = minutes.parse().map_err(|_| { + Error::Iam(IamError::InvalidCondition(format!( + "Invalid time format: {}", + time_str + ))) + })?; + + // Calculate seconds since midnight + let time_of_day = hours * 3600 + minutes * 60; + + // Get current day start + let day_start = (reference_time / 86400) * 86400; + + return Ok(day_start + time_of_day); + } + + // Try parsing as ISO 8601 + // For simplicity, just return an error for unsupported formats + Err(Error::Iam(IamError::InvalidCondition(format!( + "Unsupported time format: {}", + time_str + )))) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::context::AuthzContext; + use iam_types::{Principal, Resource}; + use std::net::{IpAddr, Ipv4Addr}; + + fn test_context<'a>( + principal: &'a Principal, + resource: &'a Resource, + context: &'a AuthzContext, + ) -> VariableContext<'a> { + VariableContext::new(principal, resource, context) + } + + #[test] + fn test_string_equals() { + let principal = Principal::new_user("alice", "Alice"); + let 
resource = Resource::new("instance", "vm-1", "org-1", "proj-1").with_owner("alice"); + let ctx = AuthzContext::new(); + let var_ctx = test_context(&principal, &resource, &ctx); + + // Direct comparison + let cond = Condition::string_equals("resource.owner", "alice"); + assert!(evaluate_condition(&cond, &var_ctx).unwrap()); + + // Variable substitution + let cond = Condition::string_equals("resource.owner", "${principal.id}"); + assert!(evaluate_condition(&cond, &var_ctx).unwrap()); + + // Non-matching + let cond = Condition::string_equals("resource.owner", "bob"); + assert!(!evaluate_condition(&cond, &var_ctx).unwrap()); + } + + #[test] + fn test_string_like() { + let principal = Principal::new_user("alice", "Alice"); + let resource = Resource::new("instance", "vm-prod-001", "org-1", "proj-1"); + let ctx = AuthzContext::new(); + let var_ctx = test_context(&principal, &resource, &ctx); + + let cond = Condition::string_like("resource.id", "vm-prod-*"); + assert!(evaluate_condition(&cond, &var_ctx).unwrap()); + + let cond = Condition::string_like("resource.id", "vm-dev-*"); + assert!(!evaluate_condition(&cond, &var_ctx).unwrap()); + } + + #[test] + fn test_ip_address() { + let principal = Principal::new_user("alice", "Alice"); + let resource = Resource::new("instance", "vm-1", "org-1", "proj-1"); + let ctx = AuthzContext::new().with_source_ip(IpAddr::V4(Ipv4Addr::new(10, 0, 1, 50))); + let var_ctx = test_context(&principal, &resource, &ctx); + + let cond = Condition::ip_address("request.source_ip", "10.0.0.0/8"); + assert!(evaluate_condition(&cond, &var_ctx).unwrap()); + + let cond = Condition::ip_address("request.source_ip", "192.168.0.0/16"); + assert!(!evaluate_condition(&cond, &var_ctx).unwrap()); + } + + #[test] + fn test_and_condition() { + let principal = Principal::new_user("alice", "Alice"); + let resource = Resource::new("instance", "vm-1", "org-1", "proj-1").with_owner("alice"); + let ctx = AuthzContext::new().with_source_ip(IpAddr::V4(Ipv4Addr::new(10, 0, 
0, 1))); + let var_ctx = test_context(&principal, &resource, &ctx); + + let cond = Condition::and(vec![ + Condition::string_equals("resource.owner", "${principal.id}"), + Condition::ip_address("request.source_ip", "10.0.0.0/8"), + ]); + assert!(evaluate_condition(&cond, &var_ctx).unwrap()); + + // One condition fails + let cond = Condition::and(vec![ + Condition::string_equals("resource.owner", "bob"), + Condition::ip_address("request.source_ip", "10.0.0.0/8"), + ]); + assert!(!evaluate_condition(&cond, &var_ctx).unwrap()); + } + + #[test] + fn test_or_condition() { + let principal = Principal::new_user("alice", "Alice"); + let resource = Resource::new("instance", "vm-1", "org-1", "proj-1").with_owner("bob"); + let ctx = AuthzContext::new(); + let var_ctx = test_context(&principal, &resource, &ctx); + + let cond = Condition::or(vec![ + Condition::string_equals("resource.owner", "${principal.id}"), // false + Condition::string_equals("principal.id", "alice"), // true + ]); + assert!(evaluate_condition(&cond, &var_ctx).unwrap()); + } + + #[test] + fn test_not_condition() { + let principal = Principal::new_user("alice", "Alice"); + let resource = Resource::new("instance", "vm-1", "org-1", "proj-1"); + let ctx = AuthzContext::new(); + let var_ctx = test_context(&principal, &resource, &ctx); + + let cond = Condition::not(Condition::string_equals("principal.id", "bob")); + assert!(evaluate_condition(&cond, &var_ctx).unwrap()); + } +} diff --git a/iam/crates/iam-authz/src/context.rs b/iam/crates/iam-authz/src/context.rs new file mode 100644 index 0000000..69a6d5e --- /dev/null +++ b/iam/crates/iam-authz/src/context.rs @@ -0,0 +1,222 @@ +//! Authorization context +//! +//! Provides context for policy evaluation including principal, resource, and request metadata. 
+ +use std::collections::HashMap; +use std::net::IpAddr; + +use iam_types::{Principal, PrincipalRef, Resource, Scope}; + +/// Context for authorization evaluation +#[derive(Debug, Clone)] +pub struct AuthzContext { + /// Source IP address of the request + pub source_ip: Option, + + /// Request timestamp (Unix seconds) + pub timestamp: u64, + + /// Request metadata + pub metadata: HashMap, + + /// HTTP method (if applicable) + pub http_method: Option, + + /// Request path (if applicable) + pub request_path: Option, +} + +impl AuthzContext { + /// Create a new context with the current timestamp + pub fn new() -> Self { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + + Self { + source_ip: None, + timestamp, + metadata: HashMap::new(), + http_method: None, + request_path: None, + } + } + + /// Set the source IP + pub fn with_source_ip(mut self, ip: IpAddr) -> Self { + self.source_ip = Some(ip); + self + } + + /// Set the timestamp + pub fn with_timestamp(mut self, timestamp: u64) -> Self { + self.timestamp = timestamp; + self + } + + /// Add metadata + pub fn with_metadata(mut self, key: impl Into, value: impl Into) -> Self { + self.metadata.insert(key.into(), value.into()); + self + } + + /// Set HTTP method + pub fn with_http_method(mut self, method: impl Into) -> Self { + self.http_method = Some(method.into()); + self + } + + /// Set request path + pub fn with_request_path(mut self, path: impl Into) -> Self { + self.request_path = Some(path.into()); + self + } + + /// Get a value for condition evaluation + pub fn get_value(&self, key: &str) -> Option { + match key { + "request.source_ip" => self.source_ip.map(|ip| ip.to_string()), + "request.time" => Some(self.timestamp.to_string()), + "request.method" => self.http_method.clone(), + "request.path" => self.request_path.clone(), + key if key.starts_with("request.metadata.") => { + let meta_key = &key["request.metadata.".len()..]; + 
self.metadata.get(meta_key).cloned() + } + _ => None, + } + } +} + +impl Default for AuthzContext { + fn default() -> Self { + Self::new() + } +} + +/// Variable context for condition evaluation +/// Combines principal, resource, and request context +pub struct VariableContext<'a> { + pub principal: &'a Principal, + pub resource: &'a Resource, + pub context: &'a AuthzContext, +} + +impl<'a> VariableContext<'a> { + /// Create a new variable context + pub fn new( + principal: &'a Principal, + resource: &'a Resource, + context: &'a AuthzContext, + ) -> Self { + Self { + principal, + resource, + context, + } + } + + /// Resolve a variable key to its value + pub fn resolve(&self, key: &str) -> Option { + if key.starts_with("principal.") { + let prop = &key["principal.".len()..]; + self.resolve_principal(prop) + } else if key.starts_with("resource.") { + let prop = &key["resource.".len()..]; + self.resource.get_property(prop) + } else if key.starts_with("request.") { + self.context.get_value(key) + } else { + None + } + } + + fn resolve_principal(&self, prop: &str) -> Option { + match prop { + "id" => Some(self.principal.id.clone()), + "kind" => Some(self.principal.kind.to_string()), + "name" => Some(self.principal.name.clone()), + "org_id" => self.principal.org_id.clone(), + "project_id" => self.principal.project_id.clone(), + "node_id" => self.principal.node_id.clone(), + "email" => self.principal.email.clone(), + key if key.starts_with("metadata.") => { + let meta_key = &key["metadata.".len()..]; + self.principal.metadata.get(meta_key).cloned() + } + _ => None, + } + } + + /// Substitute variables in a string (${var} syntax) + pub fn substitute(&self, template: &str) -> String { + let mut result = template.to_string(); + let mut start = 0; + + while let Some(var_start) = result[start..].find("${") { + let absolute_start = start + var_start; + if let Some(var_end) = result[absolute_start..].find('}') { + let absolute_end = absolute_start + var_end; + let var_name = 
&result[absolute_start + 2..absolute_end]; + + if let Some(value) = self.resolve(var_name) { + result = format!( + "{}{}{}", + &result[..absolute_start], + value, + &result[absolute_end + 1..] + ); + start = absolute_start + value.len(); + } else { + // Variable not found, keep as is + start = absolute_end + 1; + } + } else { + break; + } + } + + result + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::net::Ipv4Addr; + + #[test] + fn test_context_values() { + let ctx = AuthzContext::new() + .with_source_ip(IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1))) + .with_metadata("tenant", "acme"); + + assert_eq!(ctx.get_value("request.source_ip"), Some("10.0.0.1".into())); + assert_eq!( + ctx.get_value("request.metadata.tenant"), + Some("acme".into()) + ); + } + + #[test] + fn test_variable_substitution() { + let principal = Principal::new_user("alice", "Alice Smith"); + let resource = Resource::new("instance", "vm-123", "org-1", "proj-1").with_owner("alice"); + let context = AuthzContext::new(); + + let var_ctx = VariableContext::new(&principal, &resource, &context); + + // Test simple substitution + assert_eq!(var_ctx.substitute("user-${principal.id}"), "user-alice"); + + // Test multiple substitutions + assert_eq!( + var_ctx.substitute("${resource.kind}/${resource.id}"), + "instance/vm-123" + ); + + // Test unknown variable (kept as is) + assert_eq!(var_ctx.substitute("${unknown.var}"), "${unknown.var}"); + } +} diff --git a/iam/crates/iam-authz/src/evaluator.rs b/iam/crates/iam-authz/src/evaluator.rs new file mode 100644 index 0000000..7866dcb --- /dev/null +++ b/iam/crates/iam-authz/src/evaluator.rs @@ -0,0 +1,550 @@ +//! Policy evaluation engine +//! +//! The core authorization engine that evaluates whether a principal +//! is allowed to perform an action on a resource. 
+ +use std::sync::Arc; + +use iam_store::{BindingStore, GroupStore, RoleStore}; +use iam_types::{ + Error, IamError, Permission, PolicyBinding, Principal, PrincipalKind, PrincipalRef, Resource, + Result, Role, Scope, +}; + +use crate::cache::PolicyCache; +use crate::condition::evaluate_condition; +use crate::context::{AuthzContext, VariableContext}; + +/// Authorization request +#[derive(Debug, Clone)] +pub struct AuthzRequest { + /// Principal making the request + pub principal: Principal, + /// Action being performed (e.g., "compute:instances:create") + pub action: String, + /// Resource being accessed + pub resource: Resource, + /// Request context + pub context: AuthzContext, +} + +impl AuthzRequest { + /// Create a new authorization request + pub fn new(principal: Principal, action: impl Into, resource: Resource) -> Self { + Self { + principal, + action: action.into(), + resource, + context: AuthzContext::new(), + } + } + + /// Set the context + pub fn with_context(mut self, context: AuthzContext) -> Self { + self.context = context; + self + } +} + +/// Authorization decision +#[derive(Debug, Clone)] +pub enum AuthzDecision { + /// Access is allowed + Allow, + /// Access is denied with reason + Deny { reason: String }, +} + +impl AuthzDecision { + /// Check if the decision is Allow + pub fn is_allowed(&self) -> bool { + matches!(self, AuthzDecision::Allow) + } + + /// Check if the decision is Deny + pub fn is_denied(&self) -> bool { + matches!(self, AuthzDecision::Deny { .. 
}) + } +} + +/// Policy evaluator configuration +#[derive(Debug, Clone)] +pub struct PolicyEvaluatorConfig { + /// Whether to use caching + pub use_cache: bool, + /// Maximum number of bindings to evaluate + pub max_bindings: usize, + /// Enable debug logging + pub debug: bool, +} + +impl Default for PolicyEvaluatorConfig { + fn default() -> Self { + Self { + use_cache: true, + max_bindings: 1000, + debug: false, + } + } +} + +/// Policy evaluator (PDP - Policy Decision Point) +pub struct PolicyEvaluator { + binding_store: Arc, + role_store: Arc, + group_store: Option>, + cache: Arc, + config: PolicyEvaluatorConfig, +} + +impl PolicyEvaluator { + /// Create a new policy evaluator + pub fn new( + binding_store: Arc, + role_store: Arc, + cache: Arc, + ) -> Self { + Self { + binding_store, + role_store, + group_store: None, + cache, + config: PolicyEvaluatorConfig::default(), + } + } + + /// Create with group store for group expansion + pub fn with_group_store( + binding_store: Arc, + role_store: Arc, + group_store: Arc, + cache: Arc, + ) -> Self { + Self { + binding_store, + role_store, + group_store: Some(group_store), + cache, + config: PolicyEvaluatorConfig::default(), + } + } + + /// Set configuration + pub fn with_config(mut self, config: PolicyEvaluatorConfig) -> Self { + self.config = config; + self + } + + /// Evaluate an authorization request + pub async fn evaluate(&self, req: &AuthzRequest) -> Result { + // Default deny + let mut decision = AuthzDecision::Deny { + reason: "No matching policy".into(), + }; + + // Get resource scope + let resource_scope = Scope::project(&req.resource.project_id, &req.resource.org_id); + + // Get effective bindings for the principal + let bindings = self + .get_effective_bindings(&req.principal.to_ref(), &resource_scope) + .await?; + + if bindings.is_empty() { + return Ok(AuthzDecision::Deny { + reason: format!("No bindings found for principal {}", req.principal.to_ref()), + }); + } + + // Evaluate each binding + for binding 
in bindings.iter().take(self.config.max_bindings) { + // Skip disabled or expired bindings + let now = req.context.timestamp; + if !binding.is_active(now) { + continue; + } + + // Get the role + let role = match self.get_role(&binding.role_ref).await? { + Some(r) => r, + None => { + tracing::warn!("Role not found: {}", binding.role_ref); + continue; + } + }; + + // Evaluate binding condition (if any) + if let Some(cond) = &binding.condition { + let var_ctx = VariableContext::new(&req.principal, &req.resource, &req.context); + if !evaluate_condition(cond, &var_ctx)? { + continue; + } + } + + // Evaluate role permissions + if self.evaluate_role(&role, req)? { + decision = AuthzDecision::Allow; + break; + } + } + + Ok(decision) + } + + /// Check if a specific action is allowed + pub async fn is_allowed( + &self, + principal: &Principal, + action: &str, + resource: &Resource, + ) -> Result { + let req = AuthzRequest::new(principal.clone(), action, resource.clone()); + let decision = self.evaluate(&req).await?; + Ok(decision.is_allowed()) + } + + /// Check if a specific action is allowed with context + pub async fn is_allowed_with_context( + &self, + principal: &Principal, + action: &str, + resource: &Resource, + context: AuthzContext, + ) -> Result { + let req = + AuthzRequest::new(principal.clone(), action, resource.clone()).with_context(context); + let decision = self.evaluate(&req).await?; + Ok(decision.is_allowed()) + } + + /// Evaluate role permissions against the request + fn evaluate_role(&self, role: &Role, req: &AuthzRequest) -> Result { + let var_ctx = VariableContext::new(&req.principal, &req.resource, &req.context); + + for permission in &role.permissions { + // Check action pattern + if !matches_action(&permission.action, &req.action) { + continue; + } + + // Check resource pattern + let resource_path = req.resource.to_path(); + let pattern = var_ctx.substitute(&permission.resource_pattern); + if !matches_resource(&pattern, &resource_path) { + continue; + 
} + + // Check permission condition (if any) + if let Some(cond) = &permission.condition { + if !evaluate_condition(cond, &var_ctx)? { + continue; + } + } + + // All checks passed - permission matches + return Ok(true); + } + + Ok(false) + } + + /// Get effective bindings for a principal (with caching) + /// + /// If group_store is configured, this will also include bindings + /// for any groups the principal belongs to. + async fn get_effective_bindings( + &self, + principal: &PrincipalRef, + scope: &Scope, + ) -> Result> { + // Check cache first + if self.config.use_cache { + if let Some(bindings) = self.cache.get_bindings(principal) { + // Filter to scope + let effective: Vec<_> = bindings + .into_iter() + .filter(|b| b.scope.contains(scope) || scope.contains(&b.scope)) + .collect(); + return Ok(effective); + } + } + + // Fetch bindings for the principal + let mut bindings = self + .binding_store + .get_effective_bindings(principal, scope) + .await?; + + // If group store is available, expand group memberships + if let Some(group_store) = &self.group_store { + // Get groups the principal belongs to + let groups = group_store.list_groups(principal).await?; + + // Get bindings for each group + for group_id in groups { + let group_ref = PrincipalRef::new(PrincipalKind::Group, &group_id); + let group_bindings = self + .binding_store + .get_effective_bindings(&group_ref, scope) + .await?; + bindings.extend(group_bindings); + } + } + + // Cache if enabled + if self.config.use_cache { + self.cache.put_bindings(principal, bindings.clone()); + } + + Ok(bindings) + } + + /// Get a role (with caching) + async fn get_role(&self, role_ref: &str) -> Result> { + let name = role_ref.strip_prefix("roles/").unwrap_or(role_ref); + + // Check cache first + if self.config.use_cache { + if let Some(role) = self.cache.get_role(name) { + return Ok(Some(role)); + } + } + + // Fetch from store + let role = self.role_store.get(name).await?; + + // Cache if enabled and found + if 
self.config.use_cache { + if let Some(ref r) = role { + self.cache.put_role(r.clone()); + } + } + + Ok(role) + } + + /// Invalidate cache for a principal + pub fn invalidate_principal(&self, principal: &PrincipalRef) { + self.cache.invalidate_bindings(principal); + } + + /// Invalidate cache for a role + pub fn invalidate_role(&self, role_name: &str) { + self.cache.invalidate_role(role_name); + } + + /// Invalidate all caches + pub fn invalidate_all(&self) { + self.cache.invalidate_all(); + } +} + +/// Match action pattern against action +/// Supports wildcards: "*" matches everything, "compute:*" matches all compute actions +fn matches_action(pattern: &str, action: &str) -> bool { + if pattern == "*" { + return true; + } + + if !pattern.contains('*') { + return pattern == action; + } + + // Split by ":" + let pattern_parts: Vec<&str> = pattern.split(':').collect(); + let action_parts: Vec<&str> = action.split(':').collect(); + + // Compare each part + for (i, p) in pattern_parts.iter().enumerate() { + if *p == "*" { + // Wildcard matches rest + if i == pattern_parts.len() - 1 { + return true; + } + continue; + } + + if i >= action_parts.len() { + return false; + } + + if *p != action_parts[i] { + return false; + } + } + + pattern_parts.len() == action_parts.len() +} + +/// Match resource pattern against resource path +fn matches_resource(pattern: &str, path: &str) -> bool { + if pattern == "*" { + return true; + } + + // Handle trailing /* as "match all remaining" ONLY if there are no other wildcards + // This allows patterns like "project/p1/*" to match "project/p1/instances/vm-1" + if pattern.ends_with("/*") { + let prefix = &pattern[..pattern.len() - 2]; + // Only use special handling if prefix has no wildcards + if !prefix.contains('*') && !prefix.contains('?') { + return path.starts_with(prefix) + && path.len() > prefix.len() + && path.as_bytes()[prefix.len()] == b'/'; + } + } + + // Use glob matching for other patterns + glob_match::glob_match(pattern, path) 
+} + +#[cfg(test)] +mod tests { + use super::*; + use iam_store::Backend; + use iam_types::PrincipalKind; + + fn test_stores() -> (Arc, Arc) { + let backend = Arc::new(Backend::memory()); + let binding_store = Arc::new(BindingStore::new(backend.clone())); + let role_store = Arc::new(RoleStore::new(backend)); + (binding_store, role_store) + } + + #[test] + fn test_action_matching() { + assert!(matches_action("*", "compute:instances:create")); + assert!(matches_action("compute:*", "compute:instances:create")); + assert!(matches_action( + "compute:instances:*", + "compute:instances:create" + )); + assert!(matches_action( + "compute:instances:create", + "compute:instances:create" + )); + assert!(!matches_action( + "compute:instances:delete", + "compute:instances:create" + )); + assert!(!matches_action("storage:*", "compute:instances:create")); + } + + #[test] + fn test_resource_matching() { + assert!(matches_resource("*", "project/p1/instances/vm-1")); + assert!(matches_resource( + "project/*/instances/*", + "project/p1/instances/vm-1" + )); + assert!(matches_resource( + "project/p1/*", + "project/p1/instances/vm-1" + )); + assert!(!matches_resource( + "project/p2/*", + "project/p1/instances/vm-1" + )); + } + + #[tokio::test] + async fn test_evaluator_allow() { + let (binding_store, role_store) = test_stores(); + let cache = Arc::new(PolicyCache::default_config()); + + // Initialize builtin roles + role_store.init_builtin_roles().await.unwrap(); + + // Create a binding for alice as SystemAdmin + let alice = PrincipalRef::user("alice"); + binding_store + .create(&PolicyBinding::new( + "b1", + alice.clone(), + "roles/SystemAdmin", + Scope::System, + )) + .await + .unwrap(); + + let evaluator = PolicyEvaluator::new(binding_store, role_store, cache); + + let principal = Principal::new_user("alice", "Alice"); + let resource = Resource::new("instance", "vm-1", "org-1", "proj-1"); + + let req = AuthzRequest::new(principal, "compute:instances:delete", resource); + let decision 
= evaluator.evaluate(&req).await.unwrap(); + + assert!(decision.is_allowed()); + } + + #[tokio::test] + async fn test_evaluator_deny() { + let (binding_store, role_store) = test_stores(); + let cache = Arc::new(PolicyCache::default_config()); + + role_store.init_builtin_roles().await.unwrap(); + + // No bindings for bob + let evaluator = PolicyEvaluator::new(binding_store, role_store, cache); + + let principal = Principal::new_user("bob", "Bob"); + let resource = Resource::new("instance", "vm-1", "org-1", "proj-1"); + + let req = AuthzRequest::new(principal, "compute:instances:delete", resource); + let decision = evaluator.evaluate(&req).await.unwrap(); + + assert!(decision.is_denied()); + } + + #[tokio::test] + async fn test_evaluator_with_condition() { + let (binding_store, role_store) = test_stores(); + let cache = Arc::new(PolicyCache::default_config()); + + // Create a custom role with condition + // Note: Resource path format is org/{org_id}/project/{project_id}/{kind}/{id} + let role = Role::new( + "OwnerOnly", + Scope::project("*", "*"), + vec![ + Permission::new("compute:instances:*", "org/*/project/*/instance/*").with_condition( + iam_types::Condition::string_equals("resource.owner", "${principal.id}"), + ), + ], + ); + role_store.create_internal(&role).await.unwrap(); + + // Create binding + let alice = PrincipalRef::user("alice"); + binding_store + .create(&PolicyBinding::new( + "b1", + alice, + "roles/OwnerOnly", + Scope::project("proj-1", "org-1"), + )) + .await + .unwrap(); + + let evaluator = PolicyEvaluator::new(binding_store, role_store, cache); + + let principal = Principal::new_user("alice", "Alice"); + + // Alice owns this resource - should be allowed + let resource = Resource::new("instance", "vm-1", "org-1", "proj-1").with_owner("alice"); + let decision = evaluator + .is_allowed(&principal, "compute:instances:delete", &resource) + .await + .unwrap(); + assert!(decision); + + // Alice doesn't own this resource - should be denied + let resource 
= Resource::new("instance", "vm-2", "org-1", "proj-1").with_owner("bob"); + let decision = evaluator + .is_allowed(&principal, "compute:instances:delete", &resource) + .await + .unwrap(); + assert!(!decision); + } +} diff --git a/iam/crates/iam-authz/src/lib.rs b/iam/crates/iam-authz/src/lib.rs new file mode 100644 index 0000000..05fdc09 --- /dev/null +++ b/iam/crates/iam-authz/src/lib.rs @@ -0,0 +1,18 @@ +#![recursion_limit = "2048"] + +//! Authorization layer (PDP - Policy Decision Point) for IAM +//! +//! Provides policy evaluation engine with RBAC + ABAC support. + +pub mod cache; +pub mod condition; +pub mod context; +pub mod evaluator; + +pub use cache::{ + start_invalidation_listener, CacheInvalidation, CacheInvalidationSource, CacheStats, + LocalInvalidationSource, PolicyCache, PolicyCacheConfig, +}; +pub use condition::evaluate_condition; +pub use context::{AuthzContext, VariableContext}; +pub use evaluator::{AuthzDecision, AuthzRequest, PolicyEvaluator, PolicyEvaluatorConfig}; diff --git a/iam/crates/iam-client/Cargo.toml b/iam/crates/iam-client/Cargo.toml new file mode 100644 index 0000000..7086b2b --- /dev/null +++ b/iam/crates/iam-client/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "iam-client" +version.workspace = true +edition.workspace = true +description = "IAM client SDK" +license.workspace = true + +[dependencies] +iam-types = { path = "../iam-types" } +iam-api = { path = "../iam-api" } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +async-trait = { workspace = true } +tonic = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["full", "test-util"] } diff --git a/iam/crates/iam-client/src/client.rs b/iam/crates/iam-client/src/client.rs new file mode 100644 index 0000000..971a035 --- /dev/null +++ b/iam/crates/iam-client/src/client.rs @@ -0,0 +1,510 @@ +//! IAM client implementation +//! +//! 
Provides a thin gRPC client for interacting with the IAM service. + +use std::time::Duration; + +use iam_api::proto::{ + iam_admin_client::IamAdminClient, iam_authz_client::IamAuthzClient, + iam_token_client::IamTokenClient, AuthorizeRequest, AuthzContext, CreateBindingRequest, + CreatePrincipalRequest, CreateRoleRequest, DeleteBindingRequest, GetPrincipalRequest, + GetRoleRequest, IssueTokenRequest, ListBindingsRequest, ListPrincipalsRequest, + ListRolesRequest, Principal as ProtoPrincipal, PrincipalKind as ProtoPrincipalKind, + PrincipalRef as ProtoPrincipalRef, ResourceRef as ProtoResourceRef, RevokeTokenRequest, + Scope as ProtoScope, ValidateTokenRequest, +}; +use iam_types::{ + AuthMethod, Error, IamError, InternalTokenClaims, PolicyBinding, Principal, + PrincipalKind as TypesPrincipalKind, PrincipalRef, Resource, Result, Role, Scope, +}; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; + +/// Configuration for the IAM client +#[derive(Debug, Clone)] +pub struct IamClientConfig { + /// IAM server endpoint + pub endpoint: String, + /// Connection timeout + pub timeout_ms: u64, + /// Enable TLS + pub tls: bool, +} + +impl IamClientConfig { + /// Create a new config + pub fn new(endpoint: impl Into) -> Self { + Self { + endpoint: endpoint.into(), + timeout_ms: 5000, + tls: true, + } + } + + /// Set timeout + pub fn with_timeout(mut self, timeout_ms: u64) -> Self { + self.timeout_ms = timeout_ms; + self + } + + /// Disable TLS + pub fn without_tls(mut self) -> Self { + self.tls = false; + self + } +} + +/// IAM client +pub struct IamClient { + config: IamClientConfig, + channel: Channel, +} + +impl IamClient { + /// Create a new IAM client + pub async fn connect(config: IamClientConfig) -> Result { + let endpoint_uri = + if config.endpoint.starts_with("http://") || config.endpoint.starts_with("https://") { + config.endpoint.clone() + } else if config.tls { + format!("https://{}", config.endpoint) + } else { + format!("http://{}", config.endpoint) + }; 
+ + let mut endpoint = Endpoint::from_shared(endpoint_uri.clone()) + .map_err(|e| Error::Internal(e.to_string()))? + .timeout(Duration::from_millis(config.timeout_ms)); + + if config.tls { + endpoint = endpoint + .tls_config(ClientTlsConfig::new()) + .map_err(|e| Error::Internal(e.to_string()))?; + } + + let channel = endpoint + .connect() + .await + .map_err(|e| Error::Internal(e.to_string()))?; + + Ok(Self { config, channel }) + } + + fn authz_client(&self) -> IamAuthzClient { + IamAuthzClient::new(self.channel.clone()) + } + + fn admin_client(&self) -> IamAdminClient { + IamAdminClient::new(self.channel.clone()) + } + + fn token_client(&self) -> IamTokenClient { + IamTokenClient::new(self.channel.clone()) + } + + // ======================================================================== + // Authorization APIs + // ======================================================================== + + /// Check if an action is allowed + pub async fn authorize( + &self, + principal: &Principal, + action: &str, + resource: &Resource, + ) -> Result { + self.authorize_with_context( + principal, + action, + resource, + std::collections::HashMap::new(), + ) + .await + } + + /// Check authorization with additional context + pub async fn authorize_with_context( + &self, + principal: &Principal, + action: &str, + resource: &Resource, + context: std::collections::HashMap, + ) -> Result { + let mut client = self.authz_client(); + let request = AuthorizeRequest { + principal: Some(to_proto_principal_ref(&principal.to_ref())), + action: action.to_string(), + resource: Some(ProtoResourceRef { + kind: resource.kind.clone(), + id: resource.id.clone(), + org_id: resource.org_id.clone(), + project_id: resource.project_id.clone(), + owner_id: resource.owner_id.clone(), + node_id: resource.node_id.clone(), + region: resource.region.clone(), + tags: resource.tags.clone(), + }), + context: Some(AuthzContext { + source_ip: context.get("source_ip").cloned().unwrap_or_default(), + timestamp: 0, + 
http_method: context.get("http_method").cloned().unwrap_or_default(), + request_path: context.get("request_path").cloned().unwrap_or_default(), + metadata: context, + }), + }; + + let resp = client + .authorize(request) + .await + .map_err(map_status)? + .into_inner(); + + Ok(resp.allowed) + } + + // ======================================================================== + // Principal Management APIs + // ======================================================================== + + /// Create a new user + pub async fn create_user(&self, id: &str, name: &str) -> Result { + let mut client = self.admin_client(); + let req = CreatePrincipalRequest { + id: id.into(), + kind: ProtoPrincipalKind::User as i32, + name: name.into(), + org_id: None, + project_id: None, + email: None, + metadata: Default::default(), + }; + + let resp = client + .create_principal(req) + .await + .map_err(map_status)? + .into_inner(); + Ok(ProtoPrincipal::into(resp)) + } + + /// Get a principal + pub async fn get_principal(&self, principal_ref: &PrincipalRef) -> Result> { + let mut client = self.admin_client(); + let req = GetPrincipalRequest { + principal: Some(to_proto_principal_ref(principal_ref)), + }; + let resp = client.get_principal(req).await; + match resp { + Ok(r) => Ok(Some(ProtoPrincipal::into(r.into_inner()))), + Err(status) if status.code() == tonic::Code::NotFound => Ok(None), + Err(status) => Err(map_status(status)), + } + } + + /// Create a service account + pub async fn create_service_account( + &self, + id: &str, + name: &str, + project_id: &str, + ) -> Result { + let mut client = self.admin_client(); + let req = CreatePrincipalRequest { + id: id.into(), + kind: ProtoPrincipalKind::ServiceAccount as i32, + name: name.into(), + org_id: None, + project_id: Some(project_id.into()), + email: None, + metadata: Default::default(), + }; + let resp = client + .create_principal(req) + .await + .map_err(map_status)? 
+ .into_inner(); + Ok(ProtoPrincipal::into(resp)) + } + + /// List users + pub async fn list_users(&self) -> Result> { + let mut client = self.admin_client(); + let req = ListPrincipalsRequest { + kind: Some(ProtoPrincipalKind::User as i32), + org_id: None, + project_id: None, + page_size: 0, + page_token: String::new(), + }; + + let resp = client + .list_principals(req) + .await + .map_err(map_status)? + .into_inner(); + + Ok(resp + .principals + .into_iter() + .map(ProtoPrincipal::into) + .collect()) + } + + // ======================================================================== + // Role Management APIs + // ======================================================================== + + /// Get a role by name + pub async fn get_role(&self, name: &str) -> Result> { + let mut client = self.admin_client(); + let req = GetRoleRequest { name: name.into() }; + let resp = client.get_role(req).await; + match resp { + Ok(r) => Ok(Some(r.into_inner().into())), + Err(status) if status.code() == tonic::Code::NotFound => Ok(None), + Err(status) => Err(map_status(status)), + } + } + + /// List all roles + pub async fn list_roles(&self) -> Result> { + let mut client = self.admin_client(); + let req = ListRolesRequest { + scope: None, + include_builtin: true, + page_size: 0, + page_token: String::new(), + }; + let resp = client + .list_roles(req) + .await + .map_err(map_status)? + .into_inner(); + Ok(resp.roles.into_iter().map(Into::into).collect()) + } + + /// Create a custom role + pub async fn create_role(&self, role: &Role) -> Result { + let mut client = self.admin_client(); + let req = CreateRoleRequest { + name: role.name.clone(), + display_name: role.display_name.clone(), + description: role.description.clone(), + scope: Some(ProtoScope::from(role.scope.clone())), + permissions: role + .permissions + .clone() + .into_iter() + .map(Into::into) + .collect(), + }; + + let resp = client + .create_role(req) + .await + .map_err(map_status)? 
+ .into_inner(); + Ok(resp.into()) + } + + // ======================================================================== + // Binding Management APIs + // ======================================================================== + + /// Create a policy binding + pub async fn create_binding(&self, binding: &PolicyBinding) -> Result { + let mut client = self.admin_client(); + let req = CreateBindingRequest { + principal: Some(to_proto_principal_ref(&binding.principal_ref)), + role: binding.role_ref.clone(), + scope: Some(ProtoScope::from(binding.scope.clone())), + condition: binding.condition.clone().map(Into::into), + expires_at: binding.expires_at, + }; + + let resp = client + .create_binding(req) + .await + .map_err(map_status)? + .into_inner(); + Ok(resp.into()) + } + + /// Delete a policy binding + pub async fn delete_binding(&self, binding_id: &str) -> Result { + let mut client = self.admin_client(); + let req = DeleteBindingRequest { + id: binding_id.into(), + }; + let resp = client + .delete_binding(req) + .await + .map_err(map_status)? + .into_inner(); + Ok(resp.deleted) + } + + /// List bindings for a principal + pub async fn list_bindings_for_principal( + &self, + principal: &PrincipalRef, + ) -> Result> { + let mut client = self.admin_client(); + let req = ListBindingsRequest { + principal: Some(to_proto_principal_ref(principal)), + role: None, + scope: None, + include_disabled: false, + page_size: 0, + page_token: String::new(), + }; + + let resp = client + .list_bindings(req) + .await + .map_err(map_status)? 
+ .into_inner(); + Ok(resp.bindings.into_iter().map(Into::into).collect()) + } + + /// List bindings for a scope + pub async fn list_bindings_for_scope(&self, scope: &Scope) -> Result> { + let mut client = self.admin_client(); + let req = ListBindingsRequest { + principal: None, + role: None, + scope: Some(ProtoScope::from(scope.clone())), + include_disabled: false, + page_size: 0, + page_token: String::new(), + }; + + let resp = client + .list_bindings(req) + .await + .map_err(map_status)? + .into_inner(); + Ok(resp.bindings.into_iter().map(Into::into).collect()) + } + + // ======================================================================== + // Token APIs + // ======================================================================== + + /// Issue an internal token + pub async fn issue_token( + &self, + principal: &Principal, + roles: Vec, + scope: Scope, + ttl_seconds: u64, + ) -> Result { + let mut client = self.token_client(); + let req = IssueTokenRequest { + principal_id: principal.id.clone(), + principal_kind: match principal.kind { + TypesPrincipalKind::User => ProtoPrincipalKind::User as i32, + TypesPrincipalKind::ServiceAccount => ProtoPrincipalKind::ServiceAccount as i32, + TypesPrincipalKind::Group => ProtoPrincipalKind::Group as i32, + }, + roles, + scope: Some(ProtoScope::from(scope)), + ttl_seconds, + }; + + let resp = client + .issue_token(req) + .await + .map_err(map_status)? + .into_inner(); + Ok(resp.token) + } + + /// Validate a token + pub async fn validate_token(&self, token: &str) -> Result { + let mut client = self.token_client(); + let req = ValidateTokenRequest { + token: token.to_string(), + }; + let resp = client + .validate_token(req) + .await + .map_err(map_status)? 
+ .into_inner(); + + if !resp.valid { + return Err(Error::Iam(IamError::InvalidToken(resp.reason.clone()))); + } + + let claims = resp + .claims + .ok_or_else(|| Error::Iam(IamError::InvalidToken("missing claims".into())))?; + + let principal_kind = match ProtoPrincipalKind::try_from(claims.principal_kind) { + Ok(ProtoPrincipalKind::User) => TypesPrincipalKind::User, + Ok(ProtoPrincipalKind::ServiceAccount) => TypesPrincipalKind::ServiceAccount, + Ok(ProtoPrincipalKind::Group) => TypesPrincipalKind::Group, + _ => TypesPrincipalKind::User, + }; + + let auth_method = match claims.auth_method.as_str() { + "jwt" => AuthMethod::Jwt, + "mtls" => AuthMethod::Mtls, + "api_key" => AuthMethod::ApiKey, + "internal" => AuthMethod::Internal, + _ => AuthMethod::Unknown, + }; + + let mut internal_claims = InternalTokenClaims::new( + claims.principal_id, + principal_kind, + claims.principal_name, + claims.scope.map(ProtoScope::into).unwrap_or(Scope::System), + claims.session_id, + ) + .with_roles(claims.roles) + .with_timestamps(claims.iat, claims.exp) + .with_auth_method(auth_method); + + if let Some(org_id) = claims.org_id { + internal_claims = internal_claims.with_org_id(&org_id); + } + if let Some(project_id) = claims.project_id { + internal_claims = internal_claims.with_project_id(&project_id); + } + if let Some(node_id) = claims.node_id { + internal_claims = internal_claims.with_node_id(&node_id); + } + + Ok(internal_claims) + } + + /// Revoke a token + pub async fn revoke_token(&self, token: &str) -> Result<()> { + let mut client = self.token_client(); + let req = RevokeTokenRequest { + token: token.to_string(), + reason: "client revoke".into(), + }; + client + .revoke_token(req) + .await + .map_err(map_status)? 
+ .into_inner(); + Ok(()) + } +} + +fn map_status(status: tonic::Status) -> Error { + Error::Internal(status.to_string()) +} + +fn to_proto_principal_ref(principal_ref: &PrincipalRef) -> ProtoPrincipalRef { + ProtoPrincipalRef { + kind: match principal_ref.kind { + TypesPrincipalKind::User => ProtoPrincipalKind::User as i32, + TypesPrincipalKind::ServiceAccount => ProtoPrincipalKind::ServiceAccount as i32, + TypesPrincipalKind::Group => ProtoPrincipalKind::Group as i32, + }, + id: principal_ref.id.clone(), + } +} diff --git a/iam/crates/iam-client/src/lib.rs b/iam/crates/iam-client/src/lib.rs new file mode 100644 index 0000000..e1aebcc --- /dev/null +++ b/iam/crates/iam-client/src/lib.rs @@ -0,0 +1,7 @@ +//! IAM Client SDK +//! +//! Provides a client for interacting with the IAM service. + +pub mod client; + +pub use client::IamClient; diff --git a/iam/crates/iam-server/Cargo.toml b/iam/crates/iam-server/Cargo.toml new file mode 100644 index 0000000..18653e9 --- /dev/null +++ b/iam/crates/iam-server/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "iam-server" +version.workspace = true +edition.workspace = true +description = "IAM server binary" +license.workspace = true + +[[bin]] +name = "iam-server" +path = "src/main.rs" + +[dependencies] +iam-types = { path = "../iam-types" } +iam-store = { path = "../iam-store" } +iam-authn = { path = "../iam-authn" } +iam-authz = { path = "../iam-authz" } +iam-api = { path = "../iam-api" } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true, features = ["full"] } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } +tonic = { workspace = true } +tonic-health = { workspace = true } +toml = { workspace = true } +clap = { workspace = true } +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["full", "test-util"] } diff --git 
a/iam/crates/iam-server/src/config.rs b/iam/crates/iam-server/src/config.rs new file mode 100644 index 0000000..16bcea7 --- /dev/null +++ b/iam/crates/iam-server/src/config.rs @@ -0,0 +1,347 @@ +//! Server configuration +//! +//! Configuration for the IAM server including storage backend, authentication, and server settings. + +use std::net::SocketAddr; +use std::path::Path; + +use serde::{Deserialize, Serialize}; + +/// Main server configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ServerConfig { + /// Server settings + pub server: ServerSettings, + + /// Storage backend configuration + pub store: StoreConfig, + + /// Authentication configuration + pub authn: AuthnConfig, + + /// Logging configuration + #[serde(default)] + pub logging: LoggingConfig, +} + +impl ServerConfig { + /// Load configuration from a TOML file + pub fn from_file(path: impl AsRef) -> Result { + let content = std::fs::read_to_string(path).map_err(|e| ConfigError::Io(e.to_string()))?; + toml::from_str(&content).map_err(|e| ConfigError::Parse(e.to_string())) + } + + /// Load configuration from environment variables + pub fn from_env() -> Result { + // Try to find config file from environment + if let Ok(path) = std::env::var("IAM_CONFIG") { + return Self::from_file(path); + } + + // Default paths to try + let default_paths = ["iam.toml", "/etc/iam/iam.toml", "config/iam.toml"]; + + for path in &default_paths { + if Path::new(path).exists() { + return Self::from_file(path); + } + } + + // Use default configuration + Ok(Self::default()) + } + + /// Create a minimal configuration for testing + pub fn for_testing() -> Self { + Self { + server: ServerSettings { + addr: "127.0.0.1:50051".parse().unwrap(), + tls: None, + }, + store: StoreConfig { + backend: BackendKind::Memory, + chainfire_endpoints: None, + flaredb_endpoint: None, + flaredb_namespace: None, + }, + authn: AuthnConfig { + jwt: None, + internal_token: InternalTokenConfig { + signing_key: 
"test-signing-key-for-development-only".into(), + issuer: "iam-test".into(), + default_ttl_seconds: 3600, + max_ttl_seconds: 86400, + }, + }, + logging: LoggingConfig::default(), + } + } +} + +impl Default for ServerConfig { + fn default() -> Self { + Self { + server: ServerSettings::default(), + store: StoreConfig::default(), + authn: AuthnConfig::default(), + logging: LoggingConfig::default(), + } + } +} + +/// Server settings +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ServerSettings { + /// Listen address + #[serde(default = "default_addr")] + pub addr: SocketAddr, + + /// TLS configuration + pub tls: Option, +} + +impl Default for ServerSettings { + fn default() -> Self { + Self { + addr: default_addr(), + tls: None, + } + } +} + +fn default_addr() -> SocketAddr { + "0.0.0.0:50051".parse().unwrap() +} + +/// TLS configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TlsConfig { + /// Path to certificate file (PEM) + pub cert_file: String, + + /// Path to private key file (PEM) + pub key_file: String, + + /// Path to CA certificate file for client verification (optional) + pub ca_file: Option, + + /// Require client certificates + #[serde(default)] + pub require_client_cert: bool, +} + +/// Storage backend configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StoreConfig { + /// Backend type + #[serde(default)] + pub backend: BackendKind, + + /// Chainfire endpoints (for chainfire backend) + pub chainfire_endpoints: Option>, + + /// FlareDB endpoint (for flaredb backend) + pub flaredb_endpoint: Option, + + /// FlareDB namespace (for flaredb backend) + pub flaredb_namespace: Option, +} + +impl Default for StoreConfig { + fn default() -> Self { + Self { + backend: BackendKind::Memory, + chainfire_endpoints: None, + flaredb_endpoint: None, + flaredb_namespace: Some("iam".into()), + } + } +} + +/// Backend type +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub 
enum BackendKind { + /// In-memory backend (for testing) + #[default] + Memory, + /// Chainfire distributed KV + Chainfire, + /// FlareDB distributed database + FlareDb, +} + +/// Authentication configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AuthnConfig { + /// JWT/OIDC configuration + pub jwt: Option, + + /// Internal token configuration + pub internal_token: InternalTokenConfig, +} + +impl Default for AuthnConfig { + fn default() -> Self { + Self { + jwt: None, + internal_token: InternalTokenConfig::default(), + } + } +} + +/// JWT/OIDC configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JwtConfig { + /// JWKS endpoint URL + pub jwks_url: String, + + /// Expected issuer + pub issuer: String, + + /// Expected audience + pub audience: String, + + /// JWKS cache TTL in seconds + #[serde(default = "default_jwks_cache_ttl")] + pub jwks_cache_ttl_seconds: u64, +} + +fn default_jwks_cache_ttl() -> u64 { + 3600 +} + +/// Internal token configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InternalTokenConfig { + /// Signing key (base64 encoded or raw string) + pub signing_key: String, + + /// Token issuer identifier + #[serde(default = "default_issuer")] + pub issuer: String, + + /// Default token TTL in seconds + #[serde(default = "default_token_ttl")] + pub default_ttl_seconds: u64, + + /// Maximum token TTL in seconds + #[serde(default = "default_max_token_ttl")] + pub max_ttl_seconds: u64, +} + +impl Default for InternalTokenConfig { + fn default() -> Self { + Self { + signing_key: String::new(), + issuer: default_issuer(), + default_ttl_seconds: default_token_ttl(), + max_ttl_seconds: default_max_token_ttl(), + } + } +} + +fn default_issuer() -> String { + "iam".into() +} + +fn default_token_ttl() -> u64 { + 3600 // 1 hour +} + +fn default_max_token_ttl() -> u64 { + 86400 * 7 // 7 days +} + +/// Logging configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LoggingConfig { + 
/// Log level + #[serde(default = "default_log_level")] + pub level: String, + + /// Log format (json or text) + #[serde(default = "default_log_format")] + pub format: String, +} + +impl Default for LoggingConfig { + fn default() -> Self { + Self { + level: default_log_level(), + format: default_log_format(), + } + } +} + +fn default_log_level() -> String { + "info".into() +} + +fn default_log_format() -> String { + "text".into() +} + +/// Configuration errors +#[derive(Debug)] +pub enum ConfigError { + /// IO error + Io(String), + /// Parse error + Parse(String), + /// Validation error + Validation(String), +} + +impl std::fmt::Display for ConfigError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ConfigError::Io(e) => write!(f, "IO error: {}", e), + ConfigError::Parse(e) => write!(f, "Parse error: {}", e), + ConfigError::Validation(e) => write!(f, "Validation error: {}", e), + } + } +} + +impl std::error::Error for ConfigError {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_config() { + let config = ServerConfig::default(); + assert_eq!(config.server.addr, "0.0.0.0:50051".parse().unwrap()); + assert!(matches!(config.store.backend, BackendKind::Memory)); + } + + #[test] + fn test_for_testing() { + let config = ServerConfig::for_testing(); + assert!(config.authn.internal_token.signing_key.len() > 0); + } + + #[test] + fn test_toml_parse() { + let toml = r#" +[server] +addr = "127.0.0.1:8080" + +[store] +backend = "memory" + +[authn.internal_token] +signing_key = "secret" +issuer = "test" +default_ttl_seconds = 1800 + +[logging] +level = "debug" +"#; + let config: ServerConfig = toml::from_str(toml).unwrap(); + assert_eq!(config.server.addr, "127.0.0.1:8080".parse().unwrap()); + assert_eq!(config.authn.internal_token.default_ttl_seconds, 1800); + assert_eq!(config.logging.level, "debug"); + } +} diff --git a/iam/crates/iam-server/src/main.rs b/iam/crates/iam-server/src/main.rs new file mode 
100644 index 0000000..5e3a97a --- /dev/null +++ b/iam/crates/iam-server/src/main.rs @@ -0,0 +1,257 @@ +//! IAM Server +//! +//! The main entry point for the IAM gRPC server. + +mod config; + +use std::sync::Arc; +use std::time::Duration; + +use clap::Parser; +use metrics_exporter_prometheus::PrometheusBuilder; +use tonic::transport::Server; +use tonic_health::server::health_reporter; +use tracing::{info, warn}; + +use iam_api::{ + iam_admin_server::IamAdminServer, iam_authz_server::IamAuthzServer, + iam_token_server::IamTokenServer, IamAdminService, IamAuthzService, IamTokenService, +}; +use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey}; +use iam_authz::{PolicyCache, PolicyCacheConfig, PolicyEvaluator}; +use iam_store::{Backend, BackendConfig, BindingStore, PrincipalStore, RoleStore, TokenStore}; + +use config::{BackendKind, ServerConfig}; + +/// IAM Server +#[derive(Parser, Debug)] +#[command(name = "iam-server")] +#[command(about = "Identity and Access Management Server")] +struct Args { + /// Configuration file path + #[arg(short, long)] + config: Option, + + /// Listen address (overrides config) + #[arg(long)] + addr: Option, + + /// Log level (overrides config) + #[arg(long)] + log_level: Option, + + /// Metrics port for Prometheus scraping (default: 9090) + #[arg(long, default_value = "9090")] + metrics_port: u16, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + + // Load configuration + let mut config = match &args.config { + Some(path) => ServerConfig::from_file(path)?, + None => ServerConfig::from_env()?, + }; + + // Apply CLI overrides + if let Some(addr) = args.addr { + config.server.addr = addr.parse()?; + } + if let Some(level) = args.log_level { + config.logging.level = level; + } + + // Initialize logging + init_logging(&config.logging.level); + + // Initialize Prometheus metrics exporter + // Serves metrics at http://0.0.0.0:{metrics_port}/metrics + let metrics_addr = format!("0.0.0.0:{}", 
args.metrics_port); + let builder = PrometheusBuilder::new(); + builder + .with_http_listener(metrics_addr.parse::()?) + .install() + .expect("Failed to install Prometheus metrics exporter"); + + info!( + "Prometheus metrics available at http://{}/metrics", + metrics_addr + ); + + // Register common metrics + metrics::describe_counter!( + "iam_authz_requests_total", + "Total number of authorization requests" + ); + metrics::describe_counter!( + "iam_authz_allowed_total", + "Total number of allowed authorization requests" + ); + metrics::describe_counter!( + "iam_authz_denied_total", + "Total number of denied authorization requests" + ); + metrics::describe_counter!( + "iam_token_issued_total", + "Total number of tokens issued" + ); + metrics::describe_histogram!( + "iam_request_duration_seconds", + "Request duration in seconds" + ); + + info!("Starting IAM server on {}", config.server.addr); + + // Create backend + let backend = create_backend(&config.store).await?; + let backend = Arc::new(backend); + + // Create stores + let principal_store = Arc::new(PrincipalStore::new(backend.clone())); + let role_store = Arc::new(RoleStore::new(backend.clone())); + let binding_store = Arc::new(BindingStore::new(backend.clone())); + let token_store = Arc::new(TokenStore::new(backend.clone())); + + // Initialize builtin roles + info!("Initializing builtin roles..."); + role_store.init_builtin_roles().await?; + + // Create policy cache + let cache_config = PolicyCacheConfig { + binding_ttl: Duration::from_secs(300), + role_ttl: Duration::from_secs(600), + max_binding_entries: 10000, + max_role_entries: 1000, + }; + let cache = Arc::new(PolicyCache::new(cache_config)); + + // Create evaluator + let evaluator = Arc::new(PolicyEvaluator::new( + binding_store.clone(), + role_store.clone(), + cache, + )); + + // Create token service + let signing_key = if config.authn.internal_token.signing_key.is_empty() { + warn!("No signing key configured, generating random key"); + 
SigningKey::generate("iam-key-1") + } else { + SigningKey::new( + "iam-key-1", + config.authn.internal_token.signing_key.as_bytes().to_vec(), + ) + }; + + let token_config = InternalTokenConfig::new(signing_key, &config.authn.internal_token.issuer) + .with_default_ttl(Duration::from_secs( + config.authn.internal_token.default_ttl_seconds, + )) + .with_max_ttl(Duration::from_secs( + config.authn.internal_token.max_ttl_seconds, + )); + + let token_service = Arc::new(InternalTokenService::new(token_config)); + + // Create gRPC services + let authz_service = IamAuthzService::new(evaluator, principal_store.clone()); + let token_grpc_service = + IamTokenService::new(token_service, principal_store.clone(), token_store.clone()); + let admin_service = IamAdminService::new( + principal_store.clone(), + role_store.clone(), + binding_store.clone(), + ); + + info!("IAM server ready, starting gRPC listeners..."); + + // Create health service (for K8s liveness/readiness probes) + // Uses grpc.health.v1.Health standard protocol + let (mut health_reporter, health_service) = health_reporter(); + + // Mark services as serving + health_reporter + .set_serving::>() + .await; + health_reporter + .set_serving::>() + .await; + health_reporter + .set_serving::>() + .await; + + // Spawn health monitoring task + let backend_for_health = backend.clone(); + tokio::spawn(async move { + // Periodically check backend connectivity + loop { + tokio::time::sleep(Duration::from_secs(30)).await; + // Backend health check could be added here if Backend exposes a ping method + let _ = backend_for_health; // Keep reference alive + } + }); + + info!("Health check service enabled (grpc.health.v1.Health)"); + + Server::builder() + .add_service(health_service) + .add_service(IamAuthzServer::new(authz_service)) + .add_service(IamTokenServer::new(token_grpc_service)) + .add_service(IamAdminServer::new(admin_service)) + .serve(config.server.addr) + .await?; + + Ok(()) +} + +async fn create_backend( + config: 
&config::StoreConfig, +) -> Result> { + let backend_config = match config.backend { + BackendKind::Memory => { + info!("Using in-memory backend"); + BackendConfig::Memory + } + BackendKind::Chainfire => { + let endpoints = config + .chainfire_endpoints + .clone() + .ok_or("chainfire_endpoints required for chainfire backend")?; + info!("Using Chainfire backend with endpoints: {:?}", endpoints); + BackendConfig::Chainfire { endpoints } + } + BackendKind::FlareDb => { + let endpoint = config + .flaredb_endpoint + .clone() + .ok_or("flaredb_endpoint required for flaredb backend")?; + let namespace = config + .flaredb_namespace + .clone() + .unwrap_or_else(|| "iam".into()); + info!( + "Using FlareDB backend at {} (namespace: {})", + endpoint, namespace + ); + BackendConfig::FlareDb { + endpoint, + namespace, + } + } + }; + + Backend::new(backend_config).await.map_err(|e| e.into()) +} + +fn init_logging(level: &str) { + use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; + + let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(level)); + + tracing_subscriber::registry() + .with(filter) + .with(tracing_subscriber::fmt::layer()) + .init(); +} diff --git a/iam/crates/iam-store/Cargo.toml b/iam/crates/iam-store/Cargo.toml new file mode 100644 index 0000000..7567354 --- /dev/null +++ b/iam/crates/iam-store/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "iam-store" +version.workspace = true +edition.workspace = true +description = "Storage abstraction layer for IAM" +license.workspace = true + +[dependencies] +iam-types = { path = "../iam-types" } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +async-trait = { workspace = true } +bytes = { workspace = true } +chainfire-client = { workspace = true } +flaredb-client = { workspace = true } +tonic = { workspace = true } + +[dev-dependencies] +tokio = { 
workspace = true, features = ["full", "test-util"] } diff --git a/iam/crates/iam-store/src/backend.rs b/iam/crates/iam-store/src/backend.rs new file mode 100644 index 0000000..f952f70 --- /dev/null +++ b/iam/crates/iam-store/src/backend.rs @@ -0,0 +1,1047 @@ +//! Backend abstraction for IAM storage +//! +//! Provides a unified interface for storing IAM data in either +//! Chainfire (distributed KV) or FlareDB (distributed DB) backends. + +use async_trait::async_trait; +use bytes::Bytes; +use serde::{de::DeserializeOwned, Serialize}; +use tokio::sync::Mutex; +use tonic::Status; + +use iam_types::{Error, Result, StorageError}; + +use chainfire_client::{ + CasOutcome, Client as ChainfireClient, ClientError as ChainfireClientError, +}; +use flaredb_client::RdbClient; + +/// Key-value pair with version +#[derive(Debug, Clone)] +pub struct KvPair { + pub key: Bytes, + pub value: Bytes, + pub version: u64, +} + +/// Result of a CAS (Compare-And-Swap) operation +#[derive(Debug, Clone)] +pub enum CasResult { + /// CAS succeeded, returning the new version + Success(u64), + /// CAS failed due to version mismatch + Conflict { expected: u64, actual: u64 }, + /// Key not found (when expected version > 0) + NotFound, +} + +/// Backend trait for storage operations +#[async_trait] +pub trait StorageBackend: Send + Sync { + /// Get a value by key + async fn get(&self, key: &[u8]) -> Result>; + + /// Put a value (unconditional write) + async fn put(&self, key: &[u8], value: &[u8]) -> Result; + + /// Compare-and-swap write + /// - If expected_version is 0, only succeeds if key doesn't exist + /// - Otherwise, only succeeds if current version matches expected_version + async fn cas(&self, key: &[u8], expected_version: u64, value: &[u8]) -> Result; + + /// Delete a key + async fn delete(&self, key: &[u8]) -> Result; + + /// Scan keys with a prefix + async fn scan_prefix(&self, prefix: &[u8], limit: u32) -> Result>; + + /// Scan keys in a range [start, end) + async fn scan_range(&self, 
start: &[u8], end: &[u8], limit: u32) -> Result>; + + /// Paginated scan by prefix; returns items and an optional cursor for the next page + async fn scan_prefix_paged( + &self, + prefix: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + // Fallback implementation using scan_prefix (no pagination cursor) + let mut effective_prefix = prefix.to_vec(); + if let Some(start_after) = start_after { + effective_prefix = start_after.to_vec(); + effective_prefix.push(0); // ensure greater than start_after + } + let items = self.scan_prefix(&effective_prefix, limit).await?; + Ok((items, None)) + } + + /// Paginated range scan [start, end); returns items and an optional cursor for the next page + async fn scan_range_paged( + &self, + start: &[u8], + end: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + let mut effective_start = start.to_vec(); + if let Some(after) = start_after { + effective_start = after.to_vec(); + effective_start.push(0); + } + let items = self.scan_range(&effective_start, end, limit).await?; + Ok((items, None)) + } +} + +/// Backend configuration +#[derive(Debug, Clone)] +pub enum BackendConfig { + /// Chainfire backend + Chainfire { + /// Chainfire endpoint addresses + endpoints: Vec, + }, + /// FlareDB backend + FlareDb { + /// FlareDB endpoint address + endpoint: String, + /// Namespace for IAM data + namespace: String, + }, + /// In-memory backend (for testing) + Memory, +} + +/// Backend enum wrapping different implementations +pub enum Backend { + /// Chainfire backend + Chainfire(ChainfireBackend), + /// FlareDB backend + FlareDb(FlareDbBackend), + /// In-memory backend (for testing) + Memory(MemoryBackend), +} + +impl Backend { + /// Create a new backend from configuration + pub async fn new(config: BackendConfig) -> Result { + match config { + BackendConfig::Chainfire { endpoints } => { + let backend = ChainfireBackend::new(endpoints).await?; + Ok(Backend::Chainfire(backend)) + } 
+ BackendConfig::FlareDb { + endpoint, + namespace, + } => { + let backend = FlareDbBackend::new(endpoint, namespace).await?; + Ok(Backend::FlareDb(backend)) + } + BackendConfig::Memory => Ok(Backend::Memory(MemoryBackend::new())), + } + } + + /// Create an in-memory backend for testing + pub fn memory() -> Self { + Backend::Memory(MemoryBackend::new()) + } +} + +#[async_trait] +impl StorageBackend for Backend { + async fn get(&self, key: &[u8]) -> Result> { + match self { + Backend::Chainfire(b) => b.get(key).await, + Backend::FlareDb(b) => b.get(key).await, + Backend::Memory(b) => b.get(key).await, + } + } + + async fn put(&self, key: &[u8], value: &[u8]) -> Result { + match self { + Backend::Chainfire(b) => b.put(key, value).await, + Backend::FlareDb(b) => b.put(key, value).await, + Backend::Memory(b) => b.put(key, value).await, + } + } + + async fn cas(&self, key: &[u8], expected_version: u64, value: &[u8]) -> Result { + match self { + Backend::Chainfire(b) => b.cas(key, expected_version, value).await, + Backend::FlareDb(b) => b.cas(key, expected_version, value).await, + Backend::Memory(b) => b.cas(key, expected_version, value).await, + } + } + + async fn delete(&self, key: &[u8]) -> Result { + match self { + Backend::Chainfire(b) => b.delete(key).await, + Backend::FlareDb(b) => b.delete(key).await, + Backend::Memory(b) => b.delete(key).await, + } + } + + async fn scan_prefix(&self, prefix: &[u8], limit: u32) -> Result> { + match self { + Backend::Chainfire(b) => b.scan_prefix(prefix, limit).await, + Backend::FlareDb(b) => b.scan_prefix(prefix, limit).await, + Backend::Memory(b) => b.scan_prefix(prefix, limit).await, + } + } + + async fn scan_range(&self, start: &[u8], end: &[u8], limit: u32) -> Result> { + match self { + Backend::Chainfire(b) => b.scan_range(start, end, limit).await, + Backend::FlareDb(b) => b.scan_range(start, end, limit).await, + Backend::Memory(b) => b.scan_range(start, end, limit).await, + } + } + + async fn scan_prefix_paged( + &self, + 
prefix: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + match self { + Backend::Chainfire(b) => b.scan_prefix_paged(prefix, start_after, limit).await, + Backend::FlareDb(b) => b.scan_prefix_paged(prefix, start_after, limit).await, + Backend::Memory(b) => b.scan_prefix_paged(prefix, start_after, limit).await, + } + } + + async fn scan_range_paged( + &self, + start: &[u8], + end: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + match self { + Backend::Chainfire(b) => b.scan_range_paged(start, end, start_after, limit).await, + Backend::FlareDb(b) => b.scan_range_paged(start, end, start_after, limit).await, + Backend::Memory(b) => b.scan_range_paged(start, end, start_after, limit).await, + } + } +} + +// ============================================================================ +// Chainfire Backend Implementation +// ============================================================================ + +/// Chainfire backend implementation +pub struct ChainfireBackend { + endpoints: Vec, + client: Mutex, +} + +impl ChainfireBackend { + /// Create a new Chainfire backend + pub async fn new(endpoints: Vec) -> Result { + let client = Self::connect_any(&endpoints).await?; + Ok(Self { + endpoints, + client: Mutex::new(client), + }) + } + + async fn connect_any(endpoints: &[String]) -> Result { + let mut last_err = None; + for ep in endpoints { + let addr = if ep.starts_with("http://") || ep.starts_with("https://") { + ep.clone() + } else { + format!("http://{}", ep) + }; + match ChainfireClient::connect(addr.clone()).await { + Ok(client) => return Ok(client), + Err(e) => { + last_err = Some(e); + } + } + } + + Err(Error::Storage(StorageError::Connection( + last_err + .map(|e| e.to_string()) + .unwrap_or_else(|| "no endpoints available".into()), + ))) + } +} + +#[async_trait] +impl StorageBackend for ChainfireBackend { + async fn get(&self, key: &[u8]) -> Result> { + let mut client = self.client.lock().await; 
+ let result = client + .get_with_revision(key) + .await + .map_err(map_chainfire_error)?; + Ok(result.map(|(v, rev)| (Bytes::from(v), rev))) + } + + async fn put(&self, key: &[u8], value: &[u8]) -> Result { + let mut client = self.client.lock().await; + client.put(key, value).await.map_err(map_chainfire_error) + } + + async fn cas(&self, key: &[u8], expected_version: u64, value: &[u8]) -> Result { + let mut client = self.client.lock().await; + let outcome: CasOutcome = client + .compare_and_swap(key, expected_version, value) + .await + .map_err(map_chainfire_error)?; + + if outcome.success { + return Ok(CasResult::Success(outcome.new_version)); + } + + if expected_version == 0 { + if outcome.current_version == 0 { + Ok(CasResult::NotFound) + } else { + Ok(CasResult::Conflict { + expected: 0, + actual: outcome.current_version, + }) + } + } else { + Ok(CasResult::Conflict { + expected: expected_version, + actual: outcome.current_version, + }) + } + } + + async fn delete(&self, key: &[u8]) -> Result { + let mut client = self.client.lock().await; + client.delete(key).await.map_err(map_chainfire_error) + } + + async fn scan_prefix(&self, prefix: &[u8], limit: u32) -> Result> { + let mut client = self.client.lock().await; + let (results, _) = client + .scan_prefix(prefix, limit as i64) + .await + .map_err(map_chainfire_error)?; + + Ok(results + .into_iter() + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect()) + } + + async fn scan_range(&self, start: &[u8], end: &[u8], limit: u32) -> Result> { + let mut client = self.client.lock().await; + let (results, _) = client + .scan_range(start, end, limit as i64) + .await + .map_err(map_chainfire_error)?; + + Ok(results + .into_iter() + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect()) + } + + async fn scan_prefix_paged( + &self, + prefix: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, 
Option)> { + let mut start = prefix.to_vec(); + if let Some(after) = start_after { + start = after.to_vec(); + start.push(0); + } + let end = prefix_end(prefix); + + let mut client = self.client.lock().await; + let (results, next) = client + .scan_range(&start, &end, limit as i64) + .await + .map_err(map_chainfire_error)?; + + let kvs = results + .into_iter() + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect(); + + Ok((kvs, next.map(Bytes::from))) + } + + async fn scan_range_paged( + &self, + start: &[u8], + end: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + let mut effective_start = start.to_vec(); + if let Some(after) = start_after { + effective_start = after.to_vec(); + effective_start.push(0); + } + + let mut client = self.client.lock().await; + let (results, next) = client + .scan_range(&effective_start, end, limit as i64) + .await + .map_err(map_chainfire_error)?; + + let kvs = results + .into_iter() + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect(); + + Ok((kvs, next.map(Bytes::from))) + } +} + +fn map_chainfire_error(err: ChainfireClientError) -> Error { + match err { + ChainfireClientError::Connection(msg) => Error::Storage(StorageError::Connection(msg)), + ChainfireClientError::Transport(e) => { + Error::Storage(StorageError::Connection(e.to_string())) + } + ChainfireClientError::Rpc(status) => { + Error::Storage(StorageError::Backend(status.to_string())) + } + other => Error::Storage(StorageError::Backend(other.to_string())), + } +} + +fn map_flaredb_error(err: Status) -> Error { + if err.code() == tonic::Code::Unavailable { + Error::Storage(StorageError::Connection(err.to_string())) + } else { + Error::Storage(StorageError::Backend(err.to_string())) + } +} + +fn prefix_end(prefix: &[u8]) -> Vec { + let mut end = prefix.to_vec(); + for i in (0..end.len()).rev() { + if end[i] < 0xff { + end[i] += 
1; + end.truncate(i + 1); + return end; + } + } + Vec::new() +} + +// ============================================================================ +// FlareDB Backend Implementation +// ============================================================================ + +/// FlareDB backend implementation +pub struct FlareDbBackend { + endpoint: String, + namespace: String, + client: Mutex, +} + +impl FlareDbBackend { + /// Create a new FlareDB backend + pub async fn new(endpoint: String, namespace: String) -> Result { + let client = RdbClient::connect_with_pd_namespace( + endpoint.clone(), + endpoint.clone(), + namespace.clone(), + ) + .await + .map_err(|e| Error::Storage(StorageError::Connection(e.to_string())))?; + + Ok(Self { + endpoint, + namespace, + client: Mutex::new(client), + }) + } +} + +#[async_trait] +impl StorageBackend for FlareDbBackend { + async fn get(&self, key: &[u8]) -> Result> { + let mut client = self.client.lock().await; + let res = client + .cas_get(key.to_vec()) + .await + .map_err(map_flaredb_error)?; + + Ok(res.and_then(|(ver, val)| { + if val.is_empty() { + None + } else { + Some((Bytes::from(val), ver)) + } + })) + } + + async fn put(&self, key: &[u8], value: &[u8]) -> Result { + let key = key.to_vec(); + let value = value.to_vec(); + let mut attempts = 0; + + loop { + // Get current version (treat tombstone as absent) + let current = { + let mut client = self.client.lock().await; + client + .cas_get(key.clone()) + .await + .map_err(map_flaredb_error)? + }; + let mut expected_version = 0; + if let Some((ver, val)) = current { + if !val.is_empty() { + expected_version = ver; + } + } + + let (success, current_version, new_version) = { + let mut client = self.client.lock().await; + client + .cas(key.clone(), value.clone(), expected_version) + .await + .map_err(map_flaredb_error)? 
+ }; + + if success { + return Ok(new_version); + } + + attempts += 1; + if attempts >= 3 { + return Err(Error::Storage(StorageError::CasConflict { + expected: expected_version, + actual: current_version, + })); + } + } + } + + async fn cas(&self, key: &[u8], expected_version: u64, value: &[u8]) -> Result { + let mut client = self.client.lock().await; + let (success, current_version, new_version) = client + .cas(key.to_vec(), value.to_vec(), expected_version) + .await + .map_err(map_flaredb_error)?; + + if success { + Ok(CasResult::Success(new_version)) + } else if expected_version == 0 { + if current_version == 0 { + Ok(CasResult::NotFound) + } else { + Ok(CasResult::Conflict { + expected: 0, + actual: current_version, + }) + } + } else { + Ok(CasResult::Conflict { + expected: expected_version, + actual: current_version, + }) + } + } + + async fn delete(&self, key: &[u8]) -> Result { + // FlareDB does not expose a delete; use a tombstone (empty value) + let (current_version, value) = { + let mut client = self.client.lock().await; + let current = client + .cas_get(key.to_vec()) + .await + .map_err(map_flaredb_error)?; + + match current { + Some((ver, val)) => (ver, val), + None => return Ok(false), + } + }; + + if value.is_empty() { + return Ok(false); + } + + let mut client = self.client.lock().await; + let (success, _, _) = client + .cas(key.to_vec(), Vec::new(), current_version) + .await + .map_err(map_flaredb_error)?; + + Ok(success) + } + + async fn scan_prefix(&self, prefix: &[u8], limit: u32) -> Result> { + let start = prefix.to_vec(); + let end = prefix_end(prefix); + + let mut client = self.client.lock().await; + let (entries, _) = client + .cas_scan(start, end, limit) + .await + .map_err(map_flaredb_error)?; + + Ok(entries + .into_iter() + .filter(|(_, val, _)| !val.is_empty()) + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect()) + } + + async fn scan_range(&self, start: &[u8], end: &[u8], limit: 
u32) -> Result> { + let mut client = self.client.lock().await; + let (entries, _) = client + .cas_scan(start.to_vec(), end.to_vec(), limit) + .await + .map_err(map_flaredb_error)?; + + Ok(entries + .into_iter() + .filter(|(_, val, _)| !val.is_empty()) + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect()) + } + + async fn scan_prefix_paged( + &self, + prefix: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + let mut start = prefix.to_vec(); + if let Some(after) = start_after { + start = after.to_vec(); + start.push(0); + } + let end = prefix_end(prefix); + + let mut client = self.client.lock().await; + let (entries, next) = client + .cas_scan(start, end, limit) + .await + .map_err(map_flaredb_error)?; + + let kvs = entries + .into_iter() + .filter(|(_, val, _)| !val.is_empty()) + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect(); + + Ok((kvs, next.map(Bytes::from))) + } + + async fn scan_range_paged( + &self, + start: &[u8], + end: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + let mut s = start.to_vec(); + if let Some(after) = start_after { + s = after.to_vec(); + s.push(0); + } + + let mut client = self.client.lock().await; + let (entries, next) = client + .cas_scan(s, end.to_vec(), limit) + .await + .map_err(map_flaredb_error)?; + + let kvs = entries + .into_iter() + .filter(|(_, val, _)| !val.is_empty()) + .map(|(k, v, ver)| KvPair { + key: Bytes::from(k), + value: Bytes::from(v), + version: ver, + }) + .collect(); + + Ok((kvs, next.map(Bytes::from))) + } +} + +// ============================================================================ +// In-Memory Backend Implementation (for testing) +// ============================================================================ + +use std::collections::BTreeMap; +use std::sync::RwLock; + +/// In-memory backend for testing +pub struct 
MemoryBackend { + data: RwLock, (Vec, u64)>>, + version_counter: RwLock, +} + +impl MemoryBackend { + /// Create a new in-memory backend + pub fn new() -> Self { + Self { + data: RwLock::new(BTreeMap::new()), + version_counter: RwLock::new(0), + } + } + + fn next_version(&self) -> u64 { + let mut counter = self.version_counter.write().unwrap(); + *counter += 1; + *counter + } +} + +impl Default for MemoryBackend { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl StorageBackend for MemoryBackend { + async fn get(&self, key: &[u8]) -> Result> { + let data = self.data.read().unwrap(); + Ok(data + .get(key) + .map(|(v, ver)| (Bytes::copy_from_slice(v), *ver))) + } + + async fn put(&self, key: &[u8], value: &[u8]) -> Result { + let version = self.next_version(); + let mut data = self.data.write().unwrap(); + data.insert(key.to_vec(), (value.to_vec(), version)); + Ok(version) + } + + async fn cas(&self, key: &[u8], expected_version: u64, value: &[u8]) -> Result { + let mut data = self.data.write().unwrap(); + + match data.get(key) { + Some((_, current_version)) => { + if *current_version != expected_version { + return Ok(CasResult::Conflict { + expected: expected_version, + actual: *current_version, + }); + } + } + None => { + if expected_version != 0 { + return Ok(CasResult::NotFound); + } + } + } + + let version = self.next_version(); + data.insert(key.to_vec(), (value.to_vec(), version)); + Ok(CasResult::Success(version)) + } + + async fn delete(&self, key: &[u8]) -> Result { + let mut data = self.data.write().unwrap(); + Ok(data.remove(key).is_some()) + } + + async fn scan_prefix(&self, prefix: &[u8], limit: u32) -> Result> { + let data = self.data.read().unwrap(); + let mut results = Vec::new(); + + for (k, (v, ver)) in data.range(prefix.to_vec()..) 
{ + if !k.starts_with(prefix) { + break; + } + results.push(KvPair { + key: Bytes::copy_from_slice(k), + value: Bytes::copy_from_slice(v), + version: *ver, + }); + if results.len() >= limit as usize { + break; + } + } + + Ok(results) + } + + async fn scan_range(&self, start: &[u8], end: &[u8], limit: u32) -> Result> { + let data = self.data.read().unwrap(); + let mut results = Vec::new(); + + for (k, (v, ver)) in data.range(start.to_vec()..end.to_vec()) { + results.push(KvPair { + key: Bytes::copy_from_slice(k), + value: Bytes::copy_from_slice(v), + version: *ver, + }); + if results.len() >= limit as usize { + break; + } + } + + Ok(results) + } + + async fn scan_prefix_paged( + &self, + prefix: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + let start_key = match start_after { + Some(after) => { + let mut k = after.to_vec(); + k.push(0); + k + } + None => prefix.to_vec(), + }; + let end_key = prefix_end(prefix); + let items = self.scan_range(&start_key, &end_key, limit).await?; + let next = if items.len() as u32 == limit { + items.last().map(|kv| { + let mut n = kv.key.to_vec(); + n.push(0); + Bytes::from(n) + }) + } else { + None + }; + Ok((items, next)) + } + + async fn scan_range_paged( + &self, + start: &[u8], + end: &[u8], + start_after: Option<&[u8]>, + limit: u32, + ) -> Result<(Vec, Option)> { + let start_key = match start_after { + Some(after) => { + let mut k = after.to_vec(); + k.push(0); + k + } + None => start.to_vec(), + }; + let items = self.scan_range(&start_key, end, limit).await?; + let next = if items.len() as u32 == limit { + items.last().map(|kv| { + let mut n = kv.key.to_vec(); + n.push(0); + Bytes::from(n) + }) + } else { + None + }; + Ok((items, next)) + } +} + +// ============================================================================ +// Helper functions for stores +// ============================================================================ + +/// Helper trait for JSON serialization in stores 
+pub trait JsonStore: Sync { + fn backend(&self) -> &Backend; + + /// Get and deserialize a value + fn get_json<'a, T: DeserializeOwned + Send + 'a>( + &'a self, + key: &'a [u8], + ) -> impl std::future::Future>> + Send + 'a { + async move { + match self.backend().get(key).await? { + Some((bytes, version)) => { + let value: T = serde_json::from_slice(&bytes) + .map_err(|e| Error::Serialization(e.to_string()))?; + Ok(Some((value, version))) + } + None => Ok(None), + } + } + } + + /// Serialize and put a value + fn put_json<'a, T: Serialize + Send + Sync + 'a>( + &'a self, + key: &'a [u8], + value: &'a T, + ) -> impl std::future::Future> + Send + 'a { + async move { + let bytes = + serde_json::to_vec(value).map_err(|e| Error::Serialization(e.to_string()))?; + self.backend().put(key, &bytes).await + } + } + + /// Serialize and CAS a value + fn cas_json<'a, T: Serialize + Send + Sync + 'a>( + &'a self, + key: &'a [u8], + expected_version: u64, + value: &'a T, + ) -> impl std::future::Future> + Send + 'a { + async move { + let bytes = + serde_json::to_vec(value).map_err(|e| Error::Serialization(e.to_string()))?; + self.backend().cas(key, expected_version, &bytes).await + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_memory_backend_basic() { + let backend = MemoryBackend::new(); + + // Test put and get + let version = backend.put(b"key1", b"value1").await.unwrap(); + assert!(version > 0); + + let result = backend.get(b"key1").await.unwrap(); + assert!(result.is_some()); + let (value, ver) = result.unwrap(); + assert_eq!(&value[..], b"value1"); + assert_eq!(ver, version); + + // Test get non-existent + let result = backend.get(b"nonexistent").await.unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn test_memory_backend_cas() { + let backend = MemoryBackend::new(); + + // CAS create (expected version 0) + let result = backend.cas(b"key1", 0, b"value1").await.unwrap(); + let version = match result { + 
CasResult::Success(v) => v, + _ => panic!("expected success"), + }; + + // CAS update with correct version + let result = backend.cas(b"key1", version, b"value2").await.unwrap(); + assert!(matches!(result, CasResult::Success(_))); + + // CAS update with wrong version + let result = backend.cas(b"key1", version, b"value3").await.unwrap(); + assert!(matches!(result, CasResult::Conflict { .. })); + + // CAS create when key exists + let result = backend.cas(b"key1", 0, b"value4").await.unwrap(); + assert!(matches!(result, CasResult::Conflict { .. })); + } + + #[tokio::test] + async fn test_memory_backend_scan_prefix() { + let backend = MemoryBackend::new(); + + backend.put(b"prefix/a", b"1").await.unwrap(); + backend.put(b"prefix/b", b"2").await.unwrap(); + backend.put(b"prefix/c", b"3").await.unwrap(); + backend.put(b"other/x", b"4").await.unwrap(); + + let results = backend.scan_prefix(b"prefix/", 10).await.unwrap(); + assert_eq!(results.len(), 3); + + // Test limit + let results = backend.scan_prefix(b"prefix/", 2).await.unwrap(); + assert_eq!(results.len(), 2); + } + + #[tokio::test] + async fn test_memory_backend_scan_prefix_paged() { + let backend = MemoryBackend::new(); + for i in 0..5u8 { + let key = format!("prefix/{}", i); + backend.put(key.as_bytes(), &[i]).await.unwrap(); + } + + let (page1, cursor1) = backend + .scan_prefix_paged(b"prefix/", None, 2) + .await + .unwrap(); + assert_eq!(page1.len(), 2); + assert!(cursor1.is_some()); + + let (page2, cursor2) = backend + .scan_prefix_paged(b"prefix/", cursor1.as_deref(), 2) + .await + .unwrap(); + assert_eq!(page2.len(), 2); + assert!(cursor2.is_some()); + + let (page3, cursor3) = backend + .scan_prefix_paged(b"prefix/", cursor2.as_deref(), 2) + .await + .unwrap(); + assert_eq!(page3.len(), 1); + assert!(cursor3.is_none()); + + let collected: Vec = page1 + .iter() + .chain(page2.iter()) + .chain(page3.iter()) + .map(|kv| kv.value[0]) + .collect(); + assert_eq!(collected.len(), 5); + 
assert!(collected.contains(&0) && collected.contains(&4));
+    }
+
+    #[tokio::test]
+    async fn test_memory_backend_delete() {
+        let backend = MemoryBackend::new();
+
+        backend.put(b"key1", b"value1").await.unwrap();
+        assert!(backend.get(b"key1").await.unwrap().is_some());
+
+        // First delete removes the key and reports true.
+        let deleted = backend.delete(b"key1").await.unwrap();
+        assert!(deleted);
+        assert!(backend.get(b"key1").await.unwrap().is_none());
+
+        // Deleting an absent key reports false.
+        let deleted = backend.delete(b"key1").await.unwrap();
+        assert!(!deleted);
+    }
+}
diff --git a/iam/crates/iam-store/src/binding_store.rs b/iam/crates/iam-store/src/binding_store.rs
new file mode 100644
index 0000000..9194c2d
--- /dev/null
+++ b/iam/crates/iam-store/src/binding_store.rs
@@ -0,0 +1,458 @@
+//! Policy binding storage
+//!
+//! Stores and retrieves policy bindings that assign roles to principals.
+
+use std::sync::Arc;
+
+use iam_types::{Error, IamError, PolicyBinding, PrincipalRef, Result, Scope};
+
+use crate::backend::{Backend, CasResult, JsonStore, StorageBackend};
+
+/// Key prefixes for binding storage
+mod keys {
+    /// Primary key: by scope and principal
+    /// Format: iam/bindings/scope/{scope}/principal/{principal}/{id}
+    pub const BINDINGS_BY_SCOPE: &str = "iam/bindings/scope/";
+
+    /// Secondary index: by principal
+    /// Format: iam/bindings/by-principal/{principal}/{id}
+    pub const BINDINGS_BY_PRINCIPAL: &str = "iam/bindings/by-principal/";
+
+    /// Secondary index: by role
+    /// Format: iam/bindings/by-role/{role}/{id}
+    pub const BINDINGS_BY_ROLE: &str = "iam/bindings/by-role/";
+}
+
+/// Store for policy bindings
+pub struct BindingStore {
+    backend: Arc<Backend>,
+}
+
+impl JsonStore for BindingStore {
+    fn backend(&self) -> &Backend {
+        &self.backend
+    }
+}
+
+impl BindingStore {
+    /// Create a new binding store
+    pub fn new(backend: Arc<Backend>) -> Self {
+        Self { backend }
+    }
+
+    /// Create a new binding
+    pub async fn create(&self, binding: &PolicyBinding) -> Result<u64> {
+        let key = self.make_primary_key(binding);
+
+        // Use CAS with version 0 to ensure uniqueness
+        let bytes = serde_json::to_vec(binding).map_err(|e| Error::Serialization(e.to_string()))?;
+
+        match self.backend.cas(key.as_bytes(), 0, &bytes).await? {
+            CasResult::Success(version) => {
+                // Create secondary indexes; on failure, roll back primary to avoid dangling data
+                if let Err(e) = self.create_indexes(binding).await {
+                    let _ = self.backend.delete(key.as_bytes()).await;
+                    return Err(e);
+                }
+                Ok(version)
+            }
+            CasResult::Conflict { .. } => Err(Error::Iam(IamError::BindingAlreadyExists(
+                binding.id.clone(),
+            ))),
+            CasResult::NotFound => {
+                // This shouldn't happen with version 0
+                Err(Error::Internal("Unexpected CAS result".into()))
+            }
+        }
+    }
+
+    /// Get a binding by ID
+    pub async fn get(
+        &self,
+        scope: &Scope,
+        principal: &PrincipalRef,
+        id: &str,
+    ) -> Result<Option<PolicyBinding>> {
+        // NOTE(review): key layout assumes Scope::to_key() provides any
+        // separator needed before "principal" — confirm its trailing-slash
+        // convention matches make_primary_key.
+        let key = format!(
+            "{}{}principal/{}/{}",
+            keys::BINDINGS_BY_SCOPE,
+            scope.to_key(),
+            principal,
+            id
+        );
+
+        match self.get_json::<PolicyBinding>(key.as_bytes()).await? {
+            Some((binding, _)) => Ok(Some(binding)),
+            None => Ok(None),
+        }
+    }
+
+    /// Get a binding by ID only (searches across all scopes)
+    pub async fn get_by_id(&self, id: &str) -> Result<Option<PolicyBinding>> {
+        // Search across all scopes - this is expensive, prefer get() when possible.
+        // NOTE(review): the scan is capped at 10000 entries, so lookups can
+        // silently miss bindings in very large deployments.
+        let prefix = keys::BINDINGS_BY_SCOPE;
+        // Hoisted out of the loop: the previous version rebuilt this string
+        // on every iteration.
+        let suffix = format!("/{}", id);
+        let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?;
+
+        for pair in pairs {
+            let key_str = String::from_utf8_lossy(&pair.key);
+            if key_str.ends_with(&suffix) {
+                let binding: PolicyBinding = serde_json::from_slice(&pair.value)
+                    .map_err(|e| Error::Serialization(e.to_string()))?;
+                return Ok(Some(binding));
+            }
+        }
+
+        Ok(None)
+    }
+
+    /// Get a binding with its storage version
+    pub async fn get_by_id_with_version(&self, id: &str) -> Result<Option<(PolicyBinding, u64)>> {
+        let prefix = keys::BINDINGS_BY_SCOPE;
+        let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?;
+
+        for pair in pairs {
+            let key_str = String::from_utf8_lossy(&pair.key);
+            if 
key_str.ends_with(&format!("/{}", id)) {
+                let binding: PolicyBinding = serde_json::from_slice(&pair.value)
+                    .map_err(|e| Error::Serialization(e.to_string()))?;
+                return Ok(Some((binding, pair.version)));
+            }
+        }
+
+        Ok(None)
+    }
+
+    /// Update a binding.
+    ///
+    /// The primary key is derived from the binding's scope and principal, so
+    /// those fields act as identifiers and must not change across updates;
+    /// only the remaining fields (role, enabled, ...) are mutable.
+    pub async fn update(&self, binding: &PolicyBinding, expected_version: u64) -> Result<u64> {
+        let key = self.make_primary_key(binding);
+        // Load the stored record first so a stale role-index entry can be
+        // removed if the role reference changes.
+        let previous = self
+            .get(&binding.scope, &binding.principal_ref, &binding.id)
+            .await?;
+        let bytes = serde_json::to_vec(binding).map_err(|e| Error::Serialization(e.to_string()))?;
+
+        match self
+            .backend
+            .cas(key.as_bytes(), expected_version, &bytes)
+            .await?
+        {
+            CasResult::Success(version) => {
+                // Reconcile secondary indexes: drop the outdated role entry,
+                // then (re)write the current ones. The previous implementation
+                // never touched the indexes on update, leaving a stale
+                // by-role entry behind whenever the role changed.
+                if let Some(prev) = previous {
+                    if prev.role_ref.to_string() != binding.role_ref.to_string() {
+                        let old_role_key =
+                            format!("{}{}/{}", keys::BINDINGS_BY_ROLE, prev.role_ref, prev.id);
+                        self.backend.delete(old_role_key.as_bytes()).await?;
+                    }
+                }
+                self.create_indexes(binding).await?;
+                Ok(version)
+            }
+            CasResult::Conflict { expected, actual } => {
+                Err(Error::Storage(iam_types::StorageError::CasConflict {
+                    expected,
+                    actual,
+                }))
+            }
+            CasResult::NotFound => Err(Error::Iam(IamError::BindingNotFound(binding.id.clone()))),
+        }
+    }
+
+    /// Delete a binding
+    pub async fn delete(&self, scope: &Scope, principal: &PrincipalRef, id: &str) -> Result<bool> {
+        // First get the binding to know what indexes to delete
+        if let Some(binding) = self.get(scope, principal, id).await? {
+            let key = self.make_primary_key(&binding);
+
+            // Delete primary key
+            let deleted = self.backend.delete(key.as_bytes()).await?;
+
+            if deleted {
+                // Delete secondary indexes. If this fails after the primary
+                // was removed, the error propagates and rebuild_indexes can
+                // repair the leftovers later.
+                self.delete_indexes(&binding).await?;
+            }
+
+            Ok(deleted)
+        } else {
+            Ok(false)
+        }
+    }
+
+    /// List bindings for a principal
+    pub async fn list_by_principal(&self, principal: &PrincipalRef) -> Result<Vec<PolicyBinding>> {
+        let prefix = format!("{}{}/", keys::BINDINGS_BY_PRINCIPAL, principal);
+        let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?;
+
+        // The index stores binding IDs. Resolve them against one scan of the
+        // primary records instead of calling get_by_id (itself a full scan)
+        // per ID, which made this accidentally O(n^2) in the binding count.
+        let ids: std::collections::HashSet<String> = pairs
+            .iter()
+            .map(|pair| String::from_utf8_lossy(&pair.value).into_owned())
+            .collect();
+        if ids.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let wanted = principal.to_string();
+        Ok(self
+            .list_all()
+            .await?
+            .into_iter()
+            .filter(|b| ids.contains(&b.id) && b.principal_ref.to_string() == wanted)
+            .collect())
+    }
+
+    /// List bindings for a scope
+    pub async fn list_by_scope(&self, scope: &Scope) -> Result<Vec<PolicyBinding>> {
+        // Build the prefix exactly as make_primary_key does, so the scan
+        // matches the stored keys regardless of Scope::to_key()'s separator
+        // convention. The previous prefix ("{to_key}/") never matched the
+        // primary key layout ("{to_key}principal/...") and returned nothing.
+        let prefix = format!("{}{}principal/", keys::BINDINGS_BY_SCOPE, scope.to_key());
+        let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?;
+
+        let mut bindings = Vec::new();
+        for pair in pairs {
+            let binding: PolicyBinding = serde_json::from_slice(&pair.value)
+                .map_err(|e| Error::Serialization(e.to_string()))?;
+            bindings.push(binding);
+        }
+
+        Ok(bindings)
+    }
+
+    /// List all bindings across scopes (used for admin listing)
+    pub async fn list_all(&self) -> Result<Vec<PolicyBinding>> {
+        let pairs = self
+            .backend
+            .scan_prefix(keys::BINDINGS_BY_SCOPE.as_bytes(), 10000)
+            .await?;
+
+        let mut bindings = Vec::new();
+        for pair in pairs {
+            let binding: PolicyBinding = serde_json::from_slice(&pair.value)
+                .map_err(|e| Error::Serialization(e.to_string()))?;
+            bindings.push(binding);
+        }
+
+        Ok(bindings)
+    }
+
+    /// Rebuild secondary indexes for all bindings (best-effort maintenance tool)
+    pub async fn rebuild_indexes(&self) -> Result<usize> {
+        let bindings = self.list_all().await?;
+        let mut repaired = 0;
+        for binding in bindings {
+            self.create_indexes(&binding).await?;
+            repaired += 1;
+        }
+        Ok(repaired)
+    }
+
+    /// List bindings for a role
+    pub async fn list_by_role(&self, role_ref: &str) -> Result<Vec<PolicyBinding>> {
+        let prefix = format!("{}{}/", keys::BINDINGS_BY_ROLE, role_ref);
+        let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?;
+
+        // As in list_by_principal: resolve index entries with a single scan
+        // of the primary records rather than one full scan per ID.
+        let ids: std::collections::HashSet<String> = pairs
+            .iter()
+            .map(|pair| String::from_utf8_lossy(&pair.value).into_owned())
+            .collect();
+        if ids.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        Ok(self
+            .list_all()
+            .await?
+            .into_iter()
+            .filter(|b| ids.contains(&b.id) && b.role_ref.to_string() == role_ref)
+            .collect())
+    }
+
+    /// Get all bindings that apply to a principal in a given scope
+    /// This considers scope hierarchy (e.g., System scope contains all)
+    pub async fn get_effective_bindings(
+        &self,
+        principal: &PrincipalRef,
+        target_scope: &Scope,
+    ) -> Result<Vec<PolicyBinding>> {
+        let all_bindings = self.list_by_principal(principal).await?;
+
+        // Filter to enabled bindings whose scope contains the target scope
+        let effective: Vec<_> = all_bindings
+            .into_iter()
+            .filter(|b| b.scope.contains(target_scope) && b.enabled)
+            .collect();
+
+        Ok(effective)
+    }
+
+    // Helper methods
+
+    /// Primary key: {BINDINGS_BY_SCOPE}{scope_key}principal/{principal}/{id}
+    /// NOTE(review): assumes Scope::to_key() supplies any separator needed
+    /// before "principal" — confirm its trailing-slash convention.
+    fn make_primary_key(&self, binding: &PolicyBinding) -> String {
+        format!(
+            "{}{}principal/{}/{}",
+            keys::BINDINGS_BY_SCOPE,
+            binding.scope.to_key(),
+            binding.principal_ref,
+            binding.id
+        )
+    }
+
+    /// Write the by-principal and by-role index entries for `binding`.
+    /// Idempotent: plain puts overwrite any existing entries.
+    async fn create_indexes(&self, binding: &PolicyBinding) -> Result<()> {
+        // Principal index
+        let principal_key = format!(
+            "{}{}/{}",
+            keys::BINDINGS_BY_PRINCIPAL,
+            binding.principal_ref,
+            binding.id
+        );
+        self.backend
+            .put(principal_key.as_bytes(), binding.id.as_bytes())
+            .await?;
+
+        // Role index
+        let role_key = format!(
+            "{}{}/{}",
+            keys::BINDINGS_BY_ROLE,
+            binding.role_ref,
+            binding.id
+        );
+        self.backend
+            .put(role_key.as_bytes(), binding.id.as_bytes())
+            .await?;
+
+        Ok(())
+    }
+
+    /// Remove the by-principal and by-role index entries for `binding`.
+    async fn delete_indexes(&self, binding: &PolicyBinding) -> Result<()> {
+        // Principal index
+        let principal_key = format!(
+            "{}{}/{}",
+            keys::BINDINGS_BY_PRINCIPAL,
+            binding.principal_ref,
+            binding.id
+        );
+        self.backend.delete(principal_key.as_bytes()).await?;
+
+        // Role index
+        let role_key = format!(
+            "{}{}/{}",
+            keys::BINDINGS_BY_ROLE,
+            binding.role_ref,
+            binding.id
+        );
+        self.backend.delete(role_key.as_bytes()).await?;
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use iam_types::PrincipalKind;
+
+    fn test_backend() -> Arc<Backend> {
+        Arc::new(Backend::memory())
+    }
+
+    #[tokio::test]
+    async fn test_binding_crud() {
+        let store = BindingStore::new(test_backend());
+
+        let binding = PolicyBinding::new(
+            "binding-1",
+            PrincipalRef::user("alice"),
+            "roles/ProjectAdmin",
+            Scope::project("proj-1", "org-1"),
+        );
+
+        // Create
+        let version = store.create(&binding).await.unwrap();
+        assert!(version > 0);
+
+        // Get
+        let fetched = store
+            .get(
+                &Scope::project("proj-1", "org-1"),
+                &PrincipalRef::user("alice"),
+                "binding-1",
+            )
+            .await
+            .unwrap();
+        assert!(fetched.is_some());
+        assert_eq!(fetched.unwrap().id, "binding-1");
+
+        // Delete
+        let deleted = store
+            .delete(
+                &Scope::project("proj-1", "org-1"),
+                &PrincipalRef::user("alice"),
+                "binding-1",
+            )
+            .await
+            .unwrap();
+        assert!(deleted);
+
+        // Verify deleted
+        let fetched = store
+            .get(
+                &Scope::project("proj-1", "org-1"),
+                &PrincipalRef::user("alice"),
+                "binding-1",
+            )
+            .await
+            .unwrap();
+        assert!(fetched.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_list_by_principal() {
+        let store = BindingStore::new(test_backend());
+
+        let alice = PrincipalRef::user("alice");
+
+        store
+            .create(&PolicyBinding::new(
+                "b1",
+                alice.clone(),
+                "roles/Admin",
+                Scope::System,
+            ))
+            .await
+            .unwrap();
+
+        store
+            .create(&PolicyBinding::new(
+                "b2",
+                alice.clone(),
+                "roles/ReadOnly",
+                Scope::project("p1", "org-1"),
+            ))
+            .await
+            .unwrap();
+
+        store
+            .create(&PolicyBinding::new(
+                "b3",
+                PrincipalRef::user("bob"),
+                "roles/Admin",
+                Scope::System,
+            ))
+            .await
+            .unwrap();
+
+        let bindings = store.list_by_principal(&alice).await.unwrap();
+        assert_eq!(bindings.len(), 2);
+    }
+
+    #[tokio::test]
+    async fn test_effective_bindings() {
+        let store = BindingStore::new(test_backend());
+
+        let alice = PrincipalRef::user("alice");
+
+        // System-wide binding
+        store
+            .create(&PolicyBinding::new(
+                "b1",
+                alice.clone(),
+                "roles/SystemAdmin",
+                Scope::System,
+            ))
+            .await
+            .unwrap();
+
+        // Project-specific binding
+        store
+            .create(&PolicyBinding::new(
+                "b2",
+                alice.clone(),
+                "roles/ProjectAdmin",
+                
Scope::project("proj-1", "org-1"),
+            ))
+            .await
+            .unwrap();
+
+        // Check effective bindings for project scope
+        let effective = store
+            .get_effective_bindings(&alice, &Scope::project("proj-1", "org-1"))
+            .await
+            .unwrap();
+
+        // Should include both system (contains project) and project-specific
+        assert_eq!(effective.len(), 2);
+    }
+}
diff --git a/iam/crates/iam-store/src/group_store.rs b/iam/crates/iam-store/src/group_store.rs
new file mode 100644
index 0000000..9e5ea2b
--- /dev/null
+++ b/iam/crates/iam-store/src/group_store.rs
@@ -0,0 +1,221 @@
+//! Group membership storage
+//!
+//! Tracks which principals are members of which groups.
+
+use std::sync::Arc;
+
+use iam_types::{Error, PrincipalKind, PrincipalRef, Result};
+
+use crate::backend::{Backend, CasResult, JsonStore, KvPair, StorageBackend};
+
+/// Key prefixes for group membership storage
+mod keys {
+    /// Members of a group
+    /// Format: iam/groups/{group_id}/members/{kind}/{member_id}
+    pub const GROUP_MEMBERS: &str = "iam/groups/";
+
+    /// Groups a principal belongs to (reverse index)
+    /// Format: iam/memberships/{kind}/{principal_id}/{group_id}
+    pub const PRINCIPAL_GROUPS: &str = "iam/memberships/";
+}
+
+/// Store for group memberships
+pub struct GroupStore {
+    backend: Arc<Backend>,
+}
+
+impl JsonStore for GroupStore {
+    fn backend(&self) -> &Backend {
+        &self.backend
+    }
+}
+
+impl GroupStore {
+    /// Create a new group store
+    pub fn new(backend: Arc<Backend>) -> Self {
+        Self { backend }
+    }
+
+    /// Add a member to a group (idempotent).
+    pub async fn add_member(&self, group_id: &str, member: &PrincipalRef) -> Result<()> {
+        // Forward index (group -> member)
+        let forward_key = format!(
+            "{}{}/members/{}/{}",
+            keys::GROUP_MEMBERS,
+            group_id,
+            member.kind,
+            member.id
+        );
+        // Reverse index (member -> group)
+        let reverse_key = format!(
+            "{}{}/{}/{}",
+            keys::PRINCIPAL_GROUPS,
+            member.kind,
+            member.id,
+            group_id
+        );
+
+        // Membership is marked by key presence; the value is irrelevant.
+        let marker = b"1";
+
+        // CAS with expected version 0 creates the entry if absent; a conflict
+        // simply means the membership already exists.
+        match self.backend.cas(forward_key.as_bytes(), 0, marker).await? {
+            CasResult::Success(_) | CasResult::Conflict { .. } => {}
+            CasResult::NotFound => return Err(Error::Internal("Unexpected CAS result".into())),
+        }
+
+        // Always make sure the reverse index exists too, and propagate
+        // failures. The previous implementation returned early on a forward
+        // conflict and swallowed reverse-index errors, which could leave the
+        // two indexes permanently out of sync.
+        match self.backend.cas(reverse_key.as_bytes(), 0, marker).await? {
+            CasResult::Success(_) | CasResult::Conflict { .. } => Ok(()),
+            CasResult::NotFound => Err(Error::Internal("Unexpected CAS result".into())),
+        }
+    }
+
+    /// Remove a member from a group.
+    ///
+    /// Returns `true` if the membership existed.
+    pub async fn remove_member(&self, group_id: &str, member: &PrincipalRef) -> Result<bool> {
+        let forward_key = format!(
+            "{}{}/members/{}/{}",
+            keys::GROUP_MEMBERS,
+            group_id,
+            member.kind,
+            member.id
+        );
+        let reverse_key = format!(
+            "{}{}/{}/{}",
+            keys::PRINCIPAL_GROUPS,
+            member.kind,
+            member.id,
+            group_id
+        );
+
+        // Delete the forward index first, then the reverse index. Reverse
+        // failures now propagate (previously swallowed with .ok()) so callers
+        // can retry instead of accumulating stale reverse entries.
+        let deleted = self.backend.delete(forward_key.as_bytes()).await?;
+        self.backend.delete(reverse_key.as_bytes()).await?;
+
+        Ok(deleted)
+    }
+
+    /// List all members of a group
+    pub async fn list_members(&self, group_id: &str) -> Result<Vec<PrincipalRef>> {
+        let prefix = format!("{}{}/members/", keys::GROUP_MEMBERS, group_id);
+        let pairs = self.backend.scan_prefix(prefix.as_bytes(), 1000).await?;
+
+        let mut members = Vec::new();
+        for KvPair { key, .. } in pairs {
+            // Key format: iam/groups/{group_id}/members/{kind}/{member_id}
+            // NOTE(review): split-based parsing assumes group and member IDs
+            // never contain '/'; confirm IDs are validated upstream.
+            let key_str = String::from_utf8_lossy(&key);
+            let parts: Vec<&str> = key_str.split('/').collect();
+            if parts.len() >= 6 {
+                let kind_str = parts[4];
+                let member_id = parts[5];
+
+                let kind = match kind_str {
+                    "user" => PrincipalKind::User,
+                    "service_account" => PrincipalKind::ServiceAccount,
+                    "group" => PrincipalKind::Group,
+                    // Unknown kinds are skipped rather than failing the scan.
+                    _ => continue,
+                };
+
+                members.push(PrincipalRef::new(kind, member_id));
+            }
+        }
+
+        Ok(members)
+    }
+
+    /// List all groups a principal belongs to
+    pub async fn list_groups(&self, principal: &PrincipalRef) -> Result<Vec<String>> {
+        let prefix = format!(
+            "{}{}/{}/",
+            keys::PRINCIPAL_GROUPS,
+            principal.kind,
+            principal.id
+        );
+        let pairs = self.backend.scan_prefix(prefix.as_bytes(), 1000).await?;
+
+        let mut groups = Vec::new();
+        for KvPair { key, .. } in pairs {
+            // Key format: iam/memberships/{kind}/{principal_id}/{group_id}
+            let key_str = String::from_utf8_lossy(&key);
+            let parts: Vec<&str> = key_str.split('/').collect();
+            if parts.len() >= 5 {
+                groups.push(parts[4].to_string());
+            }
+        }
+
+        Ok(groups)
+    }
+
+    /// Check if a principal is a member of a group
+    pub async fn is_member(&self, group_id: &str, member: &PrincipalRef) -> Result<bool> {
+        let key = format!(
+            "{}{}/members/{}/{}",
+            keys::GROUP_MEMBERS,
+            group_id,
+            member.kind,
+            member.id
+        );
+        let result = self.backend.get(key.as_bytes()).await?;
+        Ok(result.is_some())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::Backend;
+
+    #[tokio::test]
+    async fn test_add_and_list_members() {
+        let backend = Arc::new(Backend::memory());
+        let store = GroupStore::new(backend);
+
+        let alice = PrincipalRef::user("alice");
+        let bob = PrincipalRef::user("bob");
+
+        store.add_member("devs", &alice).await.unwrap();
+        store.add_member("devs", &bob).await.unwrap();
+
+        let members = store.list_members("devs").await.unwrap();
+        assert_eq!(members.len(), 2);
+    }
+
+    #[tokio::test]
+    async fn test_remove_member() {
+        let backend = Arc::new(Backend::memory());
+        let store = GroupStore::new(backend);
+
+        let alice = PrincipalRef::user("alice");
+
+        store.add_member("devs", &alice).await.unwrap();
+        assert!(store.is_member("devs", &alice).await.unwrap());
+
+        store.remove_member("devs", &alice).await.unwrap();
+        assert!(!store.is_member("devs", &alice).await.unwrap());
+    }
+
+    #[tokio::test]
+    async fn test_list_groups_for_principal() {
+        let backend = Arc::new(Backend::memory());
+        let store = GroupStore::new(backend);
+
+        let alice = PrincipalRef::user("alice");
+
+        store.add_member("devs", &alice).await.unwrap();
+        store.add_member("admins", &alice).await.unwrap();
+
+        let groups = store.list_groups(&alice).await.unwrap();
+        assert_eq!(groups.len(), 2);
+        assert!(groups.contains(&"devs".to_string()));
+        assert!(groups.contains(&"admins".to_string()));
+    }
+}
diff --git a/iam/crates/iam-store/src/lib.rs b/iam/crates/iam-store/src/lib.rs
new file mode 100644
index 0000000..9e0b969
--- /dev/null
+++ b/iam/crates/iam-store/src/lib.rs
@@ -0,0 +1,20 @@
+#![recursion_limit = "1024"]
+
+//! Storage abstraction layer for IAM
+//!
+//! Provides a unified interface for storing IAM data in either
+//! Chainfire or FlareDB backends.
+
+pub mod backend;
+pub mod binding_store;
+pub mod group_store;
+pub mod principal_store;
+pub mod role_store;
+pub mod token_store;
+
+pub use backend::{Backend, BackendConfig, CasResult, KvPair, StorageBackend};
+pub use binding_store::BindingStore;
+pub use group_store::GroupStore;
+pub use principal_store::PrincipalStore;
+pub use role_store::RoleStore;
+pub use token_store::TokenStore;
diff --git a/iam/crates/iam-store/src/principal_store.rs b/iam/crates/iam-store/src/principal_store.rs
new file mode 100644
index 0000000..a2fe8c1
--- /dev/null
+++ b/iam/crates/iam-store/src/principal_store.rs
@@ -0,0 +1,403 @@
+//! Principal storage
+//!
+//! Stores and retrieves principals (users, service accounts, groups).
+ +use std::sync::Arc; + +use iam_types::{Error, IamError, Principal, PrincipalKind, PrincipalRef, Result}; + +use crate::backend::{Backend, CasResult, JsonStore, StorageBackend}; + +/// Key prefixes for principal storage +mod keys { + /// Primary key: by kind and ID + /// Format: iam/principals/{kind}/{id} + pub const PRINCIPALS: &str = "iam/principals/"; + + /// Secondary index: by org + /// Format: iam/principals/by-org/{org_id}/{kind}/{id} + pub const BY_ORG: &str = "iam/principals/by-org/"; + + /// Secondary index: by project (for service accounts) + /// Format: iam/principals/by-project/{project_id}/{id} + pub const BY_PROJECT: &str = "iam/principals/by-project/"; + + /// Secondary index: by email + /// Format: iam/principals/by-email/{email} + pub const BY_EMAIL: &str = "iam/principals/by-email/"; + + /// Secondary index: by OIDC subject + /// Format: iam/principals/by-oidc/{iss_hash}/{sub} + pub const BY_OIDC: &str = "iam/principals/by-oidc/"; +} + +/// Store for principals +pub struct PrincipalStore { + backend: Arc, +} + +impl JsonStore for PrincipalStore { + fn backend(&self) -> &Backend { + &self.backend + } +} + +impl PrincipalStore { + /// Create a new principal store + pub fn new(backend: Arc) -> Self { + Self { backend } + } + + /// Create a new principal + pub async fn create(&self, principal: &Principal) -> Result { + let key = self.make_primary_key(&principal.kind, &principal.id); + + let bytes = + serde_json::to_vec(principal).map_err(|e| Error::Serialization(e.to_string()))?; + + match self.backend.cas(key.as_bytes(), 0, &bytes).await? { + CasResult::Success(version) => { + // Create secondary indexes + self.create_indexes(principal).await?; + Ok(version) + } + CasResult::Conflict { .. 
} => Err(Error::Iam(IamError::PrincipalAlreadyExists( + principal.to_ref().to_string(), + ))), + CasResult::NotFound => Err(Error::Internal("Unexpected CAS result".into())), + } + } + + /// Get a principal by reference + pub async fn get(&self, principal_ref: &PrincipalRef) -> Result> { + let key = self.make_primary_key(&principal_ref.kind, &principal_ref.id); + match self.get_json::(key.as_bytes()).await? { + Some((principal, _)) => Ok(Some(principal)), + None => Ok(None), + } + } + + /// Get a principal with version + pub async fn get_with_version( + &self, + principal_ref: &PrincipalRef, + ) -> Result> { + let key = self.make_primary_key(&principal_ref.kind, &principal_ref.id); + self.get_json::(key.as_bytes()).await + } + + /// Get a principal by email + pub async fn get_by_email(&self, email: &str) -> Result> { + let index_key = format!("{}{}", keys::BY_EMAIL, email.to_lowercase()); + + if let Some((ref_bytes, _)) = self.backend.get(index_key.as_bytes()).await? { + let principal_ref: PrincipalRef = serde_json::from_slice(&ref_bytes) + .map_err(|e| Error::Serialization(e.to_string()))?; + return self.get(&principal_ref).await; + } + + Ok(None) + } + + /// Get a principal by OIDC subject + pub async fn get_by_oidc(&self, issuer: &str, subject: &str) -> Result> { + let iss_hash = self.hash_issuer(issuer); + let index_key = format!("{}{}/{}", keys::BY_OIDC, iss_hash, subject); + + if let Some((ref_bytes, _)) = self.backend.get(index_key.as_bytes()).await? 
{ + let principal_ref: PrincipalRef = serde_json::from_slice(&ref_bytes) + .map_err(|e| Error::Serialization(e.to_string()))?; + return self.get(&principal_ref).await; + } + + Ok(None) + } + + /// Update a principal + pub async fn update(&self, principal: &Principal, expected_version: u64) -> Result { + let key = self.make_primary_key(&principal.kind, &principal.id); + let bytes = + serde_json::to_vec(principal).map_err(|e| Error::Serialization(e.to_string()))?; + + match self + .backend + .cas(key.as_bytes(), expected_version, &bytes) + .await? + { + CasResult::Success(version) => { + // Update indexes if needed (email, oidc changes) + // For simplicity, recreate all indexes + self.create_indexes(principal).await?; + Ok(version) + } + CasResult::Conflict { expected, actual } => { + Err(Error::Storage(iam_types::StorageError::CasConflict { + expected, + actual, + })) + } + CasResult::NotFound => Err(Error::Iam(IamError::PrincipalNotFound( + principal.to_ref().to_string(), + ))), + } + } + + /// Delete a principal + pub async fn delete(&self, principal_ref: &PrincipalRef) -> Result { + // First get the principal to know what indexes to delete + if let Some(principal) = self.get(principal_ref).await? 
{ + let key = self.make_primary_key(&principal.kind, &principal.id); + + let deleted = self.backend.delete(key.as_bytes()).await?; + + if deleted { + self.delete_indexes(&principal).await?; + } + + Ok(deleted) + } else { + Ok(false) + } + } + + /// List principals by kind + pub async fn list_by_kind(&self, kind: &PrincipalKind) -> Result> { + let prefix = format!("{}{}/", keys::PRINCIPALS, kind); + let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?; + + let mut principals = Vec::new(); + for pair in pairs { + let principal: Principal = serde_json::from_slice(&pair.value) + .map_err(|e| Error::Serialization(e.to_string()))?; + principals.push(principal); + } + + Ok(principals) + } + + /// List principals by organization + pub async fn list_by_org(&self, org_id: &str) -> Result> { + let prefix = format!("{}{}/", keys::BY_ORG, org_id); + let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?; + + let mut principals = Vec::new(); + for pair in pairs { + let principal_ref: PrincipalRef = serde_json::from_slice(&pair.value) + .map_err(|e| Error::Serialization(e.to_string()))?; + if let Some(principal) = self.get(&principal_ref).await? { + principals.push(principal); + } + } + + Ok(principals) + } + + /// List service accounts by project + pub async fn list_by_project(&self, project_id: &str) -> Result> { + let prefix = format!("{}{}/", keys::BY_PROJECT, project_id); + let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?; + + let mut principals = Vec::new(); + for pair in pairs { + let principal_ref: PrincipalRef = serde_json::from_slice(&pair.value) + .map_err(|e| Error::Serialization(e.to_string()))?; + if let Some(principal) = self.get(&principal_ref).await? 
{ + principals.push(principal); + } + } + + Ok(principals) + } + + /// Check if a principal exists + pub async fn exists(&self, principal_ref: &PrincipalRef) -> Result { + let key = self.make_primary_key(&principal_ref.kind, &principal_ref.id); + Ok(self.backend.get(key.as_bytes()).await?.is_some()) + } + + // Helper methods + + fn make_primary_key(&self, kind: &PrincipalKind, id: &str) -> String { + format!("{}{}/{}", keys::PRINCIPALS, kind, id) + } + + fn hash_issuer(&self, issuer: &str) -> String { + // Simple hash for issuer to avoid special characters in keys + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + let mut hasher = DefaultHasher::new(); + issuer.hash(&mut hasher); + format!("{:016x}", hasher.finish()) + } + + async fn create_indexes(&self, principal: &Principal) -> Result<()> { + let ref_bytes = serde_json::to_vec(&principal.to_ref()) + .map_err(|e| Error::Serialization(e.to_string()))?; + + // Org index + if let Some(org_id) = &principal.org_id { + let key = format!( + "{}{}/{}/{}", + keys::BY_ORG, + org_id, + principal.kind, + principal.id + ); + self.backend.put(key.as_bytes(), &ref_bytes).await?; + } + + // Project index (for service accounts) + if let Some(project_id) = &principal.project_id { + let key = format!("{}{}/{}", keys::BY_PROJECT, project_id, principal.id); + self.backend.put(key.as_bytes(), &ref_bytes).await?; + } + + // Email index + if let Some(email) = &principal.email { + let key = format!("{}{}", keys::BY_EMAIL, email.to_lowercase()); + self.backend.put(key.as_bytes(), &ref_bytes).await?; + } + + // OIDC index + if let Some(oidc_sub) = &principal.oidc_sub { + // Assume we store issuer in metadata + if let Some(issuer) = principal.metadata.get("oidc_issuer") { + let iss_hash = self.hash_issuer(issuer); + let key = format!("{}{}/{}", keys::BY_OIDC, iss_hash, oidc_sub); + self.backend.put(key.as_bytes(), &ref_bytes).await?; + } + } + + Ok(()) + } + + async fn delete_indexes(&self, principal: 
&Principal) -> Result<()> { + // Org index + if let Some(org_id) = &principal.org_id { + let key = format!( + "{}{}/{}/{}", + keys::BY_ORG, + org_id, + principal.kind, + principal.id + ); + self.backend.delete(key.as_bytes()).await?; + } + + // Project index + if let Some(project_id) = &principal.project_id { + let key = format!("{}{}/{}", keys::BY_PROJECT, project_id, principal.id); + self.backend.delete(key.as_bytes()).await?; + } + + // Email index + if let Some(email) = &principal.email { + let key = format!("{}{}", keys::BY_EMAIL, email.to_lowercase()); + self.backend.delete(key.as_bytes()).await?; + } + + // OIDC index + if let Some(oidc_sub) = &principal.oidc_sub { + if let Some(issuer) = principal.metadata.get("oidc_issuer") { + let iss_hash = self.hash_issuer(issuer); + let key = format!("{}{}/{}", keys::BY_OIDC, iss_hash, oidc_sub); + self.backend.delete(key.as_bytes()).await?; + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_backend() -> Arc { + Arc::new(Backend::memory()) + } + + #[tokio::test] + async fn test_principal_crud() { + let store = PrincipalStore::new(test_backend()); + + let mut principal = Principal::new_user("alice", "Alice Smith"); + principal.email = Some("alice@example.com".into()); + principal.org_id = Some("org-1".into()); + + // Create + let version = store.create(&principal).await.unwrap(); + assert!(version > 0); + + // Get + let fetched = store.get(&PrincipalRef::user("alice")).await.unwrap(); + assert!(fetched.is_some()); + assert_eq!(fetched.unwrap().name, "Alice Smith"); + + // Get by email + let fetched = store.get_by_email("alice@example.com").await.unwrap(); + assert!(fetched.is_some()); + assert_eq!(fetched.unwrap().id, "alice"); + + // Delete + let deleted = store.delete(&PrincipalRef::user("alice")).await.unwrap(); + assert!(deleted); + + // Verify deleted + let fetched = store.get(&PrincipalRef::user("alice")).await.unwrap(); + assert!(fetched.is_none()); + } + + #[tokio::test] + async 
fn test_service_account() { + let store = PrincipalStore::new(test_backend()); + + let sa = Principal::new_service_account("compute-agent", "Compute Agent", "proj-1"); + + store.create(&sa).await.unwrap(); + + // List by project + let sas = store.list_by_project("proj-1").await.unwrap(); + assert_eq!(sas.len(), 1); + assert_eq!(sas[0].id, "compute-agent"); + } + + #[tokio::test] + async fn test_list_by_kind() { + let store = PrincipalStore::new(test_backend()); + + store + .create(&Principal::new_user("user1", "User 1")) + .await + .unwrap(); + store + .create(&Principal::new_user("user2", "User 2")) + .await + .unwrap(); + store + .create(&Principal::new_service_account("sa1", "SA 1", "proj-1")) + .await + .unwrap(); + + let users = store.list_by_kind(&PrincipalKind::User).await.unwrap(); + assert_eq!(users.len(), 2); + + let sas = store + .list_by_kind(&PrincipalKind::ServiceAccount) + .await + .unwrap(); + assert_eq!(sas.len(), 1); + } + + #[tokio::test] + async fn test_duplicate_create() { + let store = PrincipalStore::new(test_backend()); + + let principal = Principal::new_user("alice", "Alice"); + store.create(&principal).await.unwrap(); + + // Try to create again + let result = store.create(&principal).await; + assert!(result.is_err()); + } +} diff --git a/iam/crates/iam-store/src/role_store.rs b/iam/crates/iam-store/src/role_store.rs new file mode 100644 index 0000000..0fed984 --- /dev/null +++ b/iam/crates/iam-store/src/role_store.rs @@ -0,0 +1,388 @@ +//! Role storage +//! +//! Stores and retrieves roles and their permissions. 
+ +use std::sync::Arc; + +use iam_types::{builtin_roles, Error, IamError, Result, Role, Scope}; + +use crate::backend::{Backend, CasResult, JsonStore, StorageBackend}; + +/// Key prefixes for role storage +mod keys { + /// Primary key: by name + /// Format: iam/roles/{name} + pub const ROLES: &str = "iam/roles/"; + + /// Secondary index: by scope + /// Format: iam/roles/by-scope/{scope}/{name} + pub const BY_SCOPE: &str = "iam/roles/by-scope/"; + + /// Builtin roles marker + /// Format: iam/roles/builtin/{name} + pub const BUILTIN: &str = "iam/roles/builtin/"; +} + +/// Store for roles +pub struct RoleStore { + backend: Arc, +} + +impl JsonStore for RoleStore { + fn backend(&self) -> &Backend { + &self.backend + } +} + +impl RoleStore { + /// Create a new role store + pub fn new(backend: Arc) -> Self { + Self { backend } + } + + /// Initialize builtin roles + pub async fn init_builtin_roles(&self) -> Result<()> { + for role in builtin_roles::all() { + // Check if already exists + let key = format!("{}{}", keys::ROLES, role.name); + if self.backend.get(key.as_bytes()).await?.is_none() { + // Create builtin role + self.create_internal(&role).await?; + } + } + Ok(()) + } + + /// Create a new role + pub async fn create(&self, role: &Role) -> Result { + if role.builtin { + return Err(Error::Iam(IamError::CannotModifyBuiltinRole( + role.name.clone(), + ))); + } + self.create_internal(role).await + } + + /// Create a role (internal use, bypasses builtin check) + /// Used by init_builtin_roles and tests + pub async fn create_internal(&self, role: &Role) -> Result { + let key = format!("{}{}", keys::ROLES, role.name); + + let bytes = serde_json::to_vec(role).map_err(|e| Error::Serialization(e.to_string()))?; + + match self.backend.cas(key.as_bytes(), 0, &bytes).await? { + CasResult::Success(version) => { + // Create secondary indexes + self.create_indexes(role).await?; + Ok(version) + } + CasResult::Conflict { .. 
} => { + Err(Error::Iam(IamError::RoleAlreadyExists(role.name.clone()))) + } + CasResult::NotFound => Err(Error::Internal("Unexpected CAS result".into())), + } + } + + /// Get a role by name + pub async fn get(&self, name: &str) -> Result> { + let key = format!("{}{}", keys::ROLES, name); + match self.get_json::(key.as_bytes()).await? { + Some((role, _)) => Ok(Some(role)), + None => Ok(None), + } + } + + /// Get a role by reference (e.g., "roles/ProjectAdmin") + pub async fn get_by_ref(&self, role_ref: &str) -> Result> { + let name = role_ref.strip_prefix("roles/").unwrap_or(role_ref); + self.get(name).await + } + + /// Get a role with version + pub async fn get_with_version(&self, name: &str) -> Result> { + let key = format!("{}{}", keys::ROLES, name); + self.get_json::(key.as_bytes()).await + } + + /// Update a role + pub async fn update(&self, role: &Role, expected_version: u64) -> Result { + // Check if trying to modify builtin role + if let Some(existing) = self.get(&role.name).await? { + if existing.builtin { + return Err(Error::Iam(IamError::CannotModifyBuiltinRole( + role.name.clone(), + ))); + } + } + + let key = format!("{}{}", keys::ROLES, role.name); + let bytes = serde_json::to_vec(role).map_err(|e| Error::Serialization(e.to_string()))?; + + match self + .backend + .cas(key.as_bytes(), expected_version, &bytes) + .await? + { + CasResult::Success(version) => { + // Update indexes + self.create_indexes(role).await?; + Ok(version) + } + CasResult::Conflict { expected, actual } => { + Err(Error::Storage(iam_types::StorageError::CasConflict { + expected, + actual, + })) + } + CasResult::NotFound => Err(Error::Iam(IamError::RoleNotFound(role.name.clone()))), + } + } + + /// Delete a role + pub async fn delete(&self, name: &str) -> Result { + // Check if builtin + if let Some(role) = self.get(name).await? 
{ + if role.builtin { + return Err(Error::Iam(IamError::CannotModifyBuiltinRole(name.into()))); + } + + let key = format!("{}{}", keys::ROLES, name); + let deleted = self.backend.delete(key.as_bytes()).await?; + + if deleted { + self.delete_indexes(&role).await?; + } + + Ok(deleted) + } else { + Ok(false) + } + } + + /// List all roles + pub async fn list(&self) -> Result> { + let pairs = self + .backend + .scan_prefix(keys::ROLES.as_bytes(), 10000) + .await?; + + let mut roles = Vec::new(); + for pair in pairs { + // Skip index keys + let key_str = String::from_utf8_lossy(&pair.key); + if key_str.contains("/by-scope/") || key_str.contains("/builtin/") { + continue; + } + + let role: Role = serde_json::from_slice(&pair.value) + .map_err(|e| Error::Serialization(e.to_string()))?; + roles.push(role); + } + + Ok(roles) + } + + /// List roles by scope + pub async fn list_by_scope(&self, scope: &Scope) -> Result> { + let prefix = format!("{}{}/", keys::BY_SCOPE, scope.to_key()); + let pairs = self.backend.scan_prefix(prefix.as_bytes(), 10000).await?; + + let mut roles = Vec::new(); + for pair in pairs { + // Index stores role name + let name = String::from_utf8_lossy(&pair.value); + if let Some(role) = self.get(&name).await? { + roles.push(role); + } + } + + Ok(roles) + } + + /// List builtin roles + pub async fn list_builtin(&self) -> Result> { + let pairs = self + .backend + .scan_prefix(keys::BUILTIN.as_bytes(), 100) + .await?; + + let mut roles = Vec::new(); + for pair in pairs { + let name = String::from_utf8_lossy(&pair.value); + if let Some(role) = self.get(&name).await? 
{ + roles.push(role); + } + } + + Ok(roles) + } + + /// List custom (non-builtin) roles + pub async fn list_custom(&self) -> Result> { + let all_roles = self.list().await?; + Ok(all_roles.into_iter().filter(|r| !r.builtin).collect()) + } + + /// Check if a role exists + pub async fn exists(&self, name: &str) -> Result { + let key = format!("{}{}", keys::ROLES, name); + Ok(self.backend.get(key.as_bytes()).await?.is_some()) + } + + // Helper methods + + async fn create_indexes(&self, role: &Role) -> Result<()> { + // Scope index + let scope_key = format!("{}{}/{}", keys::BY_SCOPE, role.scope.to_key(), role.name); + self.backend + .put(scope_key.as_bytes(), role.name.as_bytes()) + .await?; + + // Builtin marker + if role.builtin { + let builtin_key = format!("{}{}", keys::BUILTIN, role.name); + self.backend + .put(builtin_key.as_bytes(), role.name.as_bytes()) + .await?; + } + + Ok(()) + } + + async fn delete_indexes(&self, role: &Role) -> Result<()> { + // Scope index + let scope_key = format!("{}{}/{}", keys::BY_SCOPE, role.scope.to_key(), role.name); + self.backend.delete(scope_key.as_bytes()).await?; + + // Builtin marker (shouldn't delete builtin roles, but just in case) + if role.builtin { + let builtin_key = format!("{}{}", keys::BUILTIN, role.name); + self.backend.delete(builtin_key.as_bytes()).await?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use iam_types::Permission; + + fn test_backend() -> Arc { + Arc::new(Backend::memory()) + } + + #[tokio::test] + async fn test_role_crud() { + let store = RoleStore::new(test_backend()); + + let role = Role::new( + "CustomViewer", + Scope::project("*", "*"), + vec![Permission::new("*:read", "project/${project}/*")], + ) + .with_display_name("Custom Viewer") + .with_description("Read-only access"); + + // Create + let version = store.create(&role).await.unwrap(); + assert!(version > 0); + + // Get + let fetched = store.get("CustomViewer").await.unwrap(); + assert!(fetched.is_some()); + 
assert_eq!(fetched.unwrap().display_name, "Custom Viewer"); + + // Get by ref + let fetched = store.get_by_ref("roles/CustomViewer").await.unwrap(); + assert!(fetched.is_some()); + + // Delete + let deleted = store.delete("CustomViewer").await.unwrap(); + assert!(deleted); + + // Verify deleted + let fetched = store.get("CustomViewer").await.unwrap(); + assert!(fetched.is_none()); + } + + #[tokio::test] + async fn test_builtin_roles() { + let store = RoleStore::new(test_backend()); + + // Initialize builtin roles + store.init_builtin_roles().await.unwrap(); + + // Verify they exist + let admin = store.get("SystemAdmin").await.unwrap(); + assert!(admin.is_some()); + assert!(admin.unwrap().builtin); + + // Try to delete builtin role + let result = store.delete("SystemAdmin").await; + assert!(result.is_err()); + + // Try to create role marked as builtin + let fake_builtin = + Role::builtin("FakeBuiltin", Scope::System, vec![Permission::wildcard()]); + let result = store.create(&fake_builtin).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_list_roles() { + let store = RoleStore::new(test_backend()); + + // Init builtin + store.init_builtin_roles().await.unwrap(); + + // Create custom role + let custom = Role::new( + "MyRole", + Scope::project("*", "*"), + vec![Permission::new("compute:*", "*")], + ); + store.create(&custom).await.unwrap(); + + // List all + let all = store.list().await.unwrap(); + assert!(all.len() > 1); + + // List custom only + let custom_roles = store.list_custom().await.unwrap(); + assert_eq!(custom_roles.len(), 1); + assert_eq!(custom_roles[0].name, "MyRole"); + + // List builtin only + let builtin_roles = store.list_builtin().await.unwrap(); + assert!(!builtin_roles.is_empty()); + assert!(builtin_roles.iter().all(|r| r.builtin)); + } + + #[tokio::test] + async fn test_list_by_scope() { + let store = RoleStore::new(test_backend()); + + store.init_builtin_roles().await.unwrap(); + + // List system scope roles + let 
system_roles = store.list_by_scope(&Scope::System).await.unwrap(); + assert!(!system_roles.is_empty()); + + // SystemAdmin should be in system scope + assert!(system_roles.iter().any(|r| r.name == "SystemAdmin")); + } + + #[tokio::test] + async fn test_duplicate_role() { + let store = RoleStore::new(test_backend()); + + let role = Role::new("TestRole", Scope::System, vec![]); + store.create(&role).await.unwrap(); + + // Try to create again + let result = store.create(&role).await; + assert!(result.is_err()); + } +} diff --git a/iam/crates/iam-store/src/token_store.rs b/iam/crates/iam-store/src/token_store.rs new file mode 100644 index 0000000..140c181 --- /dev/null +++ b/iam/crates/iam-store/src/token_store.rs @@ -0,0 +1,77 @@ +//! Token metadata storage +//! +//! Persists token metadata and revocation state for internal tokens. + +use std::sync::Arc; + +use iam_types::{Error, IamError, Result, StorageError, TokenMetadata}; + +use crate::backend::{Backend, CasResult, JsonStore}; + +/// Store for token metadata and revocations +pub struct TokenStore { + backend: Arc, +} + +impl JsonStore for TokenStore { + fn backend(&self) -> &Backend { + &self.backend + } +} + +impl TokenStore { + /// Create a new token store + pub fn new(backend: Arc) -> Self { + Self { backend } + } + + /// Persist token metadata + pub async fn put(&self, meta: &TokenMetadata) -> Result { + self.put_json(meta.storage_key().as_bytes(), meta).await + } + + /// Get metadata for a token + pub async fn get( + &self, + principal_id: &str, + token_id: &str, + ) -> Result> { + let key = format!("iam/tokens/{}/{}", principal_id, token_id); + self.get_json(key.as_bytes()).await + } + + /// Revoke a token if it exists + pub async fn revoke( + &self, + principal_id: &str, + token_id: &str, + reason: &str, + now: u64, + ) -> Result { + let key = format!("iam/tokens/{}/{}", principal_id, token_id); + let current = self.get_json::(key.as_bytes()).await?; + + let (mut meta, version) = match current { + Some(v) 
=> v, + None => return Ok(false), + }; + + if meta.revoked { + return Ok(false); + } + + meta.revoke(now, reason); + match self.cas_json(key.as_bytes(), version, &meta).await? { + CasResult::Success(_) => Ok(true), + CasResult::Conflict { expected, actual } => { + Err(Error::Storage(StorageError::CasConflict { + expected, + actual, + })) + } + CasResult::NotFound => Err(Error::Iam(IamError::InvalidToken( + "token disappeared during revoke".into(), + ))), + } + } +} diff --git a/iam/crates/iam-types/Cargo.toml b/iam/crates/iam-types/Cargo.toml new file mode 100644 index 0000000..4bfa5d2 --- /dev/null +++ b/iam/crates/iam-types/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "iam-types" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +description = "Core types for IAM (Identity and Access Management)" + +[dependencies] +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } +chrono = { workspace = true } + +[dev-dependencies] +serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/iam/crates/iam-types/src/condition.rs b/iam/crates/iam-types/src/condition.rs new file mode 100644 index 0000000..da8a785 --- /dev/null +++ b/iam/crates/iam-types/src/condition.rs @@ -0,0 +1,582 @@ +//! Condition types for ABAC (Attribute-Based Access Control) +//! +//! Conditions allow fine-grained access control based on attributes of +//! the principal, resource, and request context. 
+ +use serde::{Deserialize, Serialize}; + +/// A condition that must be satisfied for a permission to apply +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Condition { + /// The condition expression + pub expression: ConditionExpr, +} + +impl Condition { + /// Create a new condition + pub fn new(expression: ConditionExpr) -> Self { + Self { expression } + } + + /// Create a string equals condition + pub fn string_equals(key: impl Into, value: impl Into) -> Self { + Self::new(ConditionExpr::StringEquals { + key: key.into(), + value: value.into(), + }) + } + + /// Create a string not equals condition + pub fn string_not_equals(key: impl Into, value: impl Into) -> Self { + Self::new(ConditionExpr::StringNotEquals { + key: key.into(), + value: value.into(), + }) + } + + /// Create a string like condition (glob pattern) + pub fn string_like(key: impl Into, pattern: impl Into) -> Self { + Self::new(ConditionExpr::StringLike { + key: key.into(), + pattern: pattern.into(), + }) + } + + /// Create an IP address condition + pub fn ip_address(key: impl Into, cidr: impl Into) -> Self { + Self::new(ConditionExpr::IpAddress { + key: key.into(), + cidr: cidr.into(), + }) + } + + /// Create a time between condition + pub fn time_between(start: impl Into, end: impl Into) -> Self { + Self::new(ConditionExpr::TimeBetween { + start: start.into(), + end: end.into(), + }) + } + + /// Create an AND condition + pub fn and(conditions: Vec) -> Self { + Self::new(ConditionExpr::And( + conditions.into_iter().map(|c| c.expression).collect(), + )) + } + + /// Create an OR condition + pub fn or(conditions: Vec) -> Self { + Self::new(ConditionExpr::Or( + conditions.into_iter().map(|c| c.expression).collect(), + )) + } + + /// Create a NOT condition + pub fn not(condition: Condition) -> Self { + Self::new(ConditionExpr::Not(Box::new(condition.expression))) + } +} + +/// Condition expression types +/// +/// Note: Manual Serialize/Deserialize implementation to avoid deep 
recursion +/// during macro expansion (which would require recursion_limit = 2048). +#[derive(Debug, Clone, PartialEq)] +pub enum ConditionExpr { + /// String equality check + StringEquals { + /// Key to check (e.g., "resource.owner", "principal.id") + key: String, + /// Expected value (can use ${variable} syntax) + value: String, + }, + + /// String inequality check + StringNotEquals { key: String, value: String }, + + /// String pattern match (glob-style with * and ?) + StringLike { key: String, pattern: String }, + + /// String not matching pattern + StringNotLike { key: String, pattern: String }, + + /// Numeric equality + NumericEquals { key: String, value: i64 }, + + /// Numeric less than + NumericLessThan { key: String, value: i64 }, + + /// Numeric less than or equal + NumericLessThanEquals { key: String, value: i64 }, + + /// Numeric greater than + NumericGreaterThan { key: String, value: i64 }, + + /// Numeric greater than or equal + NumericGreaterThanEquals { key: String, value: i64 }, + + /// IP address in CIDR range + IpAddress { + /// Key to check (typically "request.source_ip") + key: String, + /// CIDR notation (e.g., "10.0.0.0/8") + cidr: String, + }, + + /// IP address not in CIDR range + NotIpAddress { key: String, cidr: String }, + + /// Time-based condition + TimeBetween { + /// Start time (ISO 8601 format or HH:MM) + start: String, + /// End time (ISO 8601 format or HH:MM) + end: String, + }, + + /// Check if a key exists and is not null + Exists { key: String }, + + /// Check if value is in a list + StringEqualsAny { key: String, values: Vec }, + + /// Boolean value check + Bool { key: String, value: bool }, + + /// Logical AND of multiple conditions + And(Vec), + + /// Logical OR of multiple conditions + Or(Vec), + + /// Logical NOT of a condition + Not(Box), +} + +impl ConditionExpr { + /// Check if this expression uses variable substitution + pub fn uses_variables(&self) -> bool { + match self { + ConditionExpr::StringEquals { value, .. 
} + | ConditionExpr::StringNotEquals { value, .. } => value.contains("${"), + ConditionExpr::StringLike { pattern, .. } + | ConditionExpr::StringNotLike { pattern, .. } => pattern.contains("${"), + ConditionExpr::And(exprs) | ConditionExpr::Or(exprs) => { + exprs.iter().any(|e| e.uses_variables()) + } + ConditionExpr::Not(expr) => expr.uses_variables(), + _ => false, + } + } +} + +// Manual Serialize implementation for ConditionExpr +// This avoids the deep trait bound recursion that occurs with derive macros +// on internally-tagged recursive enums. +impl Serialize for ConditionExpr { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeMap; + + match self { + ConditionExpr::StringEquals { key, value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "string_equals")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::StringNotEquals { key, value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "string_not_equals")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::StringLike { key, pattern } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "string_like")?; + map.serialize_entry("key", key)?; + map.serialize_entry("pattern", pattern)?; + map.end() + } + ConditionExpr::StringNotLike { key, pattern } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "string_not_like")?; + map.serialize_entry("key", key)?; + map.serialize_entry("pattern", pattern)?; + map.end() + } + ConditionExpr::NumericEquals { key, value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "numeric_equals")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::NumericLessThan { key, 
value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "numeric_less_than")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::NumericLessThanEquals { key, value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "numeric_less_than_equals")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::NumericGreaterThan { key, value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "numeric_greater_than")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::NumericGreaterThanEquals { key, value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "numeric_greater_than_equals")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::IpAddress { key, cidr } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "ip_address")?; + map.serialize_entry("key", key)?; + map.serialize_entry("cidr", cidr)?; + map.end() + } + ConditionExpr::NotIpAddress { key, cidr } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "not_ip_address")?; + map.serialize_entry("key", key)?; + map.serialize_entry("cidr", cidr)?; + map.end() + } + ConditionExpr::TimeBetween { start, end } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "time_between")?; + map.serialize_entry("start", start)?; + map.serialize_entry("end", end)?; + map.end() + } + ConditionExpr::Exists { key } => { + let mut map = serializer.serialize_map(Some(2))?; + map.serialize_entry("type", "exists")?; + map.serialize_entry("key", key)?; + map.end() + } + ConditionExpr::StringEqualsAny { key, values } => { + let mut map = 
serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "string_equals_any")?; + map.serialize_entry("key", key)?; + map.serialize_entry("values", values)?; + map.end() + } + ConditionExpr::Bool { key, value } => { + let mut map = serializer.serialize_map(Some(3))?; + map.serialize_entry("type", "bool")?; + map.serialize_entry("key", key)?; + map.serialize_entry("value", value)?; + map.end() + } + ConditionExpr::And(exprs) => { + let mut map = serializer.serialize_map(Some(2))?; + map.serialize_entry("type", "and")?; + map.serialize_entry("conditions", exprs)?; + map.end() + } + ConditionExpr::Or(exprs) => { + let mut map = serializer.serialize_map(Some(2))?; + map.serialize_entry("type", "or")?; + map.serialize_entry("conditions", exprs)?; + map.end() + } + ConditionExpr::Not(expr) => { + let mut map = serializer.serialize_map(Some(2))?; + map.serialize_entry("type", "not")?; + map.serialize_entry("condition", expr)?; + map.end() + } + } + } +} + +// Manual Deserialize implementation for ConditionExpr +impl<'de> Deserialize<'de> for ConditionExpr { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + use serde::de::{Error, MapAccess, Visitor}; + use std::fmt; + + struct ConditionExprVisitor; + + impl<'de> Visitor<'de> for ConditionExprVisitor { + type Value = ConditionExpr; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a condition expression object with a 'type' field") + } + + fn visit_map(self, mut map: M) -> Result + where + M: MapAccess<'de>, + { + let mut expr_type: Option = None; + let mut key: Option = None; + let mut value_str: Option = None; + let mut value_i64: Option = None; + let mut value_bool: Option = None; + let mut pattern: Option = None; + let mut cidr: Option = None; + let mut start: Option = None; + let mut end: Option = None; + let mut values: Option> = None; + let mut conditions: Option> = None; + let mut condition: Option> = None; + + while let 
Some(field) = map.next_key::()? { + match field.as_str() { + "type" => expr_type = Some(map.next_value()?), + "key" => key = Some(map.next_value()?), + "value" => { + // Value can be string, i64, or bool depending on type + // We'll try to deserialize as each + let raw: serde_json::Value = map.next_value()?; + match &raw { + serde_json::Value::String(s) => value_str = Some(s.clone()), + serde_json::Value::Number(n) => { + value_i64 = n.as_i64(); + } + serde_json::Value::Bool(b) => value_bool = Some(*b), + _ => {} + } + } + "pattern" => pattern = Some(map.next_value()?), + "cidr" => cidr = Some(map.next_value()?), + "start" => start = Some(map.next_value()?), + "end" => end = Some(map.next_value()?), + "values" => values = Some(map.next_value()?), + "conditions" => conditions = Some(map.next_value()?), + "condition" => condition = Some(map.next_value()?), + _ => { + // Skip unknown fields + let _: serde_json::Value = map.next_value()?; + } + } + } + + let expr_type = + expr_type.ok_or_else(|| Error::missing_field("type"))?; + + match expr_type.as_str() { + "string_equals" => Ok(ConditionExpr::StringEquals { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_str.ok_or_else(|| Error::missing_field("value"))?, + }), + "string_not_equals" => Ok(ConditionExpr::StringNotEquals { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_str.ok_or_else(|| Error::missing_field("value"))?, + }), + "string_like" => Ok(ConditionExpr::StringLike { + key: key.ok_or_else(|| Error::missing_field("key"))?, + pattern: pattern.ok_or_else(|| Error::missing_field("pattern"))?, + }), + "string_not_like" => Ok(ConditionExpr::StringNotLike { + key: key.ok_or_else(|| Error::missing_field("key"))?, + pattern: pattern.ok_or_else(|| Error::missing_field("pattern"))?, + }), + "numeric_equals" => Ok(ConditionExpr::NumericEquals { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_i64.ok_or_else(|| Error::missing_field("value"))?, + }), + 
"numeric_less_than" => Ok(ConditionExpr::NumericLessThan { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_i64.ok_or_else(|| Error::missing_field("value"))?, + }), + "numeric_less_than_equals" => Ok(ConditionExpr::NumericLessThanEquals { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_i64.ok_or_else(|| Error::missing_field("value"))?, + }), + "numeric_greater_than" => Ok(ConditionExpr::NumericGreaterThan { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_i64.ok_or_else(|| Error::missing_field("value"))?, + }), + "numeric_greater_than_equals" => Ok(ConditionExpr::NumericGreaterThanEquals { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_i64.ok_or_else(|| Error::missing_field("value"))?, + }), + "ip_address" => Ok(ConditionExpr::IpAddress { + key: key.ok_or_else(|| Error::missing_field("key"))?, + cidr: cidr.ok_or_else(|| Error::missing_field("cidr"))?, + }), + "not_ip_address" => Ok(ConditionExpr::NotIpAddress { + key: key.ok_or_else(|| Error::missing_field("key"))?, + cidr: cidr.ok_or_else(|| Error::missing_field("cidr"))?, + }), + "time_between" => Ok(ConditionExpr::TimeBetween { + start: start.ok_or_else(|| Error::missing_field("start"))?, + end: end.ok_or_else(|| Error::missing_field("end"))?, + }), + "exists" => Ok(ConditionExpr::Exists { + key: key.ok_or_else(|| Error::missing_field("key"))?, + }), + "string_equals_any" => Ok(ConditionExpr::StringEqualsAny { + key: key.ok_or_else(|| Error::missing_field("key"))?, + values: values.ok_or_else(|| Error::missing_field("values"))?, + }), + "bool" => Ok(ConditionExpr::Bool { + key: key.ok_or_else(|| Error::missing_field("key"))?, + value: value_bool.ok_or_else(|| Error::missing_field("value"))?, + }), + "and" => Ok(ConditionExpr::And( + conditions.ok_or_else(|| Error::missing_field("conditions"))?, + )), + "or" => Ok(ConditionExpr::Or( + conditions.ok_or_else(|| Error::missing_field("conditions"))?, + )), + "not" => 
Ok(ConditionExpr::Not( + condition.ok_or_else(|| Error::missing_field("condition"))?, + )), + other => Err(Error::unknown_variant( + other, + &[ + "string_equals", + "string_not_equals", + "string_like", + "string_not_like", + "numeric_equals", + "numeric_less_than", + "numeric_less_than_equals", + "numeric_greater_than", + "numeric_greater_than_equals", + "ip_address", + "not_ip_address", + "time_between", + "exists", + "string_equals_any", + "bool", + "and", + "or", + "not", + ], + )), + } + } + } + + deserializer.deserialize_map(ConditionExprVisitor) + } +} + +/// Context keys that can be used in conditions +pub mod keys { + /// Principal-related keys + pub mod principal { + pub const ID: &str = "principal.id"; + pub const KIND: &str = "principal.kind"; + pub const ORG_ID: &str = "principal.org_id"; + pub const PROJECT_ID: &str = "principal.project_id"; + pub const NODE_ID: &str = "principal.node_id"; + } + + /// Resource-related keys + pub mod resource { + pub const KIND: &str = "resource.kind"; + pub const ID: &str = "resource.id"; + pub const PROJECT_ID: &str = "resource.project_id"; + pub const OWNER: &str = "resource.owner"; + pub const NODE: &str = "resource.node"; + pub const REGION: &str = "resource.region"; + } + + /// Request-related keys + pub mod request { + pub const SOURCE_IP: &str = "request.source_ip"; + pub const TIME: &str = "request.time"; + pub const AUTH_LEVEL: &str = "request.auth_level"; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_condition_serialization() { + let cond = Condition::string_equals("resource.owner", "${principal.id}"); + let json = serde_json::to_string(&cond).unwrap(); + assert!(json.contains("string_equals")); + + let parsed: Condition = serde_json::from_str(&json).unwrap(); + assert_eq!(cond, parsed); + } + + #[test] + fn test_condition_and() { + let cond = Condition::and(vec![ + Condition::string_equals("resource.owner", "${principal.id}"), + Condition::ip_address("request.source_ip", 
"10.0.0.0/8"), + ]); + + if let ConditionExpr::And(exprs) = &cond.expression { + assert_eq!(exprs.len(), 2); + } else { + panic!("expected And expression"); + } + } + + #[test] + fn test_uses_variables() { + let cond = Condition::string_equals("resource.owner", "${principal.id}"); + assert!(cond.expression.uses_variables()); + + let cond = Condition::string_equals("resource.owner", "fixed-value"); + assert!(!cond.expression.uses_variables()); + } + + #[test] + fn test_nested_condition_serialization() { + let cond = Condition::and(vec![ + Condition::string_equals("a", "b"), + Condition::or(vec![ + Condition::ip_address("ip", "10.0.0.0/8"), + Condition::not(Condition::string_like("key", "pattern*")), + ]), + ]); + + let json = serde_json::to_string_pretty(&cond).unwrap(); + let parsed: Condition = serde_json::from_str(&json).unwrap(); + assert_eq!(cond, parsed); + } + + #[test] + fn test_numeric_condition_serialization() { + let cond = Condition::new(ConditionExpr::NumericGreaterThan { + key: "count".to_string(), + value: 100, + }); + let json = serde_json::to_string(&cond).unwrap(); + let parsed: Condition = serde_json::from_str(&json).unwrap(); + assert_eq!(cond, parsed); + } + + #[test] + fn test_bool_condition_serialization() { + let cond = Condition::new(ConditionExpr::Bool { + key: "is_admin".to_string(), + value: true, + }); + let json = serde_json::to_string(&cond).unwrap(); + let parsed: Condition = serde_json::from_str(&json).unwrap(); + assert_eq!(cond, parsed); + } +} diff --git a/iam/crates/iam-types/src/error.rs b/iam/crates/iam-types/src/error.rs new file mode 100644 index 0000000..431f5f9 --- /dev/null +++ b/iam/crates/iam-types/src/error.rs @@ -0,0 +1,159 @@ +//! 
Error types for IAM + +use thiserror::Error; + +/// Result type alias using IAM's Error +pub type Result = std::result::Result; + +/// Main error type for IAM operations +#[derive(Error, Debug)] +pub enum Error { + /// IAM-specific errors + #[error("IAM error: {0}")] + Iam(#[from] IamError), + + /// Storage layer errors + #[error("Storage error: {0}")] + Storage(#[from] StorageError), + + /// Serialization errors + #[error("Serialization error: {0}")] + Serialization(String), + + /// Configuration errors + #[error("Configuration error: {0}")] + Config(String), + + /// Generic internal error + #[error("Internal error: {0}")] + Internal(String), +} + +/// IAM-specific errors +#[derive(Error, Debug)] +pub enum IamError { + /// Authentication failed + #[error("Authentication failed: {0}")] + AuthnFailed(String), + + /// Authorization denied + #[error("Authorization denied: {reason}")] + AuthzDenied { reason: String }, + + /// Principal not found + #[error("Principal not found: {0}")] + PrincipalNotFound(String), + + /// Role not found + #[error("Role not found: {0}")] + RoleNotFound(String), + + /// Binding not found + #[error("Binding not found: {0}")] + BindingNotFound(String), + + /// Invalid token + #[error("Invalid token: {0}")] + InvalidToken(String), + + /// Token expired + #[error("Token expired")] + TokenExpired, + + /// Invalid scope + #[error("Invalid scope: {0}")] + InvalidScope(String), + + /// Invalid condition + #[error("Invalid condition: {0}")] + InvalidCondition(String), + + /// Invalid action format + #[error("Invalid action format: {0}")] + InvalidAction(String), + + /// Invalid resource pattern + #[error("Invalid resource pattern: {0}")] + InvalidResourcePattern(String), + + /// Principal already exists + #[error("Principal already exists: {0}")] + PrincipalAlreadyExists(String), + + /// Role already exists + #[error("Role already exists: {0}")] + RoleAlreadyExists(String), + + /// Cannot modify builtin role + #[error("Cannot modify builtin role: 
{0}")] + CannotModifyBuiltinRole(String), + + /// Binding already exists + #[error("Binding already exists: {0}")] + BindingAlreadyExists(String), + + /// Insufficient permissions + #[error("Insufficient permissions to perform this operation")] + InsufficientPermissions, +} + +/// Storage layer errors +#[derive(Error, Debug)] +pub enum StorageError { + /// Key not found + #[error("Key not found: {0}")] + KeyNotFound(String), + + /// IO error + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + /// Backend error + #[error("Backend error: {0}")] + Backend(String), + + /// Serialization error + #[error("Serialization error: {0}")] + Serialization(String), + + /// Connection error + #[error("Connection error: {0}")] + Connection(String), + + /// CAS conflict + #[error("CAS conflict: expected version {expected}, actual version {actual}")] + CasConflict { expected: u64, actual: u64 }, + + /// Timeout + #[error("Storage operation timeout")] + Timeout, +} + +impl From for Error { + fn from(err: serde_json::Error) -> Self { + Error::Serialization(err.to_string()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_error_display() { + let err = Error::Iam(IamError::AuthnFailed("invalid credentials".into())); + assert!(err.to_string().contains("Authentication failed")); + + let err = Error::Iam(IamError::AuthzDenied { + reason: "no matching policy".into(), + }); + assert!(err.to_string().contains("Authorization denied")); + } + + #[test] + fn test_error_conversion() { + let iam_err = IamError::PrincipalNotFound("user:alice".into()); + let err: Error = iam_err.into(); + assert!(matches!(err, Error::Iam(_))); + } +} diff --git a/iam/crates/iam-types/src/lib.rs b/iam/crates/iam-types/src/lib.rs new file mode 100644 index 0000000..bffbac0 --- /dev/null +++ b/iam/crates/iam-types/src/lib.rs @@ -0,0 +1,30 @@ +//! Core types for IAM (Identity and Access Management) +//! +//! 
This crate contains all shared type definitions used across the IAM system: +//! - Principal types (User, ServiceAccount, Group) +//! - Resource representation +//! - Role and Permission definitions +//! - Policy bindings +//! - Condition expressions for ABAC +//! - Token claims +//! - Error types + +pub mod condition; +pub mod error; +pub mod policy; +pub mod principal; +pub mod resource; +pub mod role; +pub mod scope; +pub mod token; + +pub use condition::{Condition, ConditionExpr}; +pub use error::{Error, IamError, Result, StorageError}; +pub use policy::{CreateBindingRequest, EffectivePolicy, PolicyBinding}; +pub use principal::{Principal, PrincipalKind, PrincipalRef}; +pub use resource::{Resource, ResourceRef}; +pub use role::{builtin as builtin_roles, Permission, Role}; +pub use scope::Scope; +pub use token::{ + AuthMethod, InternalTokenClaims, JwtClaims, TokenMetadata, TokenType, TokenValidationError, +}; diff --git a/iam/crates/iam-types/src/policy.rs b/iam/crates/iam-types/src/policy.rs new file mode 100644 index 0000000..415e805 --- /dev/null +++ b/iam/crates/iam-types/src/policy.rs @@ -0,0 +1,318 @@ +//! Policy types for IAM +//! +//! Policies bind principals to roles within a specific scope. + +use serde::{Deserialize, Serialize}; + +use crate::condition::Condition; +use crate::principal::PrincipalRef; +use crate::scope::Scope; + +/// A binding that assigns a role to a principal within a scope +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyBinding { + /// Unique identifier for this binding + pub id: String, + + /// Reference to the principal (e.g., "user:alice", "group:developers") + pub principal_ref: PrincipalRef, + + /// Reference to the role (e.g., "roles/ProjectAdmin") + pub role_ref: String, + + /// Scope where this binding applies + pub scope: Scope, + + /// Optional condition for this binding (time-limited, IP-restricted, etc.) 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub condition: Option, + + /// Creation timestamp (TSO format) + pub created_at: u64, + + /// Last update timestamp (TSO format) + pub updated_at: u64, + + /// Who created this binding + pub created_by: String, + + /// Optional expiration timestamp (TSO format) + #[serde(skip_serializing_if = "Option::is_none")] + pub expires_at: Option, + + /// Whether the binding is enabled + pub enabled: bool, +} + +impl PolicyBinding { + /// Create a new policy binding + pub fn new( + id: impl Into, + principal_ref: PrincipalRef, + role_ref: impl Into, + scope: Scope, + ) -> Self { + Self { + id: id.into(), + principal_ref, + role_ref: role_ref.into(), + scope, + condition: None, + created_at: 0, + updated_at: 0, + created_by: String::new(), + expires_at: None, + enabled: true, + } + } + + /// Set the condition + pub fn with_condition(mut self, condition: Condition) -> Self { + self.condition = Some(condition); + self + } + + /// Set the creator + pub fn with_created_by(mut self, created_by: impl Into) -> Self { + self.created_by = created_by.into(); + self + } + + /// Set expiration + pub fn with_expires_at(mut self, expires_at: u64) -> Self { + self.expires_at = Some(expires_at); + self + } + + /// Disable the binding + pub fn disabled(mut self) -> Self { + self.enabled = false; + self + } + + /// Check if the binding is expired (given current TSO timestamp) + pub fn is_expired(&self, now: u64) -> bool { + self.expires_at.map_or(false, |exp| now > exp) + } + + /// Check if the binding is active (enabled and not expired) + pub fn is_active(&self, now: u64) -> bool { + self.enabled && !self.is_expired(now) + } + + /// Generate a storage key for this binding + /// + /// Key format: `iam/bindings/scope/{scope}/principal/{principal_kind}:{principal_id}/{binding_id}` + pub fn storage_key(&self) -> String { + format!( + "iam/bindings/scope/{}/principal/{}/{}", + self.scope.to_key(), + self.principal_ref, + self.id + ) + } + + /// 
Generate a secondary index key by principal + /// + /// Key format: `iam/bindings/by-principal/{principal_kind}:{principal_id}/{binding_id}` + pub fn principal_index_key(&self) -> String { + format!( + "iam/bindings/by-principal/{}/{}", + self.principal_ref, self.id + ) + } + + /// Generate a secondary index key by role + /// + /// Key format: `iam/bindings/by-role/{role_ref}/{binding_id}` + pub fn role_index_key(&self) -> String { + format!("iam/bindings/by-role/{}/{}", self.role_ref, self.id) + } +} + +/// Request to create a new binding +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CreateBindingRequest { + /// Principal to bind + pub principal_ref: PrincipalRef, + + /// Role to assign + pub role_ref: String, + + /// Scope for the binding + pub scope: Scope, + + /// Optional condition + #[serde(skip_serializing_if = "Option::is_none")] + pub condition: Option, + + /// Optional expiration (seconds from now) + #[serde(skip_serializing_if = "Option::is_none")] + pub ttl_seconds: Option, +} + +/// Result of checking a binding's effectiveness +#[derive(Debug, Clone)] +pub struct BindingEffect { + /// The binding that was checked + pub binding: PolicyBinding, + + /// Whether the binding is currently effective + pub effective: bool, + + /// Reason if not effective + pub reason: Option, +} + +/// Aggregated policy for a principal +#[derive(Debug, Clone, Default)] +pub struct EffectivePolicy { + /// All active bindings for the principal + pub bindings: Vec, + + /// Roles referenced by the bindings + pub roles: Vec, + + /// Scopes where the principal has permissions + pub scopes: Vec, +} + +impl EffectivePolicy { + /// Create an empty effective policy + pub fn new() -> Self { + Self::default() + } + + /// Add a binding to the effective policy + pub fn add_binding(&mut self, binding: PolicyBinding) { + if !self.roles.contains(&binding.role_ref) { + self.roles.push(binding.role_ref.clone()); + } + if !self.scopes.contains(&binding.scope) { + 
self.scopes.push(binding.scope.clone()); + } + self.bindings.push(binding); + } + + /// Check if the principal has any bindings + pub fn has_bindings(&self) -> bool { + !self.bindings.is_empty() + } + + /// Get bindings for a specific scope + pub fn bindings_for_scope(&self, scope: &Scope) -> Vec<&PolicyBinding> { + self.bindings + .iter() + .filter(|b| b.scope.contains(scope) || scope.contains(&b.scope)) + .collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::principal::PrincipalKind; + + #[test] + fn test_binding_creation() { + let binding = PolicyBinding::new( + "binding-1", + PrincipalRef::user("alice"), + "roles/ProjectAdmin", + Scope::project("my-project", "my-org"), + ); + + assert_eq!(binding.id, "binding-1"); + assert_eq!(binding.principal_ref.kind, PrincipalKind::User); + assert_eq!(binding.principal_ref.id, "alice"); + assert!(binding.enabled); + } + + #[test] + fn test_binding_expiration() { + let binding = PolicyBinding::new( + "binding-1", + PrincipalRef::user("alice"), + "roles/ReadOnly", + Scope::System, + ) + .with_expires_at(1000); + + assert!(!binding.is_expired(500)); + assert!(binding.is_expired(1001)); + assert!(binding.is_active(500)); + assert!(!binding.is_active(1001)); + } + + #[test] + fn test_binding_disabled() { + let binding = PolicyBinding::new( + "binding-1", + PrincipalRef::user("alice"), + "roles/ReadOnly", + Scope::System, + ) + .disabled(); + + assert!(!binding.enabled); + assert!(!binding.is_active(0)); + } + + #[test] + fn test_storage_key() { + let binding = PolicyBinding::new( + "binding-123", + PrincipalRef::user("alice"), + "roles/ProjectAdmin", + Scope::project("proj-1", "org-1"), + ); + + let key = binding.storage_key(); + assert!(key.contains("project/proj-1")); + assert!(key.contains("user:alice")); + assert!(key.contains("binding-123")); + } + + #[test] + fn test_effective_policy() { + let mut policy = EffectivePolicy::new(); + + policy.add_binding(PolicyBinding::new( + "b1", + 
PrincipalRef::user("alice"), + "roles/ProjectAdmin", + Scope::project("proj-1", "org-1"), + )); + + policy.add_binding(PolicyBinding::new( + "b2", + PrincipalRef::user("alice"), + "roles/ReadOnly", + Scope::project("proj-2", "org-1"), + )); + + assert!(policy.has_bindings()); + assert_eq!(policy.bindings.len(), 2); + assert_eq!(policy.roles.len(), 2); + assert_eq!(policy.scopes.len(), 2); + } + + #[test] + fn test_binding_serialization() { + let binding = PolicyBinding::new( + "binding-1", + PrincipalRef::service_account("compute-agent"), + "roles/ServiceRole-ComputeAgent", + Scope::System, + ) + .with_condition(Condition::ip_address("request.source_ip", "10.0.0.0/8")) + .with_created_by("admin"); + + let json = serde_json::to_string(&binding).unwrap(); + let parsed: PolicyBinding = serde_json::from_str(&json).unwrap(); + + assert_eq!(parsed.id, "binding-1"); + assert!(parsed.condition.is_some()); + assert_eq!(parsed.created_by, "admin"); + } +} diff --git a/iam/crates/iam-types/src/principal.rs b/iam/crates/iam-types/src/principal.rs new file mode 100644 index 0000000..ba47c58 --- /dev/null +++ b/iam/crates/iam-types/src/principal.rs @@ -0,0 +1,235 @@ +//! Principal types for IAM +//! +//! Principals represent identities that can be authenticated and authorized. 
+ +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fmt; + +/// Kind of principal +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PrincipalKind { + /// Human user + User, + /// Service account (for machine-to-machine auth) + ServiceAccount, + /// Group of principals + Group, +} + +impl fmt::Display for PrincipalKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + PrincipalKind::User => write!(f, "user"), + PrincipalKind::ServiceAccount => write!(f, "service_account"), + PrincipalKind::Group => write!(f, "group"), + } + } +} + +impl std::str::FromStr for PrincipalKind { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "user" => Ok(PrincipalKind::User), + "service_account" | "sa" => Ok(PrincipalKind::ServiceAccount), + "group" => Ok(PrincipalKind::Group), + _ => Err(format!("unknown principal kind: {}", s)), + } + } +} + +/// A principal (identity) in the IAM system +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Principal { + /// Unique identifier + pub id: String, + + /// Kind of principal + pub kind: PrincipalKind, + + /// Display name + pub name: String, + + /// Optional organization ID + pub org_id: Option, + + /// Optional project ID (for service accounts) + pub project_id: Option, + + /// Email address (for users) + pub email: Option, + + /// OIDC subject (for federated users) + pub oidc_sub: Option, + + /// Node ID (for service accounts bound to nodes) + pub node_id: Option, + + /// Arbitrary metadata + pub metadata: HashMap, + + /// Creation timestamp (TSO format) + pub created_at: u64, + + /// Last update timestamp (TSO format) + pub updated_at: u64, + + /// Whether the principal is enabled + pub enabled: bool, +} + +impl Principal { + /// Create a new user principal + pub fn new_user(id: impl Into, name: impl Into) -> Self { + Self { + id: id.into(), + kind: PrincipalKind::User, + name: 
name.into(), + org_id: None, + project_id: None, + email: None, + oidc_sub: None, + node_id: None, + metadata: HashMap::new(), + created_at: 0, + updated_at: 0, + enabled: true, + } + } + + /// Create a new service account principal + pub fn new_service_account( + id: impl Into, + name: impl Into, + project_id: impl Into, + ) -> Self { + Self { + id: id.into(), + kind: PrincipalKind::ServiceAccount, + name: name.into(), + org_id: None, + project_id: Some(project_id.into()), + email: None, + oidc_sub: None, + node_id: None, + metadata: HashMap::new(), + created_at: 0, + updated_at: 0, + enabled: true, + } + } + + /// Create a new group principal + pub fn new_group(id: impl Into, name: impl Into) -> Self { + Self { + id: id.into(), + kind: PrincipalKind::Group, + name: name.into(), + org_id: None, + project_id: None, + email: None, + oidc_sub: None, + node_id: None, + metadata: HashMap::new(), + created_at: 0, + updated_at: 0, + enabled: true, + } + } + + /// Get the principal reference string (e.g., "user:alice") + pub fn to_ref(&self) -> PrincipalRef { + PrincipalRef { + kind: self.kind.clone(), + id: self.id.clone(), + } + } +} + +/// A reference to a principal (kind:id format) +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct PrincipalRef { + pub kind: PrincipalKind, + pub id: String, +} + +impl PrincipalRef { + /// Create a new principal reference + pub fn new(kind: PrincipalKind, id: impl Into) -> Self { + Self { + kind, + id: id.into(), + } + } + + /// Create a user reference + pub fn user(id: impl Into) -> Self { + Self::new(PrincipalKind::User, id) + } + + /// Create a service account reference + pub fn service_account(id: impl Into) -> Self { + Self::new(PrincipalKind::ServiceAccount, id) + } + + /// Create a group reference + pub fn group(id: impl Into) -> Self { + Self::new(PrincipalKind::Group, id) + } +} + +impl fmt::Display for PrincipalRef { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, 
"{}:{}", self.kind, self.id) + } +} + +impl std::str::FromStr for PrincipalRef { + type Err = String; + + fn from_str(s: &str) -> Result { + let parts: Vec<&str> = s.splitn(2, ':').collect(); + if parts.len() != 2 { + return Err(format!("invalid principal ref format: {}", s)); + } + let kind = parts[0].parse()?; + Ok(PrincipalRef { + kind, + id: parts[1].to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_principal_ref_parse() { + let pr: PrincipalRef = "user:alice".parse().unwrap(); + assert_eq!(pr.kind, PrincipalKind::User); + assert_eq!(pr.id, "alice"); + + let pr: PrincipalRef = "service_account:compute-agent".parse().unwrap(); + assert_eq!(pr.kind, PrincipalKind::ServiceAccount); + assert_eq!(pr.id, "compute-agent"); + } + + #[test] + fn test_principal_ref_display() { + let pr = PrincipalRef::user("alice"); + assert_eq!(pr.to_string(), "user:alice"); + } + + #[test] + fn test_principal_serialization() { + let principal = Principal::new_user("u1", "Alice"); + let json = serde_json::to_string(&principal).unwrap(); + let parsed: Principal = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed.id, "u1"); + assert_eq!(parsed.name, "Alice"); + } +} diff --git a/iam/crates/iam-types/src/resource.rs b/iam/crates/iam-types/src/resource.rs new file mode 100644 index 0000000..c876126 --- /dev/null +++ b/iam/crates/iam-types/src/resource.rs @@ -0,0 +1,214 @@ +//! Resource types for IAM +//! +//! Resources represent objects that can be protected by IAM policies. 
+ +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fmt; + +/// A resource that can be protected by IAM policies +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Resource { + /// Resource kind (e.g., "instance", "volume", "bucket") + pub kind: String, + + /// Resource identifier + pub id: String, + + /// Organization ID that owns this resource + pub org_id: String, + + /// Project ID that owns this resource + pub project_id: String, + + /// Optional owner principal ID + pub owner_id: Option, + + /// Node ID where the resource is located (for node-scoped resources) + pub node_id: Option, + + /// Region/zone information + pub region: Option, + + /// Resource tags + pub tags: HashMap, +} + +impl Resource { + /// Create a new resource + pub fn new( + kind: impl Into, + id: impl Into, + org_id: impl Into, + project_id: impl Into, + ) -> Self { + Self { + kind: kind.into(), + id: id.into(), + org_id: org_id.into(), + project_id: project_id.into(), + owner_id: None, + node_id: None, + region: None, + tags: HashMap::new(), + } + } + + /// Set the owner + pub fn with_owner(mut self, owner_id: impl Into) -> Self { + self.owner_id = Some(owner_id.into()); + self + } + + /// Set the node ID + pub fn with_node(mut self, node_id: impl Into) -> Self { + self.node_id = Some(node_id.into()); + self + } + + /// Set the region + pub fn with_region(mut self, region: impl Into) -> Self { + self.region = Some(region.into()); + self + } + + /// Add a tag + pub fn with_tag(mut self, key: impl Into, value: impl Into) -> Self { + self.tags.insert(key.into(), value.into()); + self + } + + /// Convert to resource path (e.g., "org/my-org/project/my-project/instances/vm-123") + pub fn to_path(&self) -> String { + format!( + "org/{}/project/{}/{}/{}", + self.org_id, self.project_id, self.kind, self.id + ) + } + + /// Get a property value for condition evaluation + pub fn get_property(&self, key: &str) -> Option { + match key { + "kind" => 
Some(self.kind.clone()), + "id" => Some(self.id.clone()), + "org_id" | "org" => Some(self.org_id.clone()), + "project_id" | "project" => Some(self.project_id.clone()), + "owner" | "owner_id" => self.owner_id.clone(), + "node" | "node_id" => self.node_id.clone(), + "region" => self.region.clone(), + key if key.starts_with("tags.") => { + let tag_key = &key[5..]; + self.tags.get(tag_key).cloned() + } + _ => None, + } + } +} + +impl fmt::Display for Resource { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.to_path()) + } +} + +/// Reference to a resource (kind:org:project:id or pattern) +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ResourceRef { + pub kind: String, + pub org_id: String, + pub project_id: String, + pub id: String, +} + +impl ResourceRef { + /// Create a new resource reference + pub fn new( + kind: impl Into, + org_id: impl Into, + project_id: impl Into, + id: impl Into, + ) -> Self { + Self { + kind: kind.into(), + org_id: org_id.into(), + project_id: project_id.into(), + id: id.into(), + } + } + + /// Create from a Resource + pub fn from_resource(resource: &Resource) -> Self { + Self { + kind: resource.kind.clone(), + org_id: resource.org_id.clone(), + project_id: resource.project_id.clone(), + id: resource.id.clone(), + } + } +} + +impl fmt::Display for ResourceRef { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "org/{}/project/{}/{}/{}", + self.org_id, self.project_id, self.kind, self.id + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resource_path() { + let resource = Resource::new("instance", "vm-123", "my-org", "my-project"); + assert_eq!( + resource.to_path(), + "org/my-org/project/my-project/instance/vm-123" + ); + } + + #[test] + fn test_resource_property() { + let resource = Resource::new("instance", "vm-123", "my-org", "my-project") + .with_owner("user:alice") + .with_tag("env", "prod"); + + 
assert_eq!(resource.get_property("kind"), Some("instance".into())); + assert_eq!(resource.get_property("org_id"), Some("my-org".into())); + assert_eq!( + resource.get_property("project_id"), + Some("my-project".into()) + ); + assert_eq!(resource.get_property("owner"), Some("user:alice".into())); + assert_eq!(resource.get_property("tags.env"), Some("prod".into())); + assert_eq!(resource.get_property("tags.unknown"), None); + } + + #[test] + fn test_resource_serialization() { + let resource = + Resource::new("instance", "vm-123", "my-org", "my-project").with_owner("user:alice"); + let json = serde_json::to_string(&resource).unwrap(); + let parsed: Resource = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed.kind, "instance"); + assert_eq!(parsed.org_id, "my-org"); + assert_eq!(parsed.owner_id, Some("user:alice".into())); + } + + #[test] + fn test_resource_ref() { + let resource = Resource::new("instance", "vm-123", "my-org", "my-project"); + let resource_ref = ResourceRef::from_resource(&resource); + assert_eq!(resource_ref.kind, "instance"); + assert_eq!(resource_ref.org_id, "my-org"); + assert_eq!(resource_ref.project_id, "my-project"); + assert_eq!(resource_ref.id, "vm-123"); + assert_eq!( + resource_ref.to_string(), + "org/my-org/project/my-project/instance/vm-123" + ); + } +} diff --git a/iam/crates/iam-types/src/role.rs b/iam/crates/iam-types/src/role.rs new file mode 100644 index 0000000..14c325f --- /dev/null +++ b/iam/crates/iam-types/src/role.rs @@ -0,0 +1,310 @@ +//! Role and Permission types for IAM +//! +//! Roles define sets of permissions that can be assigned to principals. 
+ +use serde::{Deserialize, Serialize}; + +use crate::condition::Condition; +use crate::scope::Scope; + +/// A role that groups permissions together +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Role { + /// Role name (e.g., "ProjectAdmin", "ReadOnly") + pub name: String, + + /// Display name for UI + pub display_name: String, + + /// Description of the role + pub description: String, + + /// Scope where this role can be applied + pub scope: Scope, + + /// Permissions granted by this role + pub permissions: Vec, + + /// Whether this is a builtin system role + pub builtin: bool, + + /// Creation timestamp (TSO format) + pub created_at: u64, + + /// Last update timestamp (TSO format) + pub updated_at: u64, +} + +impl Role { + /// Create a new custom role + pub fn new(name: impl Into, scope: Scope, permissions: Vec) -> Self { + Self { + name: name.into(), + display_name: String::new(), + description: String::new(), + scope, + permissions, + builtin: false, + created_at: 0, + updated_at: 0, + } + } + + /// Create a builtin role + pub fn builtin(name: impl Into, scope: Scope, permissions: Vec) -> Self { + let name = name.into(); + Self { + display_name: name.clone(), + name, + description: String::new(), + scope, + permissions, + builtin: true, + created_at: 0, + updated_at: 0, + } + } + + /// Set display name + pub fn with_display_name(mut self, display_name: impl Into) -> Self { + self.display_name = display_name.into(); + self + } + + /// Set description + pub fn with_description(mut self, description: impl Into) -> Self { + self.description = description.into(); + self + } + + /// Get the role reference string (e.g., "roles/ProjectAdmin") + pub fn to_ref(&self) -> String { + format!("roles/{}", self.name) + } +} + +/// A permission within a role +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Permission { + /// Action pattern (e.g., "compute:instances:create", "compute:*:read") + pub action: String, + + /// Resource pattern (e.g., 
"project/*/instances/*") + pub resource_pattern: String, + + /// Optional condition for this permission (ABAC) + #[serde(skip_serializing_if = "Option::is_none")] + pub condition: Option, +} + +impl Permission { + /// Create a new permission + pub fn new(action: impl Into, resource_pattern: impl Into) -> Self { + Self { + action: action.into(), + resource_pattern: resource_pattern.into(), + condition: None, + } + } + + /// Create a permission with a condition + pub fn with_condition(mut self, condition: Condition) -> Self { + self.condition = Some(condition); + self + } + + /// Create a wildcard permission (all actions on all resources) + pub fn wildcard() -> Self { + Self::new("*", "*") + } + + /// Check if this is a wildcard permission + pub fn is_wildcard(&self) -> bool { + self.action == "*" && self.resource_pattern == "*" + } +} + +/// Builtin roles +pub mod builtin { + use super::*; + + /// System administrator - full access to everything + pub fn system_admin() -> Role { + Role::builtin("SystemAdmin", Scope::System, vec![Permission::wildcard()]) + .with_display_name("System Administrator") + .with_description("Full access to all resources in the system") + } + + /// Organization administrator + pub fn org_admin() -> Role { + Role::builtin( + "OrgAdmin", + Scope::org("*"), + vec![Permission::new("*", "org/${org}/*")], + ) + .with_display_name("Organization Administrator") + .with_description("Full access to all resources within the organization") + } + + /// Project administrator + pub fn project_admin() -> Role { + Role::builtin( + "ProjectAdmin", + Scope::project("*", "*"), + vec![Permission::new("*", "project/${project}/*")], + ) + .with_display_name("Project Administrator") + .with_description("Full access to all resources within the project") + } + + /// Project member - can manage own resources + pub fn project_member() -> Role { + Role::builtin( + "ProjectMember", + Scope::project("*", "*"), + vec![ + // Full access to own resources + 
Permission::new("compute:instances:*", "project/${project}/instances/*") + .with_condition(Condition::string_equals( + "resource.owner", + "${principal.id}", + )), + // Read access to all project resources + Permission::new("*:read", "project/${project}/*"), + Permission::new("*:list", "project/${project}/*"), + ], + ) + .with_display_name("Project Member") + .with_description("Can manage own resources and read all project resources") + } + + /// Read-only access + pub fn read_only() -> Role { + Role::builtin( + "ReadOnly", + Scope::project("*", "*"), + vec![ + Permission::new("*:read", "project/${project}/*"), + Permission::new("*:list", "project/${project}/*"), + Permission::new("*:get", "project/${project}/*"), + ], + ) + .with_display_name("Read Only") + .with_description("Read-only access to all project resources") + } + + /// Compute agent service role + pub fn compute_agent() -> Role { + Role::builtin( + "ServiceRole-ComputeAgent", + Scope::System, + vec![Permission::new("compute:instances:*", "*").with_condition( + Condition::string_equals("resource.node", "${principal.node_id}"), + )], + ) + .with_display_name("Compute Agent") + .with_description("Service role for compute agents - can manage instances on their node") + } + + /// Storage agent service role + pub fn storage_agent() -> Role { + Role::builtin( + "ServiceRole-StorageAgent", + Scope::System, + vec![Permission::new("storage:volumes:*", "*").with_condition( + Condition::string_equals("resource.node", "${principal.node_id}"), + )], + ) + .with_display_name("Storage Agent") + .with_description("Service role for storage agents - can manage volumes on their node") + } + + /// Get all builtin roles + pub fn all() -> Vec { + vec![ + system_admin(), + org_admin(), + project_admin(), + project_member(), + read_only(), + compute_agent(), + storage_agent(), + ] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_role_creation() { + let role = Role::new( + "CustomRole", + 
Scope::project("my-project", "my-org"), + vec![Permission::new( + "compute:instances:create", + "project/my-project/instances/*", + )], + ); + + assert_eq!(role.name, "CustomRole"); + assert!(!role.builtin); + assert_eq!(role.permissions.len(), 1); + } + + #[test] + fn test_role_ref() { + let role = Role::new("TestRole", Scope::System, vec![]); + assert_eq!(role.to_ref(), "roles/TestRole"); + } + + #[test] + fn test_permission_with_condition() { + let perm = + Permission::new("compute:instances:delete", "project/*/instances/*").with_condition( + Condition::string_equals("resource.owner", "${principal.id}"), + ); + + assert!(perm.condition.is_some()); + } + + #[test] + fn test_wildcard_permission() { + let perm = Permission::wildcard(); + assert!(perm.is_wildcard()); + + let perm = Permission::new("compute:*", "*"); + assert!(!perm.is_wildcard()); + } + + #[test] + fn test_builtin_roles() { + let roles = builtin::all(); + assert!(roles.len() >= 5); + + for role in &roles { + assert!(role.builtin); + } + + let admin = builtin::system_admin(); + assert!(admin.permissions[0].is_wildcard()); + } + + #[test] + fn test_role_serialization() { + let role = Role::new( + "TestRole", + Scope::project("proj-1", "org-1"), + vec![Permission::new("compute:instances:*", "project/proj-1/*")], + ) + .with_display_name("Test Role") + .with_description("A test role"); + + let json = serde_json::to_string(&role).unwrap(); + let parsed: Role = serde_json::from_str(&json).unwrap(); + + assert_eq!(parsed.name, "TestRole"); + assert_eq!(parsed.display_name, "Test Role"); + } +} diff --git a/iam/crates/iam-types/src/scope.rs b/iam/crates/iam-types/src/scope.rs new file mode 100644 index 0000000..390a07d --- /dev/null +++ b/iam/crates/iam-types/src/scope.rs @@ -0,0 +1,494 @@ +//! Scope types for IAM +//! +//! Scopes define the hierarchical boundary for permissions. +//! 
The hierarchy is: System > Org > Project > Resource + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// Scope represents the hierarchical boundary for permissions +/// +/// Hierarchy: System > Org > Project > Resource +/// +/// Each scope level includes context about its parent scopes, +/// enabling proper hierarchical permission evaluation. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum Scope { + /// System-wide scope (affects entire cluster) + System, + + /// Organization scope + Org { + /// Organization ID + id: String, + }, + + /// Project scope (belongs to an organization) + Project { + /// Project ID + id: String, + /// Parent organization ID + org_id: String, + }, + + /// Resource-specific scope (belongs to a project) + Resource { + /// Resource ID + id: String, + /// Parent project ID + project_id: String, + /// Parent organization ID + org_id: String, + }, +} + +impl Scope { + /// Create a system scope + pub fn system() -> Self { + Scope::System + } + + /// Create an organization scope + pub fn org(id: impl Into) -> Self { + Scope::Org { id: id.into() } + } + + /// Create a project scope + pub fn project(id: impl Into, org_id: impl Into) -> Self { + Scope::Project { + id: id.into(), + org_id: org_id.into(), + } + } + + /// Create a resource scope + pub fn resource( + id: impl Into, + project_id: impl Into, + org_id: impl Into, + ) -> Self { + Scope::Resource { + id: id.into(), + project_id: project_id.into(), + org_id: org_id.into(), + } + } + + /// Check if this scope contains another scope + /// + /// A scope contains another if: + /// - They are equal + /// - This scope is a parent of the other scope in the hierarchy + /// + /// Examples: + /// - System contains everything + /// - Org("acme") contains Project { org_id: "acme", ... } + /// - Project { id: "proj1", org_id: "acme" } contains Resource { project_id: "proj1", org_id: "acme", ... 
} + pub fn contains(&self, other: &Scope) -> bool { + match (self, other) { + // System contains everything + (Scope::System, _) => true, + + // Org contains itself + (Scope::Org { id: self_id }, Scope::Org { id: other_id }) => self_id == other_id, + // Org contains projects within it + (Scope::Org { id: org_id }, Scope::Project { org_id: proj_org_id, .. }) => { + org_id == proj_org_id + } + // Org contains resources within its projects + (Scope::Org { id: org_id }, Scope::Resource { org_id: res_org_id, .. }) => { + org_id == res_org_id + } + (Scope::Org { .. }, Scope::System) => false, + + // Project contains itself + ( + Scope::Project { + id: self_id, + org_id: self_org_id, + }, + Scope::Project { + id: other_id, + org_id: other_org_id, + }, + ) => self_id == other_id && self_org_id == other_org_id, + // Project contains resources within it + ( + Scope::Project { + id: proj_id, + org_id: proj_org_id, + }, + Scope::Resource { + project_id: res_proj_id, + org_id: res_org_id, + .. + }, + ) => proj_id == res_proj_id && proj_org_id == res_org_id, + (Scope::Project { .. }, _) => false, + + // Resource only contains itself + ( + Scope::Resource { + id: self_id, + project_id: self_proj_id, + org_id: self_org_id, + }, + Scope::Resource { + id: other_id, + project_id: other_proj_id, + org_id: other_org_id, + }, + ) => { + self_id == other_id + && self_proj_id == other_proj_id + && self_org_id == other_org_id + } + (Scope::Resource { .. }, _) => false, + } + } + + /// Get the parent scope, if any + /// + /// - System has no parent + /// - Org's parent is System + /// - Project's parent is its Org + /// - Resource's parent is its Project + pub fn parent(&self) -> Option { + match self { + Scope::System => None, + Scope::Org { .. } => Some(Scope::System), + Scope::Project { org_id, .. } => Some(Scope::Org { id: org_id.clone() }), + Scope::Resource { + project_id, + org_id, + .. 
+ } => Some(Scope::Project { + id: project_id.clone(), + org_id: org_id.clone(), + }), + } + } + + /// Get all ancestor scopes, from immediate parent to System + /// + /// Does not include self. + pub fn ancestors(&self) -> Vec { + let mut result = Vec::new(); + let mut current = self.parent(); + while let Some(parent) = current { + let next_parent = parent.parent(); + result.push(parent); + current = next_parent; + } + result + } + + /// Get all ancestor scopes including self, from self to System + pub fn ancestors_including_self(&self) -> Vec { + let mut result = vec![self.clone()]; + result.extend(self.ancestors()); + result + } + + /// Get the scope level (lower = broader) + /// + /// - System: 0 + /// - Org: 1 + /// - Project: 2 + /// - Resource: 3 + pub fn level(&self) -> u8 { + match self { + Scope::System => 0, + Scope::Org { .. } => 1, + Scope::Project { .. } => 2, + Scope::Resource { .. } => 3, + } + } + + /// Get the organization ID if this scope is within an organization + pub fn org_id(&self) -> Option<&str> { + match self { + Scope::System => None, + Scope::Org { id } => Some(id), + Scope::Project { org_id, .. } => Some(org_id), + Scope::Resource { org_id, .. } => Some(org_id), + } + } + + /// Get the project ID if this scope is within a project + pub fn project_id(&self) -> Option<&str> { + match self { + Scope::System | Scope::Org { .. } => None, + Scope::Project { id, .. } => Some(id), + Scope::Resource { project_id, .. 
} => Some(project_id), + } + } + + /// Convert to a key string for storage + /// + /// Format: + /// - System: "system" + /// - Org: "org/{org_id}" + /// - Project: "org/{org_id}/project/{project_id}" + /// - Resource: "org/{org_id}/project/{project_id}/resource/{resource_id}" + pub fn to_key(&self) -> String { + match self { + Scope::System => "system".to_string(), + Scope::Org { id } => format!("org/{}", id), + Scope::Project { id, org_id } => format!("org/{}/project/{}", org_id, id), + Scope::Resource { + id, + project_id, + org_id, + } => format!("org/{}/project/{}/resource/{}", org_id, project_id, id), + } + } + + /// Parse from a key string + pub fn from_key(s: &str) -> Option { + if s == "system" { + return Some(Scope::System); + } + + let parts: Vec<&str> = s.split('/').collect(); + + match parts.as_slice() { + ["org", org_id] => Some(Scope::Org { + id: (*org_id).to_string(), + }), + ["org", org_id, "project", proj_id] => Some(Scope::Project { + id: (*proj_id).to_string(), + org_id: (*org_id).to_string(), + }), + ["org", org_id, "project", proj_id, "resource", res_id] => Some(Scope::Resource { + id: (*res_id).to_string(), + project_id: (*proj_id).to_string(), + org_id: (*org_id).to_string(), + }), + _ => None, + } + } +} + +impl Default for Scope { + fn default() -> Self { + Scope::System + } +} + +impl fmt::Display for Scope { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Scope::System => write!(f, "system"), + Scope::Org { id } => write!(f, "org:{}", id), + Scope::Project { id, org_id } => write!(f, "org:{}:project:{}", org_id, id), + Scope::Resource { + id, + project_id, + org_id, + } => write!(f, "org:{}:project:{}:resource:{}", org_id, project_id, id), + } + } +} + +impl std::str::FromStr for Scope { + type Err = String; + + fn from_str(s: &str) -> Result { + if s == "system" { + return Ok(Scope::System); + } + + let parts: Vec<&str> = s.split(':').collect(); + + match parts.as_slice() { + ["org", org_id] => 
Ok(Scope::Org { + id: (*org_id).to_string(), + }), + ["org", org_id, "project", proj_id] => Ok(Scope::Project { + id: (*proj_id).to_string(), + org_id: (*org_id).to_string(), + }), + ["org", org_id, "project", proj_id, "resource", res_id] => Ok(Scope::Resource { + id: (*res_id).to_string(), + project_id: (*proj_id).to_string(), + org_id: (*org_id).to_string(), + }), + _ => Err(format!("invalid scope format: {}", s)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_scope_constructors() { + let sys = Scope::system(); + assert_eq!(sys, Scope::System); + + let org = Scope::org("acme"); + assert_eq!(org, Scope::Org { id: "acme".into() }); + + let proj = Scope::project("proj1", "acme"); + assert_eq!( + proj, + Scope::Project { + id: "proj1".into(), + org_id: "acme".into() + } + ); + + let res = Scope::resource("res1", "proj1", "acme"); + assert_eq!( + res, + Scope::Resource { + id: "res1".into(), + project_id: "proj1".into(), + org_id: "acme".into() + } + ); + } + + #[test] + fn test_scope_contains() { + let system = Scope::system(); + let org = Scope::org("acme"); + let proj = Scope::project("proj1", "acme"); + let res = Scope::resource("res1", "proj1", "acme"); + + // System contains everything + assert!(system.contains(&system)); + assert!(system.contains(&org)); + assert!(system.contains(&proj)); + assert!(system.contains(&res)); + + // Org contains itself and its children + assert!(org.contains(&org)); + assert!(org.contains(&proj)); + assert!(org.contains(&res)); + assert!(!org.contains(&system)); + assert!(!org.contains(&Scope::org("other"))); + assert!(!org.contains(&Scope::project("proj1", "other"))); + + // Project contains itself and its resources + assert!(proj.contains(&proj)); + assert!(proj.contains(&res)); + assert!(!proj.contains(&system)); + assert!(!proj.contains(&org)); + assert!(!proj.contains(&Scope::resource("res1", "proj2", "acme"))); + + // Resource only contains itself + assert!(res.contains(&res)); + 
assert!(!res.contains(&system)); + assert!(!res.contains(&org)); + assert!(!res.contains(&proj)); + } + + #[test] + fn test_scope_parent() { + let system = Scope::system(); + let org = Scope::org("acme"); + let proj = Scope::project("proj1", "acme"); + let res = Scope::resource("res1", "proj1", "acme"); + + assert_eq!(system.parent(), None); + assert_eq!(org.parent(), Some(system.clone())); + assert_eq!(proj.parent(), Some(org.clone())); + assert_eq!(res.parent(), Some(proj.clone())); + } + + #[test] + fn test_scope_ancestors() { + let res = Scope::resource("res1", "proj1", "acme"); + let ancestors = res.ancestors(); + + assert_eq!(ancestors.len(), 3); + assert_eq!(ancestors[0], Scope::project("proj1", "acme")); + assert_eq!(ancestors[1], Scope::org("acme")); + assert_eq!(ancestors[2], Scope::system()); + } + + #[test] + fn test_scope_parse() { + let scope: Scope = "system".parse().unwrap(); + assert_eq!(scope, Scope::System); + + let scope: Scope = "org:acme".parse().unwrap(); + assert_eq!(scope, Scope::org("acme")); + + let scope: Scope = "org:acme:project:proj1".parse().unwrap(); + assert_eq!(scope, Scope::project("proj1", "acme")); + + let scope: Scope = "org:acme:project:proj1:resource:res1".parse().unwrap(); + assert_eq!(scope, Scope::resource("res1", "proj1", "acme")); + } + + #[test] + fn test_scope_key() { + assert_eq!(Scope::System.to_key(), "system"); + assert_eq!(Scope::org("acme").to_key(), "org/acme"); + assert_eq!( + Scope::project("proj1", "acme").to_key(), + "org/acme/project/proj1" + ); + assert_eq!( + Scope::resource("res1", "proj1", "acme").to_key(), + "org/acme/project/proj1/resource/res1" + ); + + assert_eq!(Scope::from_key("system"), Some(Scope::System)); + assert_eq!(Scope::from_key("org/acme"), Some(Scope::org("acme"))); + assert_eq!( + Scope::from_key("org/acme/project/proj1"), + Some(Scope::project("proj1", "acme")) + ); + assert_eq!( + Scope::from_key("org/acme/project/proj1/resource/res1"), + Some(Scope::resource("res1", "proj1", 
"acme")) + ); + } + + #[test] + fn test_scope_display() { + assert_eq!(Scope::System.to_string(), "system"); + assert_eq!(Scope::org("acme").to_string(), "org:acme"); + assert_eq!( + Scope::project("proj1", "acme").to_string(), + "org:acme:project:proj1" + ); + assert_eq!( + Scope::resource("res1", "proj1", "acme").to_string(), + "org:acme:project:proj1:resource:res1" + ); + } + + #[test] + fn test_scope_level() { + assert_eq!(Scope::system().level(), 0); + assert_eq!(Scope::org("acme").level(), 1); + assert_eq!(Scope::project("p", "o").level(), 2); + assert_eq!(Scope::resource("r", "p", "o").level(), 3); + } + + #[test] + fn test_scope_org_id() { + assert_eq!(Scope::system().org_id(), None); + assert_eq!(Scope::org("acme").org_id(), Some("acme")); + assert_eq!(Scope::project("p", "acme").org_id(), Some("acme")); + assert_eq!(Scope::resource("r", "p", "acme").org_id(), Some("acme")); + } + + #[test] + fn test_scope_project_id() { + assert_eq!(Scope::system().project_id(), None); + assert_eq!(Scope::org("acme").project_id(), None); + assert_eq!(Scope::project("proj1", "acme").project_id(), Some("proj1")); + assert_eq!( + Scope::resource("r", "proj1", "acme").project_id(), + Some("proj1") + ); + } +} diff --git a/iam/crates/iam-types/src/token.rs b/iam/crates/iam-types/src/token.rs new file mode 100644 index 0000000..f0b89a9 --- /dev/null +++ b/iam/crates/iam-types/src/token.rs @@ -0,0 +1,506 @@ +//! Token types for IAM +//! +//! Defines token claims for both external (JWT/OIDC) and internal tokens. 
+ +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +use crate::principal::PrincipalKind; +use crate::scope::Scope; + +/// Claims from an external JWT/OIDC token +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JwtClaims { + /// Subject (user identifier from IdP) + pub sub: String, + + /// Issuer (IdP URL) + pub iss: String, + + /// Audience (our service) + pub aud: Vec, + + /// Expiration timestamp (Unix seconds) + pub exp: u64, + + /// Issued at timestamp (Unix seconds) + pub iat: u64, + + /// Not before timestamp (Unix seconds) + #[serde(skip_serializing_if = "Option::is_none")] + pub nbf: Option, + + /// JWT ID (unique identifier) + #[serde(skip_serializing_if = "Option::is_none")] + pub jti: Option, + + /// User's email + #[serde(skip_serializing_if = "Option::is_none")] + pub email: Option, + + /// Email verified flag + #[serde(skip_serializing_if = "Option::is_none")] + pub email_verified: Option, + + /// User's name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + + /// Groups the user belongs to + #[serde(default)] + pub groups: Vec, + + /// Organization ID (custom claim) + #[serde(skip_serializing_if = "Option::is_none")] + pub org_id: Option, + + /// Project ID (custom claim) + #[serde(skip_serializing_if = "Option::is_none")] + pub project_id: Option, + + /// Additional claims + #[serde(flatten)] + pub extra: HashMap, +} + +impl JwtClaims { + /// Check if the token is expired + pub fn is_expired(&self, now_secs: u64) -> bool { + now_secs > self.exp + } + + /// Check if the token is not yet valid + pub fn is_not_yet_valid(&self, now_secs: u64) -> bool { + self.nbf.map_or(false, |nbf| now_secs < nbf) + } + + /// Validate basic claims + pub fn validate( + &self, + expected_issuer: &str, + expected_audience: &str, + now_secs: u64, + ) -> Result<(), TokenValidationError> { + if self.iss != expected_issuer { + return Err(TokenValidationError::InvalidIssuer { + expected: expected_issuer.to_string(), + 
actual: self.iss.clone(), + }); + } + + if !self.aud.iter().any(|a| a == expected_audience) { + return Err(TokenValidationError::InvalidAudience { + expected: expected_audience.to_string(), + actual: self.aud.clone(), + }); + } + + if self.is_expired(now_secs) { + return Err(TokenValidationError::Expired); + } + + if self.is_not_yet_valid(now_secs) { + return Err(TokenValidationError::NotYetValid); + } + + Ok(()) + } +} + +/// Token validation errors +#[derive(Debug, Clone)] +pub enum TokenValidationError { + /// Token issuer doesn't match + InvalidIssuer { expected: String, actual: String }, + /// Token audience doesn't match + InvalidAudience { + expected: String, + actual: Vec, + }, + /// Token is expired + Expired, + /// Token is not yet valid + NotYetValid, + /// Invalid signature + InvalidSignature, + /// Malformed token + Malformed(String), +} + +impl std::fmt::Display for TokenValidationError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::InvalidIssuer { expected, actual } => { + write!(f, "invalid issuer: expected {}, got {}", expected, actual) + } + Self::InvalidAudience { expected, actual } => { + write!( + f, + "invalid audience: expected {}, got {:?}", + expected, actual + ) + } + Self::Expired => write!(f, "token expired"), + Self::NotYetValid => write!(f, "token not yet valid"), + Self::InvalidSignature => write!(f, "invalid signature"), + Self::Malformed(msg) => write!(f, "malformed token: {}", msg), + } + } +} + +impl std::error::Error for TokenValidationError {} + +/// Internal token claims (issued by IAM service) +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InternalTokenClaims { + /// Principal ID + pub principal_id: String, + + /// Principal kind + pub principal_kind: PrincipalKind, + + /// Display name + pub principal_name: String, + + /// Roles assigned to this principal + pub roles: Vec, + + /// Effective scope for this token + pub scope: Scope, + + /// Organization ID (if 
applicable) + #[serde(skip_serializing_if = "Option::is_none")] + pub org_id: Option, + + /// Project ID (if applicable) + #[serde(skip_serializing_if = "Option::is_none")] + pub project_id: Option, + + /// Node ID (for service accounts bound to nodes) + #[serde(skip_serializing_if = "Option::is_none")] + pub node_id: Option, + + /// Issued at timestamp (TSO format) + pub iat: u64, + + /// Expiration timestamp (TSO format) + pub exp: u64, + + /// Unique session/token ID + pub session_id: String, + + /// Original authentication method + pub auth_method: AuthMethod, + + /// Additional metadata + #[serde(default)] + pub metadata: HashMap, +} + +impl InternalTokenClaims { + /// Create new internal token claims + pub fn new( + principal_id: impl Into, + principal_kind: PrincipalKind, + principal_name: impl Into, + scope: Scope, + session_id: impl Into, + ) -> Self { + Self { + principal_id: principal_id.into(), + principal_kind, + principal_name: principal_name.into(), + roles: Vec::new(), + scope, + org_id: None, + project_id: None, + node_id: None, + iat: 0, + exp: 0, + session_id: session_id.into(), + auth_method: AuthMethod::Unknown, + metadata: HashMap::new(), + } + } + + /// Set roles + pub fn with_roles(mut self, roles: Vec) -> Self { + self.roles = roles; + self + } + + /// Set organization ID + pub fn with_org_id(mut self, org_id: impl Into) -> Self { + self.org_id = Some(org_id.into()); + self + } + + /// Set project ID + pub fn with_project_id(mut self, project_id: impl Into) -> Self { + self.project_id = Some(project_id.into()); + self + } + + /// Set node ID + pub fn with_node_id(mut self, node_id: impl Into) -> Self { + self.node_id = Some(node_id.into()); + self + } + + /// Set timestamps + pub fn with_timestamps(mut self, iat: u64, exp: u64) -> Self { + self.iat = iat; + self.exp = exp; + self + } + + /// Set auth method + pub fn with_auth_method(mut self, method: AuthMethod) -> Self { + self.auth_method = method; + self + } + + /// Check if the token is 
expired (TSO timestamp comparison) + pub fn is_expired(&self, now_tso: u64) -> bool { + now_tso > self.exp + } + + /// Get remaining validity in TSO units + pub fn remaining_validity(&self, now_tso: u64) -> Option { + if now_tso >= self.exp { + None + } else { + Some(self.exp - now_tso) + } + } +} + +/// Authentication method used +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum AuthMethod { + /// JWT/OIDC authentication + Jwt, + /// mTLS certificate authentication + Mtls, + /// API key authentication + ApiKey, + /// Internal service-to-service + Internal, + /// Unknown/unspecified + Unknown, +} + +impl std::fmt::Display for AuthMethod { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AuthMethod::Jwt => write!(f, "jwt"), + AuthMethod::Mtls => write!(f, "mtls"), + AuthMethod::ApiKey => write!(f, "api_key"), + AuthMethod::Internal => write!(f, "internal"), + AuthMethod::Unknown => write!(f, "unknown"), + } + } +} + +/// Token type enumeration +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TokenType { + /// Access token for API calls + Access, + /// Refresh token for obtaining new access tokens + Refresh, + /// ID token containing user identity info + Id, +} + +/// Token metadata for storage/tracking +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TokenMetadata { + /// Token ID (hash of the token) + pub token_id: String, + + /// Principal who owns this token + pub principal_id: String, + + /// Token type + pub token_type: TokenType, + + /// Issued at (TSO) + pub issued_at: u64, + + /// Expires at (TSO) + pub expires_at: u64, + + /// Whether the token has been revoked + pub revoked: bool, + + /// Revocation timestamp (TSO) + #[serde(skip_serializing_if = "Option::is_none")] + pub revoked_at: Option, + + /// Revocation reason + #[serde(skip_serializing_if = "Option::is_none")] + pub 
revocation_reason: Option, + + /// Source IP that requested the token + #[serde(skip_serializing_if = "Option::is_none")] + pub source_ip: Option, + + /// User agent that requested the token + #[serde(skip_serializing_if = "Option::is_none")] + pub user_agent: Option, +} + +impl TokenMetadata { + /// Create new token metadata + pub fn new( + token_id: impl Into, + principal_id: impl Into, + token_type: TokenType, + issued_at: u64, + expires_at: u64, + ) -> Self { + Self { + token_id: token_id.into(), + principal_id: principal_id.into(), + token_type, + issued_at, + expires_at, + revoked: false, + revoked_at: None, + revocation_reason: None, + source_ip: None, + user_agent: None, + } + } + + /// Mark the token as revoked + pub fn revoke(&mut self, now_tso: u64, reason: impl Into) { + self.revoked = true; + self.revoked_at = Some(now_tso); + self.revocation_reason = Some(reason.into()); + } + + /// Check if the token is valid (not expired and not revoked) + pub fn is_valid(&self, now_tso: u64) -> bool { + !self.revoked && now_tso <= self.expires_at + } + + /// Storage key for token metadata + pub fn storage_key(&self) -> String { + format!("iam/tokens/{}/{}", self.principal_id, self.token_id) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_jwt_claims_validation() { + let claims = JwtClaims { + sub: "user123".into(), + iss: "https://auth.example.com".into(), + aud: vec!["my-service".into()], + exp: 2000, + iat: 1000, + nbf: Some(1000), + jti: None, + email: Some("user@example.com".into()), + email_verified: Some(true), + name: Some("Test User".into()), + groups: vec!["developers".into()], + org_id: Some("org-1".into()), + project_id: None, + extra: HashMap::new(), + }; + + // Valid + assert!(claims + .validate("https://auth.example.com", "my-service", 1500) + .is_ok()); + + // Expired + assert!(matches!( + claims.validate("https://auth.example.com", "my-service", 2500), + Err(TokenValidationError::Expired) + )); + + // Wrong issuer + 
assert!(matches!( + claims.validate("https://other.com", "my-service", 1500), + Err(TokenValidationError::InvalidIssuer { .. }) + )); + + // Wrong audience + assert!(matches!( + claims.validate("https://auth.example.com", "other-service", 1500), + Err(TokenValidationError::InvalidAudience { .. }) + )); + } + + #[test] + fn test_internal_token_claims() { + let claims = InternalTokenClaims::new( + "user-123", + PrincipalKind::User, + "Alice", + Scope::project("proj-1", "org-1"), + "session-abc", + ) + .with_roles(vec!["roles/ProjectAdmin".into()]) + .with_org_id("org-1") + .with_timestamps(1000, 2000) + .with_auth_method(AuthMethod::Jwt); + + assert_eq!(claims.principal_id, "user-123"); + assert_eq!(claims.roles.len(), 1); + assert!(!claims.is_expired(1500)); + assert!(claims.is_expired(2500)); + assert_eq!(claims.remaining_validity(1500), Some(500)); + assert_eq!(claims.remaining_validity(2500), None); + } + + #[test] + fn test_token_metadata() { + let mut meta = + TokenMetadata::new("token-hash-123", "user-456", TokenType::Access, 1000, 2000); + + assert!(meta.is_valid(1500)); + assert!(!meta.is_valid(2500)); // expired + + meta.revoke(1600, "user logout"); + assert!(!meta.is_valid(1700)); // revoked + assert!(meta.revoked); + assert_eq!(meta.revoked_at, Some(1600)); + } + + #[test] + fn test_auth_method_display() { + assert_eq!(AuthMethod::Jwt.to_string(), "jwt"); + assert_eq!(AuthMethod::Mtls.to_string(), "mtls"); + assert_eq!(AuthMethod::ApiKey.to_string(), "api_key"); + } + + #[test] + fn test_token_serialization() { + let claims = InternalTokenClaims::new( + "sa-compute", + PrincipalKind::ServiceAccount, + "Compute Agent", + Scope::System, + "sess-1", + ) + .with_node_id("node-001") + .with_auth_method(AuthMethod::Mtls); + + let json = serde_json::to_string(&claims).unwrap(); + let parsed: InternalTokenClaims = serde_json::from_str(&json).unwrap(); + + assert_eq!(parsed.principal_id, "sa-compute"); + assert_eq!(parsed.node_id, Some("node-001".into())); + 
assert_eq!(parsed.auth_method, AuthMethod::Mtls); + } +} diff --git a/iam/flake.lock b/iam/flake.lock new file mode 100644 index 0000000..e61740f --- /dev/null +++ b/iam/flake.lock @@ -0,0 +1,96 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1764947035, + "narHash": "sha256-EYHSjVM4Ox4lvCXUMiKKs2vETUSL5mx+J2FfutM7T9w=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "a672be65651c80d3f592a89b3945466584a22069", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_2": { + "locked": { + "lastModified": 1744536153, + "narHash": "sha256-awS2zRgF4uTwrOKwwiJcByDzDOdo3Q1rPZbiHQg/N38=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "18dd725c29603f582cf1900e0d25f9f1063dbf11", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay" + } + }, + "rust-overlay": { + "inputs": { + "nixpkgs": "nixpkgs_2" + }, + "locked": { + "lastModified": 1764988672, + "narHash": "sha256-FIJtt3Zil89/hLy9i7f0R2xXcJDPc3CeqiiCLfsFV0Y=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "086fd19a68e80fcc8a298e9df4674982e4c498a6", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + 
"repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/iam/flake.nix b/iam/flake.nix new file mode 100644 index 0000000..1dfc340 --- /dev/null +++ b/iam/flake.nix @@ -0,0 +1,43 @@ +{ + description = "IAM dev shell aligned with Chainfire/FlareDB toolchain"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + rust-overlay.url = "github:oxalica/rust-overlay"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, rust-overlay, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + overlays = [ (import rust-overlay) ]; + pkgs = import nixpkgs { inherit system overlays; }; + + rustToolchain = pkgs.rust-bin.stable.latest.default.override { + extensions = [ "rust-src" "rust-analyzer" ]; + }; + in { + devShells.default = pkgs.mkShell { + buildInputs = with pkgs; [ + rustToolchain + pkg-config + openssl + protobuf + rocksdb + clang + llvmPackages.libclang + cmake + ]; + + shellHook = '' + export LIBCLANG_PATH="${pkgs.llvmPackages.libclang.lib}/lib" + export PROTOC="${pkgs.protobuf}/bin/protoc" + export ROCKSDB_LIB_DIR="${pkgs.rocksdb}/lib" + export ROCKSDB_INCLUDE_DIR="${pkgs.rocksdb}/include" + echo "IAM development environment loaded!" 
+ echo "Rust: $(rustc --version)" + echo "Protoc: $(protoc --version)" + ''; + }; + }); +} diff --git a/iam/proto/iam.proto b/iam/proto/iam.proto new file mode 100644 index 0000000..deb7e74 --- /dev/null +++ b/iam/proto/iam.proto @@ -0,0 +1,622 @@ +syntax = "proto3"; + +package iam.v1; + +option go_package = "github.com/cloud/iam/proto/iam/v1;iamv1"; + +// ============================================================================ +// Authorization Service +// ============================================================================ + +// IamAuthz provides authorization decision (PDP - Policy Decision Point) +service IamAuthz { + // Authorize checks if a principal is allowed to perform an action on a resource + rpc Authorize(AuthorizeRequest) returns (AuthorizeResponse); + + // BatchAuthorize checks multiple authorization requests at once + rpc BatchAuthorize(BatchAuthorizeRequest) returns (BatchAuthorizeResponse); +} + +message AuthorizeRequest { + // Principal making the request + PrincipalRef principal = 1; + + // Action being performed (e.g., "compute:instances:create") + string action = 2; + + // Resource being accessed + ResourceRef resource = 3; + + // Request context (source IP, metadata, etc.) 
// NOTE(review): this chunk opens inside a message whose header lies above this
// view; the two lines below are the verbatim tail of that enclosing message.
  AuthzContext context = 4;
}

message AuthorizeResponse {
  // Whether the action is allowed.
  bool allowed = 1;

  // Reason for denial (if not allowed).
  string reason = 2;

  // Matched binding ID (for auditing).
  string matched_binding = 3;

  // Matched role (for auditing).
  string matched_role = 4;
}

// Batched form of Authorize; responses are positional with requests.
message BatchAuthorizeRequest {
  repeated AuthorizeRequest requests = 1;
}

message BatchAuthorizeResponse {
  repeated AuthorizeResponse responses = 1;
}

// Request-time attributes evaluated against binding conditions.
message AuthzContext {
  // Source IP address.
  string source_ip = 1;

  // Request timestamp (Unix seconds).
  uint64 timestamp = 2;

  // HTTP method (if applicable).
  string http_method = 3;

  // Request path (if applicable).
  string request_path = 4;

  // Additional metadata.
  // NOTE(review): the map type parameters were missing in the source; assuming
  // <string, string> here and for every map below — confirm against codegen.
  map<string, string> metadata = 5;
}

// ============================================================================
// Token Service
// ============================================================================

// IamToken provides token issuance and validation.
service IamToken {
  // IssueToken creates a new internal token.
  rpc IssueToken(IssueTokenRequest) returns (IssueTokenResponse);

  // ValidateToken validates a token and returns its claims.
  rpc ValidateToken(ValidateTokenRequest) returns (ValidateTokenResponse);

  // RevokeToken revokes a token.
  rpc RevokeToken(RevokeTokenRequest) returns (RevokeTokenResponse);

  // RefreshToken exchanges a token for a new one.
  rpc RefreshToken(RefreshTokenRequest) returns (RefreshTokenResponse);
}

message IssueTokenRequest {
  // Principal to issue token for.
  string principal_id = 1;

  // Principal kind.
  PrincipalKind principal_kind = 2;

  // Roles to include in the token.
  repeated string roles = 3;

  // Scope for the token.
  Scope scope = 4;

  // Token TTL in seconds.
  uint64 ttl_seconds = 5;
}

message IssueTokenResponse {
  // The issued token.
  string token = 1;

  // Expiration timestamp (Unix seconds).
  uint64 expires_at = 2;

  // Session ID.
  string session_id = 3;
}

message ValidateTokenRequest {
  // Token to validate.
  string token = 1;
}

message ValidateTokenResponse {
  // Whether the token is valid.
  bool valid = 1;

  // Token claims (if valid).
  InternalTokenClaims claims = 2;

  // Reason for invalidity (if not valid).
  string reason = 3;
}

message RevokeTokenRequest {
  // Token to revoke (or session_id).
  string token = 1;

  // Revocation reason.
  string reason = 2;
}

message RevokeTokenResponse {
  // Whether revocation was successful.
  bool success = 1;
}

message RefreshTokenRequest {
  // Current token.
  string token = 1;

  // New TTL (optional, defaults to original TTL).
  uint64 ttl_seconds = 2;
}

message RefreshTokenResponse {
  // New token.
  string token = 1;

  // Expiration timestamp.
  uint64 expires_at = 2;
}

// Claims carried inside an internal token (JWT-style naming for iat/exp).
message InternalTokenClaims {
  // Principal identity.
  string principal_id = 1;
  PrincipalKind principal_kind = 2;
  string principal_name = 3;

  // Roles granted to this session.
  repeated string roles = 4;

  // Scope the token is valid in.
  Scope scope = 5;

  // Optional scoping identifiers.
  optional string org_id = 6;
  optional string project_id = 7;
  optional string node_id = 8;

  // Issued-at and expiry (Unix seconds).
  uint64 iat = 9;
  uint64 exp = 10;

  // Session this token belongs to.
  string session_id = 11;

  // How the principal authenticated (e.g. which auth flow issued the token).
  string auth_method = 12;
}

// ============================================================================
// Admin Service
// ============================================================================

// IamAdmin provides administrative operations.
service IamAdmin {
  // Principal management.
  rpc CreatePrincipal(CreatePrincipalRequest) returns (Principal);
  rpc GetPrincipal(GetPrincipalRequest) returns (Principal);
  rpc UpdatePrincipal(UpdatePrincipalRequest) returns (Principal);
  rpc DeletePrincipal(DeletePrincipalRequest) returns (DeletePrincipalResponse);
  rpc ListPrincipals(ListPrincipalsRequest) returns (ListPrincipalsResponse);

  // Role management.
  rpc CreateRole(CreateRoleRequest) returns (Role);
  rpc GetRole(GetRoleRequest) returns (Role);
  rpc UpdateRole(UpdateRoleRequest) returns (Role);
  rpc DeleteRole(DeleteRoleRequest) returns (DeleteRoleResponse);
  rpc ListRoles(ListRolesRequest) returns (ListRolesResponse);

  // Binding management.
  rpc CreateBinding(CreateBindingRequest) returns (PolicyBinding);
  rpc GetBinding(GetBindingRequest) returns (PolicyBinding);
  rpc UpdateBinding(UpdateBindingRequest) returns (PolicyBinding);
  rpc DeleteBinding(DeleteBindingRequest) returns (DeleteBindingResponse);
  rpc ListBindings(ListBindingsRequest) returns (ListBindingsResponse);
}

// ----------------------------------------------------------------------------
// Principal Messages
// ----------------------------------------------------------------------------

message CreatePrincipalRequest {
  // Principal ID (unique identifier).
  string id = 1;

  // Principal kind.
  PrincipalKind kind = 2;

  // Display name.
  string name = 3;

  // Organization ID (optional).
  optional string org_id = 4;

  // Project ID (for service accounts).
  optional string project_id = 5;

  // Email (for users).
  optional string email = 6;

  // Metadata.
  map<string, string> metadata = 7;
}

message GetPrincipalRequest {
  PrincipalRef principal = 1;
}

message UpdatePrincipalRequest {
  // Principal to update.
  PrincipalRef principal = 1;

  // Fields to update.
  optional string name = 2;
  optional string email = 3;
  map<string, string> metadata = 4;
  optional bool enabled = 5;
}

message DeletePrincipalRequest {
  PrincipalRef principal = 1;
}

message DeletePrincipalResponse {
  bool deleted = 1;
}

message ListPrincipalsRequest {
  // Filter by kind.
  optional PrincipalKind kind = 1;

  // Filter by org.
  optional string org_id = 2;

  // Filter by project.
  optional string project_id = 3;

  // Pagination.
  int32 page_size = 4;
  string page_token = 5;
}

message ListPrincipalsResponse {
  repeated Principal principals = 1;
  string next_page_token = 2;
}

// ----------------------------------------------------------------------------
// Role Messages
// ----------------------------------------------------------------------------

message CreateRoleRequest {
  // Role name (unique identifier).
  string name = 1;

  // Display name.
  string display_name = 2;

  // Description.
  string description = 3;

  // Scope where this role can be applied.
  Scope scope = 4;

  // Permissions granted by this role.
  repeated Permission permissions = 5;
}

message GetRoleRequest {
  string name = 1;
}

message UpdateRoleRequest {
  // Role to update.
  string name = 1;

  // Fields to update.
  optional string display_name = 2;
  optional string description = 3;
  repeated Permission permissions = 4;
}

message DeleteRoleRequest {
  string name = 1;
}

message DeleteRoleResponse {
  bool deleted = 1;
}

message ListRolesRequest {
  // Filter by scope.
  optional Scope scope = 1;

  // Include builtin roles.
  bool include_builtin = 2;

  // Pagination.
  int32 page_size = 3;
  string page_token = 4;
}

message ListRolesResponse {
  repeated Role roles = 1;
  string next_page_token = 2;
}

// ----------------------------------------------------------------------------
// Binding Messages
// ----------------------------------------------------------------------------

message CreateBindingRequest {
  // Principal to bind.
  PrincipalRef principal = 1;

  // Role to assign (e.g., "roles/ProjectAdmin").
  string role = 2;

  // Scope for the binding.
  Scope scope = 3;

  // Optional condition.
  optional Condition condition = 4;

  // Expiration (optional, Unix seconds).
  optional uint64 expires_at = 5;
}

message GetBindingRequest {
  string id = 1;
}

message UpdateBindingRequest {
  // Binding to update.
  string id = 1;

  // Fields to update.
  optional Condition condition = 2;
  optional uint64 expires_at = 3;
  optional bool enabled = 4;
}

message DeleteBindingRequest {
  string id = 1;
}

message DeleteBindingResponse {
  bool deleted = 1;
}

message ListBindingsRequest {
  // Filter by principal.
  optional PrincipalRef principal = 1;

  // Filter by role.
  optional string role = 2;

  // Filter by scope.
  optional Scope scope = 3;

  // Include disabled bindings.
  bool include_disabled = 4;

  // Pagination.
  int32 page_size = 5;
  string page_token = 6;
}

message ListBindingsResponse {
  repeated PolicyBinding bindings = 1;
  string next_page_token = 2;
}

// ============================================================================
// Common Types
// ============================================================================

enum PrincipalKind {
  PRINCIPAL_KIND_UNSPECIFIED = 0;
  PRINCIPAL_KIND_USER = 1;
  PRINCIPAL_KIND_SERVICE_ACCOUNT = 2;
  PRINCIPAL_KIND_GROUP = 3;
}

// Lightweight reference to a principal by kind and ID.
message PrincipalRef {
  PrincipalKind kind = 1;
  string id = 2;
}

// Full principal record.
message Principal {
  string id = 1;
  PrincipalKind kind = 2;
  string name = 3;
  optional string org_id = 4;
  optional string project_id = 5;
  optional string email = 6;
  // OIDC subject claim (for federated users).
  optional string oidc_sub = 7;
  optional string node_id = 8;
  map<string, string> metadata = 9;
  // Unix-seconds timestamps.
  uint64 created_at = 10;
  uint64 updated_at = 11;
  bool enabled = 12;
}

// Reference to a concrete resource being accessed.
message ResourceRef {
  // Resource kind (e.g., "instance", "volume").
  string kind = 1;

  // Resource ID.
  string id = 2;

  // Organization ID.
  string org_id = 3;

  // Project ID.
  string project_id = 4;

  // Owner ID (optional).
  optional string owner_id = 5;

  // Node ID (optional).
  optional string node_id = 6;

  // Region (optional).
  optional string region = 7;

  // Tags.
  map<string, string> tags = 8;
}

// Hierarchical scope: system > org > project > resource.
message Scope {
  oneof scope {
    bool system = 1;
    OrgScope org = 2;
    ProjectScope project = 3;
    ResourceScope resource = 4;
  }
}

// Organization scope.
message OrgScope {
  string id = 1;
}

// Project scope with parent org context.
message ProjectScope {
  string id = 1;
  string org_id = 2;
}

// Resource scope with parent project and org context.
message ResourceScope {
  string id = 1;
  string project_id = 2;
  string org_id = 3;
}

// A named set of permissions assignable via a PolicyBinding.
message Role {
  string name = 1;
  string display_name = 2;
  string description = 3;
  Scope scope = 4;
  repeated Permission permissions = 5;
  // True for roles shipped with the system (not user-created).
  bool builtin = 6;
  // Unix-seconds timestamps.
  uint64 created_at = 7;
  uint64 updated_at = 8;
}

message Permission {
  // Action pattern (e.g., "compute:instances:create").
  string action = 1;

  // Resource pattern (e.g., "project/*/instances/*").
  string resource_pattern = 2;

  // Optional condition.
  optional Condition condition = 3;
}

// Assignment of a role to a principal at a scope, optionally conditioned.
message PolicyBinding {
  string id = 1;
  PrincipalRef principal = 2;
  string role = 3;
  Scope scope = 4;
  optional Condition condition = 5;
  // Unix-seconds timestamps.
  uint64 created_at = 6;
  uint64 updated_at = 7;
  string created_by = 8;
  optional uint64 expires_at = 9;
  bool enabled = 10;
}

// Wrapper around the root of a condition expression tree.
message Condition {
  ConditionExpr expression = 1;
}

// One node of a condition expression tree. AndExpr/OrExpr/NotExpr recurse
// into ConditionExpr; evaluators should cap recursion depth defensively.
message ConditionExpr {
  oneof expr {
    StringEqualsExpr string_equals = 1;
    StringNotEqualsExpr string_not_equals = 2;
    StringLikeExpr string_like = 3;
    StringNotLikeExpr string_not_like = 4;
    NumericEqualsExpr numeric_equals = 5;
    NumericLessThanExpr numeric_less_than = 6;
    NumericGreaterThanExpr numeric_greater_than = 7;
    IpAddressExpr ip_address = 8;
    NotIpAddressExpr not_ip_address = 9;
    TimeBetweenExpr time_between = 10;
    ExistsExpr exists = 11;
    StringEqualsAnyExpr string_equals_any = 12;
    BoolExpr bool_expr = 13;
    AndExpr and_expr = 14;
    OrExpr or_expr = 15;
    NotExpr not_expr = 16;
  }
}

// Leaf comparisons: each tests the context value at `key`.

message StringEqualsExpr {
  string key = 1;
  string value = 2;
}

message StringNotEqualsExpr {
  string key = 1;
  string value = 2;
}

message StringLikeExpr {
  string key = 1;
  string pattern = 2;
}

message StringNotLikeExpr {
  string key = 1;
  string pattern = 2;
}

message NumericEqualsExpr {
  string key = 1;
  int64 value = 2;
}

message NumericLessThanExpr {
  string key = 1;
  int64 value = 2;
}

message NumericGreaterThanExpr {
  string key = 1;
  int64 value = 2;
}

message IpAddressExpr {
  string key = 1;
  string cidr = 2;
}

message NotIpAddressExpr {
  string key = 1;
  string cidr = 2;
}

message TimeBetweenExpr {
  // NOTE(review): string-typed bounds; format (clock time vs. timestamp) is
  // not visible here — confirm against the evaluator.
  string start = 1;
  string end = 2;
}

message ExistsExpr {
  string key = 1;
}

message StringEqualsAnyExpr {
  string key = 1;
  repeated string values = 2;
}

message BoolExpr {
  string key = 1;
  bool value = 2;
}

// Logical combinators over child expressions.

message AndExpr {
  repeated ConditionExpr expressions = 1;
}

message OrExpr {
  repeated ConditionExpr expressions = 1;
}

message NotExpr {
  ConditionExpr expression = 1;
}