photoncloud-monorepo/chainfire/crates/chainfire-raft/src/core.rs
centra d2149b6249 fix(lightningstor): Fix SigV4 canonicalization for AWS S3 auth
- Replace form_urlencoded with RFC 3986 compliant URI encoding
- Implement aws_uri_encode() matching AWS SigV4 spec exactly
- Unreserved chars (A-Z,a-z,0-9,-,_,.,~) not encoded
- All other chars percent-encoded with uppercase hex
- Preserve slashes in paths, encode in query params
- Normalize empty paths to '/' per AWS spec
- Fix test expectations (body hash, HMAC values)
- Add comprehensive SigV4 signature determinism test

This fixes the canonicalization mismatch that caused signature
validation failures in T047. Auth can now be enabled for production.

Refs: T058.S1
2025-12-12 06:23:46 +09:00

1454 lines
53 KiB
Rust

//! Custom Raft Consensus Implementation
//!
//! This module implements the Raft consensus algorithm from scratch,
//! replacing OpenRaft for ChainFire's single Raft group use case.
//!
//! Architecture:
//! - RaftCore: Main consensus state machine
//! - RaftState: Follower/Candidate/Leader role management
//! - RaftTimer: Election and heartbeat timeout management
//! - Integration with existing chainfire-storage and network layers
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{mpsc, oneshot, RwLock, Mutex};
use tokio::time;
use chainfire_storage::{LogStorage, StateMachine, LogEntry, EntryPayload, LogId};
use chainfire_types::command::RaftCommand;
use crate::network::RaftRpcClient;
use tracing::{debug, trace};
/// Unique identifier of a node in the Raft cluster.
pub type NodeId = u64;
/// Raft term number (monotonically increasing election epoch).
pub type Term = u64;
/// Index of an entry in the replicated log (1-based; 0 means "no entry yet").
pub type LogIndex = u64;
// ============================================================================
// Core Raft Types
// ============================================================================
/// Node role in the Raft cluster.
///
/// A node starts as `Follower`, becomes `Candidate` on election timeout,
/// and `Leader` after collecting a majority of votes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RaftRole {
    /// Passive: accepts entries from the leader and grants votes.
    Follower,
    /// Actively soliciting votes after an election timeout.
    Candidate,
    /// Accepts client writes and replicates the log to followers.
    Leader,
}
/// Persistent state (must be saved to stable storage before responding to RPCs).
///
/// Mirrored to disk via `RaftCore::persist_vote` as a `chainfire_storage::Vote`.
#[derive(Debug, Clone)]
pub struct PersistentState {
    /// Latest term server has seen (initialized to 0, increases monotonically)
    pub current_term: Term,
    /// Candidate that received vote in current term (or None)
    pub voted_for: Option<NodeId>,
}
/// Volatile state on all servers (not persisted; rebuilt after restart).
#[derive(Debug, Clone)]
pub struct VolatileState {
    /// Index of highest log entry known to be committed (starts at 0)
    pub commit_index: LogIndex,
    /// Index of highest log entry applied to state machine (always <= commit_index)
    pub last_applied: LogIndex,
    /// Current leader (None if unknown or while an election is in progress)
    pub current_leader: Option<NodeId>,
}
/// Volatile state on candidates (during election)
#[derive(Debug, Clone)]
pub struct CandidateState {
    /// Nodes that have granted votes (includes self; HashSet makes
    /// duplicate responses from the same peer harmless)
    pub votes_received: std::collections::HashSet<NodeId>,
}
/// Volatile state on leaders (reinitialized after election)
#[derive(Debug, Clone)]
pub struct LeaderState {
    /// For each server, index of next log entry to send
    /// (initialized to leader's last log index + 1)
    pub next_index: HashMap<NodeId, LogIndex>,
    /// For each server, index of highest log entry known to be replicated
    /// (initialized to 0; raised only on confirmed AppendEntries success)
    pub match_index: HashMap<NodeId, LogIndex>,
}
// ============================================================================
// RPC Request/Response Types
// ============================================================================
/// RequestVote RPC request
#[derive(Debug, Clone)]
pub struct VoteRequest {
    /// Candidate's term
    pub term: Term,
    /// Candidate requesting vote
    pub candidate_id: NodeId,
    /// Index of candidate's last log entry (used for the up-to-date check)
    pub last_log_index: LogIndex,
    /// Term of candidate's last log entry (used for the up-to-date check)
    pub last_log_term: Term,
}
/// RequestVote RPC response
#[derive(Debug, Clone)]
pub struct VoteResponse {
    /// Current term, for candidate to update itself
    pub term: Term,
    /// True means candidate received vote
    pub vote_granted: bool,
}
/// AppendEntries RPC request (also used as heartbeat when `entries` is empty)
#[derive(Debug, Clone)]
pub struct AppendEntriesRequest {
    /// Leader's term
    pub term: Term,
    /// So follower can redirect clients
    pub leader_id: NodeId,
    /// Index of log entry immediately preceding new ones (0 if sending from log start)
    pub prev_log_index: LogIndex,
    /// Term of prev_log_index entry (0 when prev_log_index is 0)
    pub prev_log_term: Term,
    /// Log entries to store (empty for heartbeat)
    pub entries: Vec<LogEntry<RaftCommand>>,
    /// Leader's commit_index; lets followers advance their own commit point
    pub leader_commit: LogIndex,
}
/// AppendEntries RPC response
#[derive(Debug, Clone)]
pub struct AppendEntriesResponse {
    /// Current term, for leader to update itself
    pub term: Term,
    /// True if follower contained entry matching prev_log_index and prev_log_term
    pub success: bool,
    /// On failure: index the leader should retry from (fast log backtracking)
    pub conflict_index: Option<LogIndex>,
    /// On failure: term of the conflicting entry, if one existed at prev_log_index
    pub conflict_term: Option<Term>,
}
// ============================================================================
// Internal Events
// ============================================================================
/// Internal events for Raft state machine.
///
/// Every input (timer fires, inbound RPCs, outbound RPC responses, client
/// writes) is funneled through a single channel and handled serially by
/// the `run` loop. RPC variants carry a oneshot sender for the reply.
#[derive(Debug)]
pub enum RaftEvent {
    /// Election timeout fired
    ElectionTimeout,
    /// Heartbeat timeout fired (leader only)
    HeartbeatTimeout,
    /// Client write request
    ClientWrite {
        command: RaftCommand,
        // Resolved once the leader accepts or rejects the write
        response_tx: oneshot::Sender<Result<(), RaftError>>,
    },
    /// RequestVote RPC received
    VoteRequest {
        req: VoteRequest,
        response_tx: oneshot::Sender<VoteResponse>,
    },
    /// AppendEntries RPC received
    AppendEntries {
        req: AppendEntriesRequest,
        response_tx: oneshot::Sender<AppendEntriesResponse>,
    },
    /// RequestVote RPC response received
    VoteResponse {
        from: NodeId,
        resp: VoteResponse,
    },
    /// AppendEntries RPC response received
    AppendEntriesResponse {
        from: NodeId,
        resp: AppendEntriesResponse,
    },
}
// ============================================================================
// Error Types
// ============================================================================
/// Errors surfaced by the Raft core.
#[derive(Debug, Clone)]
pub enum RaftError {
    /// Request must be sent to the leader; `leader_id` is a redirect hint when known.
    NotLeader { leader_id: Option<NodeId> },
    /// Log storage read/write or (de)serialization failure.
    StorageError(String),
    /// RPC or internal channel failure.
    NetworkError(String),
    /// Operation did not complete in time.
    Timeout,
}
impl std::fmt::Display for RaftError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
RaftError::NotLeader { leader_id } => {
write!(f, "Not leader, leader is: {:?}", leader_id)
}
RaftError::StorageError(msg) => write!(f, "Storage error: {}", msg),
RaftError::NetworkError(msg) => write!(f, "Network error: {}", msg),
RaftError::Timeout => write!(f, "Operation timed out"),
}
}
}
impl std::error::Error for RaftError {}
// ============================================================================
// RaftCore: Main Consensus Engine
// ============================================================================
/// Main consensus engine for a single Raft group.
///
/// All mutable state lives behind `Arc<RwLock<_>>` so timers, RPC glue and
/// the event loop can share one `RaftCore`; state transitions are funneled
/// through the event channel and processed serially by `run`.
pub struct RaftCore {
    /// This node's ID
    node_id: NodeId,
    /// Cluster members (excluding self)
    peers: Vec<NodeId>,
    /// Persistent state (term + vote), mirrored to storage via persist_vote
    persistent: Arc<RwLock<PersistentState>>,
    /// Volatile state (commit/apply progress, known leader)
    volatile: Arc<RwLock<VolatileState>>,
    /// Candidate state (None if not candidate)
    candidate_state: Arc<RwLock<Option<CandidateState>>>,
    /// Leader state (None if not leader)
    leader_state: Arc<RwLock<Option<LeaderState>>>,
    /// Current role
    role: Arc<RwLock<RaftRole>>,
    /// Storage backend for the replicated log and vote record
    storage: Arc<LogStorage>,
    /// State machine that committed commands are applied to
    state_machine: Arc<StateMachine>,
    /// Network client used for RequestVote/AppendEntries RPCs
    network: Arc<dyn RaftRpcClient>,
    /// Event channel feeding the `run` loop
    event_tx: mpsc::UnboundedSender<RaftEvent>,
    // Receiver is locked once by `run` for the lifetime of the loop
    event_rx: Arc<Mutex<mpsc::UnboundedReceiver<RaftEvent>>>,
    /// Election timer reset notifier
    election_timer_reset: Arc<tokio::sync::Notify>,
    /// Configuration
    config: RaftConfig,
}
/// Timing configuration for elections and heartbeats.
///
/// `heartbeat_interval` should be well below `election_timeout_min` so
/// followers do not time out between heartbeats.
#[derive(Debug, Clone)]
pub struct RaftConfig {
    /// Election timeout range (ms). A min/max pair suggests the timer task
    /// randomizes within this range per Raft's split-vote avoidance —
    /// NOTE(review): confirm in spawn_election_timer (not visible here).
    pub election_timeout_min: u64,
    pub election_timeout_max: u64,
    /// Heartbeat interval (ms)
    pub heartbeat_interval: u64,
}
impl Default for RaftConfig {
fn default() -> Self {
Self {
election_timeout_min: 300,
election_timeout_max: 600,
heartbeat_interval: 150,
}
}
}
impl RaftCore {
/// Construct a new Raft node in the initial (fresh) state: term 0, no
/// vote cast, Follower role, empty commit/apply progress.
///
/// `peers` must exclude this node's own `node_id`. Call `initialize` to
/// restore any previously persisted term/vote before running.
pub fn new(
    node_id: NodeId,
    peers: Vec<NodeId>,
    storage: Arc<LogStorage>,
    state_machine: Arc<StateMachine>,
    network: Arc<dyn RaftRpcClient>,
    config: RaftConfig,
) -> Self {
    // Unbounded channel: timers and RPC glue enqueue events; `run` drains them.
    let (tx, rx) = mpsc::unbounded_channel();
    let initial_persistent = PersistentState {
        current_term: 0,
        voted_for: None,
    };
    let initial_volatile = VolatileState {
        commit_index: 0,
        last_applied: 0,
        current_leader: None,
    };
    Self {
        node_id,
        peers,
        persistent: Arc::new(RwLock::new(initial_persistent)),
        volatile: Arc::new(RwLock::new(initial_volatile)),
        candidate_state: Arc::new(RwLock::new(None)),
        leader_state: Arc::new(RwLock::new(None)),
        role: Arc::new(RwLock::new(RaftRole::Follower)),
        storage,
        state_machine,
        network,
        event_tx: tx,
        event_rx: Arc::new(Mutex::new(rx)),
        election_timer_reset: Arc::new(tokio::sync::Notify::new()),
        config,
    }
}
/// Initialize the Raft node by restoring persisted term/vote from storage.
///
/// Safe to call on a brand-new node: a missing vote record simply leaves
/// the fresh defaults (term 0, no vote) in place.
pub async fn initialize(&self) -> Result<(), RaftError> {
    let stored = self
        .storage
        .read_vote()
        .map_err(|e| RaftError::StorageError(format!("Failed to load vote: {}", e)))?;
    match stored {
        Some(vote) => {
            let mut persistent = self.persistent.write().await;
            persistent.current_term = vote.term;
            persistent.voted_for = vote.node_id;
            tracing::info!(
                term = vote.term,
                voted_for = ?vote.node_id,
                "Loaded persistent state from storage"
            );
        }
        None => tracing::info!("No persistent state found, starting fresh"),
    }
    Ok(())
}
/// Persist current term and vote to storage.
///
/// Mirrors `PersistentState` into a `chainfire_storage::Vote` record and
/// writes it through the storage backend; callers invoke this before
/// responding to RPCs that depend on the persisted term (Raft safety).
async fn persist_vote(&self) -> Result<(), RaftError> {
    let guard = self.persistent.read().await;
    let record = chainfire_storage::Vote {
        term: guard.current_term,
        node_id: guard.voted_for,
        committed: false,
    };
    self.storage
        .save_vote(record)
        .map_err(|e| RaftError::StorageError(format!("Failed to save vote: {}", e)))?;
    Ok(())
}
/// Start the Raft event loop
pub async fn run(&self) -> Result<(), RaftError> {
eprintln!("[Node {}] EVENT LOOP STARTING", self.node_id);
// Start election timer
self.spawn_election_timer();
// Start heartbeat timer
self.spawn_heartbeat_timer();
// Main event loop
let mut event_rx = self.event_rx.lock().await;
eprintln!("[Node {}] EVENT LOOP acquired event_rx, starting recv loop", self.node_id);
loop {
tokio::select! {
Some(event) = event_rx.recv() => {
let event_type = match &event {
RaftEvent::ElectionTimeout => "ElectionTimeout",
RaftEvent::HeartbeatTimeout => "HeartbeatTimeout",
RaftEvent::VoteRequest { .. } => "VoteRequest",
RaftEvent::VoteResponse { .. } => "VoteResponse",
RaftEvent::AppendEntries { .. } => "AppendEntries",
RaftEvent::AppendEntriesResponse { .. } => "AppendEntriesResponse",
RaftEvent::ClientWrite { .. } => "ClientWrite",
};
eprintln!("[Node {}] EVENT LOOP received: {}", self.node_id, event_type);
if let Err(e) = self.handle_event(event).await {
eprintln!("[Node {}] EVENT LOOP error: {:?}, continuing...", self.node_id, e);
// Continue loop instead of exiting - event loop must stay alive
}
}
else => {
eprintln!("[Node {}] EVENT LOOP channel closed, exiting", self.node_id);
break;
}
}
}
eprintln!("[Node {}] EVENT LOOP EXITED", self.node_id);
Ok(())
}
/// Handle a single event
async fn handle_event(&self, event: RaftEvent) -> Result<(), RaftError> {
match event {
RaftEvent::ElectionTimeout => {
self.handle_election_timeout().await?;
}
RaftEvent::HeartbeatTimeout => {
self.handle_heartbeat_timeout().await?;
}
RaftEvent::ClientWrite { command, response_tx } => {
let result = self.handle_client_write(command).await;
let _ = response_tx.send(result);
}
RaftEvent::VoteRequest { req, response_tx } => {
let resp = self.handle_vote_request(req).await?;
let _ = response_tx.send(resp);
}
RaftEvent::AppendEntries { req, response_tx } => {
eprintln!("[Node {}] EVENT LOOP processing AppendEntries from {} term={}",
self.node_id, req.leader_id, req.term);
let resp = self.handle_append_entries(req).await?;
let _ = response_tx.send(resp);
}
RaftEvent::VoteResponse { from, resp } => {
self.handle_vote_response(from, resp).await?;
}
RaftEvent::AppendEntriesResponse { from, resp } => {
self.handle_append_entries_response(from, resp).await?;
}
}
Ok(())
}
// ========================================================================
// P1: Leader Election Implementation
// ========================================================================
/// Handle election timeout - transition to candidate and start election.
///
/// Raft election rule: increment the term, vote for self, persist that
/// vote, then solicit votes from all peers in parallel. A single-node
/// cluster short-circuits straight to leader. Vote responses arrive
/// asynchronously as `VoteResponse` events.
async fn handle_election_timeout(&self) -> Result<(), RaftError> {
    let role = *self.role.read().await;
    eprintln!("[Node {}] handle_election_timeout: role={:?}", self.node_id, role);
    // Only followers and candidates start elections
    if role == RaftRole::Leader {
        eprintln!("[Node {}] Already leader, ignoring election timeout", self.node_id);
        return Ok(());
    }
    // Transition to candidate
    *self.role.write().await = RaftRole::Candidate;
    eprintln!("[Node {}] Transitioned to Candidate", self.node_id);
    // Clear current leader (election in progress)
    self.volatile.write().await.current_leader = None;
    // Increment current term and vote for self
    let mut persistent = self.persistent.write().await;
    persistent.current_term += 1;
    persistent.voted_for = Some(self.node_id);
    let current_term = persistent.current_term;
    // Release the write lock before persisting: persist_vote takes its
    // own read lock on `persistent` and would deadlock otherwise.
    drop(persistent);
    eprintln!("[Node {}] Starting election for term {}", self.node_id, current_term);
    // Persist vote to storage before sending RPCs (Raft safety)
    self.persist_vote().await?;
    // Initialize candidate state with self-vote
    let mut votes = std::collections::HashSet::new();
    votes.insert(self.node_id);
    *self.candidate_state.write().await = Some(CandidateState {
        votes_received: votes,
    });
    // Check if already have majority (single-node case)
    let cluster_size = self.peers.len() + 1;
    let majority = cluster_size / 2 + 1;
    eprintln!("[Node {}] Cluster size={}, majority={}, peers={:?}",
        self.node_id, cluster_size, majority, self.peers);
    if 1 >= majority {
        // For single-node cluster, immediately become leader
        eprintln!("[Node {}] Single-node cluster, becoming leader immediately", self.node_id);
        self.become_leader().await?;
        return Ok(());
    }
    // Get last log index and term (receivers use these for the
    // "candidate's log at least as up-to-date" check)
    let (last_log_index, last_log_term) = self.get_last_log_info().await?;
    // Send RequestVote RPCs to all peers
    let vote_request = VoteRequest {
        term: current_term,
        candidate_id: self.node_id,
        last_log_index,
        last_log_term,
    };
    // Send vote requests in parallel; each spawned task reports its result
    // back to the event loop instead of blocking this handler.
    for peer_id in &self.peers {
        let peer_id = *peer_id;
        let network = self.network.clone();
        let req = vote_request.clone();
        let event_tx = self.event_tx.clone();
        tokio::spawn(async move {
            // TODO: Use actual network layer instead of mock
            // A network failure is treated as a rejection at our own term.
            let resp = network.vote(peer_id, req).await
                .unwrap_or(VoteResponse {
                    term: current_term,
                    vote_granted: false,
                });
            // Send response back to main event loop
            let _ = event_tx.send(RaftEvent::VoteResponse { from: peer_id, resp });
        });
    }
    Ok(())
}
/// Handle RequestVote RPC.
///
/// Grants the vote iff (a) the candidate's term is current, (b) we have
/// not already voted for a different candidate this term, and (c) the
/// candidate's log is at least as up-to-date as ours. The vote is
/// persisted to storage before the response is returned.
async fn handle_vote_request(&self, req: VoteRequest) -> Result<VoteResponse, RaftError> {
    let mut persistent = self.persistent.write().await;
    // Reply false if term < currentTerm
    if req.term < persistent.current_term {
        return Ok(VoteResponse {
            term: persistent.current_term,
            vote_granted: false,
        });
    }
    // If RPC request or response contains term T > currentTerm:
    // set currentTerm = T, convert to follower
    if req.term > persistent.current_term {
        persistent.current_term = req.term;
        persistent.voted_for = None;
        *self.role.write().await = RaftRole::Follower;
        // persist_vote takes its own lock on `persistent`, so drop and
        // re-acquire ours around the call.
        drop(persistent);
        self.persist_vote().await?;
        persistent = self.persistent.write().await;
    }
    // Check if we can grant vote: either not yet voted this term, or
    // already voted for this same candidate (idempotent re-grant).
    let can_vote = persistent.voted_for.is_none()
        || persistent.voted_for == Some(req.candidate_id);
    if !can_vote {
        return Ok(VoteResponse {
            term: persistent.current_term,
            vote_granted: false,
        });
    }
    // Check if candidate's log is at least as up-to-date as receiver's log:
    // a higher last-entry term wins; equal terms compare last index.
    let (last_log_index, last_log_term) = self.get_last_log_info().await?;
    let log_ok = req.last_log_term > last_log_term
        || (req.last_log_term == last_log_term && req.last_log_index >= last_log_index);
    if log_ok {
        persistent.voted_for = Some(req.candidate_id);
        let term = persistent.current_term;
        drop(persistent);
        // Persist vote to storage before responding (Raft safety)
        self.persist_vote().await?;
        // Reset election timer since we granted a vote
        self.reset_election_timer();
        Ok(VoteResponse {
            term,
            vote_granted: true,
        })
    } else {
        Ok(VoteResponse {
            term: persistent.current_term,
            vote_granted: false,
        })
    }
}
/// Handle VoteResponse from a peer.
///
/// Candidate-only: tallies granted votes and promotes this node to leader
/// once a majority (including the self-vote recorded at election start)
/// is reached. A response carrying a higher term forces a step-down.
async fn handle_vote_response(&self, from: NodeId, resp: VoteResponse) -> Result<(), RaftError> {
    let role = *self.role.read().await;
    let persistent = self.persistent.read().await;
    // Ignore if not candidate (election already won, lost, or abandoned)
    if role != RaftRole::Candidate {
        return Ok(());
    }
    // If response term > current term, step down
    if resp.term > persistent.current_term {
        drop(persistent);
        self.step_down(resp.term).await?;
        return Ok(());
    }
    // Ignore stale responses from an earlier term's election
    if resp.term < persistent.current_term {
        return Ok(());
    }
    // Count votes
    if resp.vote_granted {
        let mut candidate_state_guard = self.candidate_state.write().await;
        if let Some(candidate_state) = candidate_state_guard.as_mut() {
            // HashSet insert makes duplicate responses from one peer harmless
            candidate_state.votes_received.insert(from);
            // Calculate majority (cluster size = peers + 1 for self)
            let cluster_size = self.peers.len() + 1;
            let majority = cluster_size / 2 + 1;
            let votes_count = candidate_state.votes_received.len();
            // If received majority, become leader. Release our guards first:
            // become_leader acquires role/volatile/candidate_state writes.
            if votes_count >= majority {
                drop(candidate_state_guard);
                drop(persistent);
                self.become_leader().await?;
            }
        }
    }
    Ok(())
}
/// Transition to leader.
///
/// Records this node as leader, discards candidate bookkeeping, and
/// reinitializes per-peer replication state (next_index = last log
/// index + 1, match_index = 0). Queues an immediate heartbeat so
/// followers learn about the new leader without waiting for the timer.
async fn become_leader(&self) -> Result<(), RaftError> {
    *self.role.write().await = RaftRole::Leader;
    // Record ourselves as the cluster leader
    self.volatile.write().await.current_leader = Some(self.node_id);
    // Candidate bookkeeping is no longer relevant
    *self.candidate_state.write().await = None;
    // next_index starts optimistically just past our log; match_index
    // stays 0 until a follower confirms replication.
    let (last_index, _) = self.get_last_log_info().await?;
    let initial_next = last_index + 1;
    let replication = LeaderState {
        next_index: self.peers.iter().map(|&p| (p, initial_next)).collect(),
        match_index: self.peers.iter().map(|&p| (p, 0)).collect(),
    };
    *self.leader_state.write().await = Some(replication);
    // Kick off heartbeats right away instead of waiting for the timer
    self.event_tx
        .send(RaftEvent::HeartbeatTimeout)
        .map_err(|e| RaftError::NetworkError(format!("Failed to send heartbeat: {}", e)))?;
    Ok(())
}
/// Step down to follower.
///
/// Called when a higher term is observed: adopts `new_term`, clears our
/// vote, persists both, drops any candidate/leader state, and restarts
/// the election timer.
async fn step_down(&self, new_term: Term) -> Result<(), RaftError> {
    // Scope the write guard so persist_vote can take its own lock below.
    {
        let mut state = self.persistent.write().await;
        state.current_term = new_term;
        state.voted_for = None;
    }
    // Persist term and (cleared) vote to storage
    self.persist_vote().await?;
    *self.role.write().await = RaftRole::Follower;
    *self.candidate_state.write().await = None;
    *self.leader_state.write().await = None;
    // Reset election timer when stepping down to follower
    self.reset_election_timer();
    Ok(())
}
// ========================================================================
// P2: Log Replication
// ========================================================================
/// Send AppendEntries (heartbeat and/or log entries) to every peer.
///
/// Leader-only. For each peer, sends entries from that peer's
/// `next_index` through the leader's last log index; a caught-up peer
/// receives an empty entries list as a pure heartbeat. Responses come
/// back asynchronously as `AppendEntriesResponse` events.
async fn handle_heartbeat_timeout(&self) -> Result<(), RaftError> {
    // Only leaders send heartbeats
    let role = *self.role.read().await;
    if role != RaftRole::Leader {
        return Ok(());
    }
    let term = self.persistent.read().await.current_term;
    let (last_log_index, _) = self.get_last_log_info().await?;
    eprintln!("[Node {}] Sending heartbeat to peers: {:?} (term={})",
        self.node_id, self.peers, term);
    // Send AppendEntries (with entries if available) to all peers
    for peer_id in &self.peers {
        let peer_id = *peer_id;
        // Read commit_index fresh for each peer to ensure it's up-to-date
        let commit_index = self.volatile.read().await.commit_index;
        // Get prevLogIndex and prevLogTerm for this peer
        let leader_state = self.leader_state.read().await;
        let next_index = leader_state.as_ref()
            .and_then(|ls| ls.next_index.get(&peer_id).copied())
            .unwrap_or(1);
        drop(leader_state);
        let prev_log_index = next_index.saturating_sub(1);
        // Term of the entry immediately before the batch; 0 when sending
        // from the very start of the log (no preceding entry).
        let prev_log_term = if prev_log_index > 0 {
            // Read as Vec<u8> since that's how it's stored
            let entries: Vec<LogEntry<Vec<u8>>> = self.storage
                .get_log_entries(prev_log_index..=prev_log_index)
                .map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
            if entries.is_empty() {
                0
            } else {
                entries[0].log_id.term
            }
        } else {
            0
        };
        // Get entries to send (if any)
        let entries: Vec<LogEntry<RaftCommand>> = if next_index <= last_log_index {
            // Read entries from storage (stored as Vec<u8>)
            let stored_entries: Vec<LogEntry<Vec<u8>>> = self.storage
                .get_log_entries(next_index..=last_log_index)
                .map_err(|e| RaftError::StorageError(format!("Failed to read log entries: {}", e)))?;
            // Convert Vec<u8> back to RaftCommand; Blank/Membership payloads
            // carry no serialized command and pass through unchanged.
            stored_entries.into_iter().map(|entry| {
                let command = bincode::deserialize(&match &entry.payload {
                    EntryPayload::Normal(data) => data,
                    EntryPayload::Blank => return Ok(LogEntry {
                        log_id: entry.log_id,
                        payload: EntryPayload::Blank,
                    }),
                    EntryPayload::Membership(nodes) => return Ok(LogEntry {
                        log_id: entry.log_id,
                        payload: EntryPayload::Membership(nodes.clone()),
                    }),
                }).map_err(|e| RaftError::StorageError(format!("Failed to deserialize command: {}", e)))?;
                Ok(LogEntry {
                    log_id: entry.log_id,
                    payload: EntryPayload::Normal(command),
                })
            }).collect::<Result<Vec<_>, RaftError>>()?
        } else {
            // No entries to send, just heartbeat
            vec![]
        };
        eprintln!("[Node {}] HEARTBEAT to {}: entries.len()={} next_index={} last_log_index={}",
            self.node_id, peer_id, entries.len(), next_index, last_log_index);
        let req = AppendEntriesRequest {
            term,
            leader_id: self.node_id,
            prev_log_index,
            prev_log_term,
            entries,
            leader_commit: commit_index,
        };
        eprintln!("[Node {}] LEADER sending to {}: leader_commit={}",
            self.node_id, peer_id, commit_index);
        let network = Arc::clone(&self.network);
        let event_tx = self.event_tx.clone();
        // Send in background, don't wait for response; a failed RPC simply
        // produces no response event and will be retried next heartbeat.
        tokio::spawn(async move {
            if let Ok(resp) = network.append_entries(peer_id, req).await {
                let _ = event_tx.send(RaftEvent::AppendEntriesResponse {
                    from: peer_id,
                    resp,
                });
            }
        });
    }
    Ok(())
}
/// Handle AppendEntries RPC (heartbeat and log replication) as a receiver.
///
/// Implements the Raft receiver rules in order: term checks (with
/// step-down on a higher term), prev-entry consistency check with
/// fast-backtracking conflict hints, conflict truncation + append, and
/// commit-index advancement followed by state-machine application.
async fn handle_append_entries(&self, req: AppendEntriesRequest) -> Result<AppendEntriesResponse, RaftError> {
    let mut persistent = self.persistent.write().await;
    let current_term = persistent.current_term;
    // DIAGNOSTIC: Log all AppendEntries received
    eprintln!("[Node {}] Received AppendEntries from {} term={} (my term={})",
        self.node_id, req.leader_id, req.term, current_term);
    // If RPC request contains term T > currentTerm: set currentTerm = T, convert to follower
    if req.term > current_term {
        eprintln!("[Node {}] STEPPING DOWN: req.term={} > my term={}",
            self.node_id, req.term, current_term);
        persistent.current_term = req.term;
        persistent.voted_for = None;
        // persist_vote takes its own lock on `persistent`
        drop(persistent);
        self.persist_vote().await?;
        *self.role.write().await = RaftRole::Follower;
        *self.candidate_state.write().await = None;
        *self.leader_state.write().await = None;
        eprintln!("[Node {}] Stepped down to Follower (now term={})",
            self.node_id, req.term);
    } else {
        drop(persistent);
    }
    // Re-read the (possibly just updated) term for the response
    let persistent = self.persistent.read().await;
    let term = persistent.current_term;
    drop(persistent);
    // Reply false if term < currentTerm
    if req.term < term {
        return Ok(AppendEntriesResponse {
            term,
            success: false,
            conflict_index: None,
            conflict_term: None,
        });
    }
    // Valid AppendEntries from current leader - reset election timer
    self.reset_election_timer();
    // Update current leader
    self.volatile.write().await.current_leader = Some(req.leader_id);
    // P2: Log consistency check
    // Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm
    if req.prev_log_index > 0 {
        // Try to get the entry at prevLogIndex (stored as Vec<u8>)
        let prev_entries: Vec<LogEntry<Vec<u8>>> = self.storage
            .get_log_entries(req.prev_log_index..=req.prev_log_index)
            .map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
        if prev_entries.is_empty() {
            // Follower doesn't have entry at prevLogIndex.
            // Return conflict information for fast backtracking: tell the
            // leader to retry from just past our last entry.
            let last_index = self.get_last_log_info().await?.0;
            return Ok(AppendEntriesResponse {
                term,
                success: false,
                conflict_index: Some(last_index + 1),
                conflict_term: None,
            });
        }
        let prev_entry = &prev_entries[0];
        if prev_entry.log_id.term != req.prev_log_term {
            // Entry exists but term doesn't match.
            // Find the first index of the conflicting term so the leader can
            // skip the whole conflicting run at once.
            let conflict_term = prev_entry.log_id.term;
            // Search backwards to find first entry of this term
            // (linear scan; one storage read per step)
            let mut conflict_index = req.prev_log_index;
            for idx in (1..req.prev_log_index).rev() {
                let entries: Vec<LogEntry<Vec<u8>>> = self.storage
                    .get_log_entries(idx..=idx)
                    .map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
                if !entries.is_empty() && entries[0].log_id.term != conflict_term {
                    conflict_index = idx + 1;
                    break;
                }
            }
            return Ok(AppendEntriesResponse {
                term,
                success: false,
                conflict_index: Some(conflict_index),
                conflict_term: Some(conflict_term),
            });
        }
    }
    // P2: Log append/overwrite logic
    // If an existing entry conflicts with a new one (same index but different terms),
    // delete the existing entry and all that follow it
    if !req.entries.is_empty() {
        let first_new_index = req.entries[0].log_id.index;
        // Check if there's a conflict (stored as Vec<u8>).
        // NOTE(review): only the FIRST new entry is checked for a term
        // conflict; this assumes any later conflict implies one at this
        // boundary — confirm against storage.append's overwrite semantics.
        let existing: Vec<LogEntry<Vec<u8>>> = self.storage
            .get_log_entries(first_new_index..=first_new_index)
            .map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
        if !existing.is_empty() && existing[0].log_id.term != req.entries[0].log_id.term {
            // Conflict detected - truncate from this index
            self.storage
                .truncate(first_new_index)
                .map_err(|e| RaftError::StorageError(format!("Failed to truncate log: {}", e)))?;
        }
        // Convert RaftCommand entries to Vec<u8> before storing
        let entries_to_store: Vec<LogEntry<Vec<u8>>> = req.entries.iter().map(|entry| {
            let payload = match &entry.payload {
                EntryPayload::Normal(cmd) => {
                    let bytes = bincode::serialize(cmd)
                        .map_err(|e| RaftError::StorageError(format!("Serialize failed: {}", e)))?;
                    EntryPayload::Normal(bytes)
                }
                EntryPayload::Blank => EntryPayload::Blank,
                EntryPayload::Membership(nodes) => EntryPayload::Membership(nodes.clone()),
            };
            Ok(LogEntry {
                log_id: entry.log_id,
                payload,
            })
        }).collect::<Result<Vec<_>, RaftError>>()?;
        // Append converted entries
        self.storage
            .append(&entries_to_store)
            .map_err(|e| RaftError::StorageError(format!("Failed to append entries: {}", e)))?;
        let (last_log_index, _) = self.get_last_log_info().await?;
        eprintln!("[Node {}] FOLLOWER appended {} entries, last_index_now={}",
            self.node_id, req.entries.len(), last_log_index);
    }
    // P2: Update commit index
    // If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)
    eprintln!("[Node {}] FOLLOWER commit check: req.leader_commit={} my_commit={}",
        self.node_id, req.leader_commit, self.volatile.read().await.commit_index);
    if req.leader_commit > 0 {
        let mut volatile = self.volatile.write().await;
        if req.leader_commit > volatile.commit_index {
            // "Last new entry": the last appended entry, or prev_log_index
            // for a pure heartbeat (the log is verified to match up to there).
            let last_new_index = if !req.entries.is_empty() {
                req.entries.last().unwrap().log_id.index
            } else {
                req.prev_log_index
            };
            let new_commit = std::cmp::min(req.leader_commit, last_new_index);
            eprintln!("[Node {}] FOLLOWER updating commit: {} -> {}",
                self.node_id, volatile.commit_index, new_commit);
            volatile.commit_index = new_commit;
            debug!(
                commit_index = volatile.commit_index,
                leader_commit = req.leader_commit,
                "Updated commit index"
            );
            // Drop the lock before calling apply (it re-acquires `volatile`)
            drop(volatile);
            // Apply newly committed entries to state machine
            self.apply_committed_entries().await?;
        }
    }
    Ok(AppendEntriesResponse {
        term,
        success: true,
        conflict_index: None,
        conflict_term: None,
    })
}
/// Handle AppendEntries response from a follower (leader only).
///
/// On success, advances the peer's match_index/next_index and attempts to
/// advance the cluster commit index; on failure, backtracks next_index
/// using the follower's conflict hints (or by one, floored at 1).
async fn handle_append_entries_response(&self, from: NodeId, resp: AppendEntriesResponse) -> Result<(), RaftError> {
    // Only leaders process AppendEntries responses
    let role = *self.role.read().await;
    if role != RaftRole::Leader {
        return Ok(());
    }
    let current_term = self.persistent.read().await.current_term;
    // If response term > current term, step down
    if resp.term > current_term {
        self.step_down(resp.term).await?;
        return Ok(());
    }
    // Ignore stale responses
    if resp.term < current_term {
        return Ok(());
    }
    // Update next_index and match_index based on response
    let mut leader_state_guard = self.leader_state.write().await;
    if let Some(leader_state) = leader_state_guard.as_mut() {
        if resp.success {
            // Follower successfully replicated entries
            // Get the old next_index to calculate what we sent
            let old_next_index = leader_state.next_index.get(&from).copied().unwrap_or(1);
            // Get current last_log_index after getting old_next_index
            let (last_log_index, _) = self.get_last_log_info().await?;
            // We sent entries from old_next_index to last_log_index (at time of sending)
            // Since the response is success, the follower has all entries up to
            // the last index we sent.
            // NOTE(review): last_log_index is read NOW, not at send time — if
            // entries were appended while the RPC was in flight, match_index
            // may overstate what the follower actually holds. Verify, or
            // track the exact range sent per request.
            let new_match_index = if old_next_index <= last_log_index {
                // We sent some entries, follower has up to last_log_index
                last_log_index
            } else {
                // Empty heartbeat, match_index stays at previous value
                old_next_index.saturating_sub(1)
            };
            leader_state.match_index.insert(from, new_match_index);
            leader_state.next_index.insert(from, new_match_index + 1);
            eprintln!("[Node {}] RESP from {}: success={} match_index={} next_index={}",
                self.node_id, from, resp.success, new_match_index, new_match_index + 1);
            trace!(
                peer = from,
                match_index = new_match_index,
                next_index = new_match_index + 1,
                old_next_index = old_next_index,
                "Updated peer replication progress"
            );
        } else {
            // Follower's log is inconsistent, decrement next_index
            if let Some(next_index) = leader_state.next_index.get_mut(&from) {
                if let Some(conflict_index) = resp.conflict_index {
                    // Use conflict information for fast backtracking
                    *next_index = conflict_index;
                } else {
                    // Decrement next_index by 1 (never below 1)
                    *next_index = next_index.saturating_sub(1).max(1);
                }
                debug!(
                    peer = from,
                    new_next_index = *next_index,
                    conflict_index = ?resp.conflict_index,
                    conflict_term = ?resp.conflict_term,
                    "Follower log inconsistent, adjusted next_index"
                );
            }
        }
    }
    drop(leader_state_guard);
    // Try to advance commit index after updating match_index
    if resp.success {
        self.advance_commit_index().await?;
    }
    Ok(())
}
// ========================================================================
// P3: Commitment Logic
// ========================================================================
/// Advance commit index based on majority replication (leader only).
///
/// Finds the highest log index N stored on a majority of the cluster
/// (peers' match_index plus the leader's own last log index) and, if the
/// entry at N is from the current term (Raft's leader-commit safety rule:
/// a leader only commits entries of its own term by counting replicas),
/// advances commit_index to N and applies the newly committed entries.
///
/// Fixed: with the ascending-sorted match list of length L, the highest
/// majority-replicated index sits at position L - (L/2 + 1) = (L - 1) / 2.
/// The previous `L / 2` was off by one for even cluster sizes — e.g. with
/// 4 nodes it selected an index held by only 2 of 4, so an entry could be
/// "committed" without majority replication, violating Raft safety.
async fn advance_commit_index(&self) -> Result<(), RaftError> {
    let leader_state_guard = self.leader_state.read().await;
    let leader_state = match leader_state_guard.as_ref() {
        Some(ls) => ls,
        None => return Ok(()), // Not leader, nothing to do
    };
    // Collect replication progress: every peer's match_index plus the
    // leader's own last log index (the leader trivially stores its own log).
    let (last_log_index, _) = self.get_last_log_info().await?;
    let mut match_indices: Vec<LogIndex> = leader_state
        .match_index
        .values()
        .copied()
        .collect();
    match_indices.push(last_log_index);
    // Sort ascending; at position (len - 1) / 2 the suffix from that
    // position onward spans len/2 + 1 nodes — a strict majority.
    match_indices.sort_unstable();
    let majority_index = (match_indices.len() - 1) / 2;
    let new_commit_index = match_indices[majority_index];
    debug!(
        match_indices = ?match_indices,
        majority_index = majority_index,
        new_commit_index = new_commit_index,
        "Commit check"
    );
    let current_term = self.persistent.read().await.current_term;
    let old_commit_index = self.volatile.read().await.commit_index;
    // Only commit if:
    // 1. new_commit_index > current commit_index
    // 2. The entry at new_commit_index is from current term (Raft safety)
    if new_commit_index > old_commit_index {
        // Check term of entry at new_commit_index (stored as Vec<u8>)
        let entries: Vec<LogEntry<Vec<u8>>> = self.storage
            .get_log_entries(new_commit_index..=new_commit_index)
            .map_err(|e| RaftError::StorageError(format!("Failed to read log for commit: {}", e)))?;
        if !entries.is_empty() && entries[0].log_id.term == current_term {
            // Safe to commit
            self.volatile.write().await.commit_index = new_commit_index;
            debug!(
                old_commit = old_commit_index,
                new_commit = new_commit_index,
                "Advanced commit index"
            );
            // Apply newly committed entries
            self.apply_committed_entries().await?;
        }
    }
    Ok(())
}
/// Apply committed entries to state machine.
///
/// Applies every entry in (last_applied, commit_index] in log order, then
/// advances last_applied. Only `Normal` payloads reach the state machine;
/// `Blank` and `Membership` entries are skipped here.
async fn apply_committed_entries(&self) -> Result<(), RaftError> {
    // Hold the volatile write lock for the whole pass so commit_index and
    // last_applied are read and updated as one atomic step.
    let mut volatile = self.volatile.write().await;
    let commit_index = volatile.commit_index;
    let last_applied = volatile.last_applied;
    if commit_index <= last_applied {
        return Ok(()); // Nothing to apply
    }
    // Get entries to apply (stored as Vec<u8>)
    let stored_entries: Vec<LogEntry<Vec<u8>>> = self.storage
        .get_log_entries((last_applied + 1)..=commit_index)
        .map_err(|e| RaftError::StorageError(format!("Failed to read entries for apply: {}", e)))?;
    // Apply each entry to state machine
    for entry in &stored_entries {
        if let EntryPayload::Normal(data) = &entry.payload {
            // Deserialize the command
            let command: RaftCommand = bincode::deserialize(data)
                .map_err(|e| RaftError::StorageError(format!("Failed to deserialize for apply: {}", e)))?;
            self.state_machine
                .apply(command)
                .map_err(|e| RaftError::StorageError(format!("Failed to apply to state machine: {}", e)))?;
            debug!(
                index = entry.log_id.index,
                term = entry.log_id.term,
                "Applied entry to state machine"
            );
        }
    }
    // Update last_applied only after every entry applied successfully
    volatile.last_applied = commit_index;
    debug!(
        last_applied = commit_index,
        entries_applied = stored_entries.len(),
        "Applied committed entries to state machine"
    );
    Ok(())
}
// ========================================================================
// P3: Client Requests
// ========================================================================
async fn handle_client_write(&self, command: RaftCommand) -> Result<(), RaftError> {
let role = *self.role.read().await;
if role != RaftRole::Leader {
return Err(RaftError::NotLeader { leader_id: None });
}
// Get current term and last log index
let term = self.persistent.read().await.current_term;
eprintln!("[Node {}] handle_client_write: getting last_log_info...", self.node_id);
let (last_log_index, _) = match self.get_last_log_info().await {
Ok(info) => {
eprintln!("[Node {}] handle_client_write: last_log_index={}", self.node_id, info.0);
info
}
Err(e) => {
eprintln!("[Node {}] handle_client_write: ERROR getting last_log_info: {:?}", self.node_id, e);
return Err(e);
}
};
let new_index = last_log_index + 1;
// Serialize command to Vec<u8> for storage
let command_bytes = bincode::serialize(&command)
.map_err(|e| RaftError::StorageError(format!("Failed to serialize command: {}", e)))?;
// Create new log entry
let log_id = LogId {
term,
index: new_index,
};
let entry = LogEntry {
log_id,
payload: EntryPayload::Normal(command_bytes),
};
// Append to leader's log
eprintln!("[Node {}] handle_client_write: appending entry index={} term={}...", self.node_id, new_index, term);
match self.storage.append(&[entry.clone()]) {
Ok(()) => {
eprintln!("[Node {}] handle_client_write: append SUCCESS index={}", self.node_id, new_index);
}
Err(e) => {
eprintln!("[Node {}] handle_client_write: append FAILED: {:?}", self.node_id, e);
return Err(RaftError::StorageError(format!("Failed to append entry: {}", e)));
}
}
debug!(
term = term,
index = new_index,
"Leader appended entry to log"
);
// Trigger immediate replication to all followers
// Send AppendEntries with the new entry to all peers
self.event_tx
.send(RaftEvent::HeartbeatTimeout)
.map_err(|e| RaftError::NetworkError(format!("Failed to trigger replication: {}", e)))?;
// Single-node cluster: immediately commit since we're the only voter
if self.peers.is_empty() {
self.advance_commit_index().await?;
}
// Note: In a production implementation, we would wait for majority
// acknowledgment before returning success. For now, we return immediately
// and let the async replication/commit process handle it via normal
// heartbeat responses updating match_index.
Ok(())
}
// ========================================================================
// Helper Methods
// ========================================================================
/// Get last log index and term
async fn get_last_log_info(&self) -> Result<(LogIndex, Term), RaftError> {
let log_state = self.storage
.get_log_state()
.map_err(|e| RaftError::StorageError(format!("Failed to get log state: {}", e)))?;
if let Some(last_log_id) = log_state.last_log_id {
Ok((last_log_id.index, last_log_id.term))
} else {
Ok((0, 0))
}
}
/// Spawn election timer task
fn spawn_election_timer(&self) {
let event_tx = self.event_tx.clone();
let config = self.config.clone();
let reset_notify = Arc::clone(&self.election_timer_reset);
tokio::spawn(async move {
eprintln!("[ELECTION TIMER] Spawned");
loop {
let timeout = rand::random::<u64>() %
(config.election_timeout_max - config.election_timeout_min) +
config.election_timeout_min;
eprintln!("[ELECTION TIMER] Waiting {}ms", timeout);
tokio::select! {
_ = time::sleep(Duration::from_millis(timeout)) => {
// Election timeout fired
eprintln!("[ELECTION TIMER] Timeout fired, sending event");
if event_tx.send(RaftEvent::ElectionTimeout).is_err() {
eprintln!("[ELECTION TIMER] Send failed, exiting");
break;
}
eprintln!("[ELECTION TIMER] Event sent successfully");
}
_ = reset_notify.notified() => {
// Timer was reset, restart the loop with new timeout
eprintln!("[ELECTION TIMER] Reset notification received");
continue;
}
}
}
eprintln!("[ELECTION TIMER] Exited");
});
}
    /// Reset the election timer (called when receiving valid RPC or becoming leader)
    ///
    /// Wakes the election-timer task via its `Notify`, which restarts the
    /// wait with a fresh randomized timeout instead of firing an election.
    fn reset_election_timer(&self) {
        self.election_timer_reset.notify_one();
    }
/// Spawn heartbeat timer task (leader sends periodic heartbeats)
fn spawn_heartbeat_timer(&self) {
let event_tx = self.event_tx.clone();
let config = self.config.clone();
tokio::spawn(async move {
let mut interval = tokio::time::interval(Duration::from_millis(config.heartbeat_interval));
// Skip the first tick (fires immediately)
interval.tick().await;
loop {
interval.tick().await;
if event_tx.send(RaftEvent::HeartbeatTimeout).is_err() {
break;
}
}
});
}
// ========================================================================
// Public API for external access (testing, metrics, etc.)
// ========================================================================
    /// Get this node's ID
    ///
    /// The ID is fixed at construction and never changes.
    pub fn node_id(&self) -> NodeId {
        self.node_id
    }
    /// Alias for node_id() for API compatibility
    ///
    /// Identical to [`node_id`]; kept so callers written against the old
    /// interface keep compiling.
    pub fn id(&self) -> NodeId {
        self.node_id
    }
    /// Get current role
    ///
    /// Snapshot only: the role may change immediately after this returns.
    pub async fn role(&self) -> RaftRole {
        *self.role.read().await
    }
    /// Get current term
    ///
    /// Reads the persisted term; snapshot only, may be stale by the time the
    /// caller uses it.
    pub async fn current_term(&self) -> Term {
        self.persistent.read().await.current_term
    }
    /// Inject RequestVote RPC (for testing)
    ///
    /// Forwards the request into the core event loop; the reply is delivered
    /// on `resp_tx`. A send error (event loop already shut down) is silently
    /// ignored — `resp_tx` is then dropped and the caller observes a closed
    /// oneshot channel.
    pub async fn request_vote_rpc(
        &self,
        req: VoteRequest,
        resp_tx: oneshot::Sender<VoteResponse>,
    ) {
        let _ = self.event_tx.send(RaftEvent::VoteRequest { req, response_tx: resp_tx });
    }
/// Inject AppendEntries RPC (for testing)
pub async fn append_entries_rpc(
&self,
req: AppendEntriesRequest,
resp_tx: oneshot::Sender<AppendEntriesResponse>,
) {
eprintln!("[Node {}] append_entries_rpc: from {} term={}",
self.node_id, req.leader_id, req.term);
let result = self.event_tx.send(RaftEvent::AppendEntries { req, response_tx: resp_tx });
if let Err(e) = result {
eprintln!("[Node {}] ERROR: Failed to send AppendEntries event: channel closed",
self.node_id);
}
}
    /// Get current leader
    ///
    /// Returns `None` when no leader is currently known (e.g. mid-election).
    pub async fn leader(&self) -> Option<NodeId> {
        self.volatile.read().await.current_leader
    }
/// Submit a client write command (non-blocking, returns immediately after append)
pub async fn client_write(&self, command: RaftCommand) -> Result<(), RaftError> {
let (tx, rx) = oneshot::channel();
self.event_tx
.send(RaftEvent::ClientWrite {
command,
response_tx: tx,
})
.map_err(|e| RaftError::NetworkError(format!("Failed to send client write: {}", e)))?;
rx.await
.map_err(|e| RaftError::NetworkError(format!("Client write response lost: {}", e)))?
}
    /// Submit a client write and wait for commit (blocking version)
    /// Returns RaftResponse after the command is committed and applied
    ///
    /// Polls `commit_index` every 10ms, for up to 5 seconds, until it rises
    /// above its pre-write value, then reports the state machine's current
    /// revision.
    ///
    /// NOTE(review): the check only observes that *some* commit happened —
    /// under concurrent writes this can return once a different entry
    /// commits; confirm whether per-entry commit tracking is required here.
    pub async fn write(&self, command: RaftCommand) -> Result<chainfire_types::command::RaftResponse, RaftError> {
        use chainfire_types::command::RaftResponse;
        // Get current commit index before write
        let initial_commit = self.volatile.read().await.commit_index;
        // Submit the write
        self.client_write(command).await?;
        // Wait for commit to advance (with timeout)
        let timeout = tokio::time::Duration::from_secs(5);
        let start = tokio::time::Instant::now();
        loop {
            let current_commit = self.volatile.read().await.commit_index;
            if current_commit > initial_commit {
                // Entry committed, get current revision from state machine
                let revision = self.state_machine.current_revision();
                // Only `revision` and `succeeded` carry information here; the
                // remaining fields are defaults kept for API compatibility.
                return Ok(RaftResponse {
                    revision,
                    prev_kv: None,
                    deleted: 0,
                    succeeded: true,
                    prev_kvs: vec![],
                    lease_id: None,
                    lease_ttl: None,
                    txn_responses: vec![],
                });
            }
            if start.elapsed() > timeout {
                return Err(RaftError::Timeout);
            }
            // Sleep briefly before checking again
            tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
        }
    }
    /// Get current commit index
    ///
    /// Highest log index known to be committed on this node.
    pub async fn commit_index(&self) -> LogIndex {
        self.volatile.read().await.commit_index
    }
    /// Get current last_applied index
    ///
    /// Highest log index applied to the state machine; never exceeds
    /// `commit_index`.
    pub async fn last_applied(&self) -> LogIndex {
        self.volatile.read().await.last_applied
    }
    /// Get state machine reference for testing/verification
    ///
    /// Cheap: clones the `Arc` handle, not the state machine itself.
    pub fn state_machine(&self) -> Arc<StateMachine> {
        Arc::clone(&self.state_machine)
    }
    /// Get storage reference for snapshot operations
    ///
    /// Cheap: clones the `Arc` handle, not the underlying log storage.
    pub fn storage(&self) -> Arc<LogStorage> {
        Arc::clone(&self.storage)
    }
/// Get current cluster membership as list of node IDs
/// NOTE: Custom RaftCore uses static membership configured at startup
pub async fn membership(&self) -> Vec<u64> {
let mut members = vec![self.node_id];
members.extend(self.peers.iter().cloned());
members.sort();
members
}
}
// ============================================================================
// Unit Tests
// ============================================================================
#[cfg(test)]
mod tests {
    use super::*;

    /// All four VoteRequest fields survive struct construction unchanged.
    /// (The original test only checked `term` and `candidate_id`.)
    #[test]
    fn test_vote_request_creation() {
        let req = VoteRequest {
            term: 1,
            candidate_id: 1,
            last_log_index: 0,
            last_log_term: 0,
        };
        assert_eq!(req.term, 1);
        assert_eq!(req.candidate_id, 1);
        assert_eq!(req.last_log_index, 0);
        assert_eq!(req.last_log_term, 0);
    }

    #[tokio::test]
    async fn test_raft_core_creation() {
        // TODO: Add proper unit tests with mock storage/network
    }
}