feat: add stream replica groups and leader stepdown

This commit is contained in:
Joseph Doherty
2026-02-23 06:17:30 -05:00
parent 23216d0a48
commit c87661800d
4 changed files with 158 additions and 0 deletions

View File

@@ -0,0 +1,65 @@
using NATS.Server.Raft;
namespace NATS.Server.JetStream.Cluster;
/// <summary>
/// The set of <see cref="RaftNode"/> replicas created for one stream, plus the node
/// this group currently treats as its leader.
/// </summary>
public sealed class StreamReplicaGroup
{
    private readonly List<RaftNode> _nodes;

    /// <summary>
    /// Builds the replica nodes for <paramref name="streamName"/>, hands every node the
    /// full member list, and runs an election with the first node as candidate.
    /// </summary>
    /// <param name="streamName">Stream name; its lower-cased form prefixes each node id.</param>
    /// <param name="replicas">Requested replica count. Values below 1 produce a single node.</param>
    public StreamReplicaGroup(string streamName, int replicas)
    {
        StreamName = streamName;

        var size = Math.Max(replicas, 1);
        var idPrefix = streamName.ToLowerInvariant();

        // Node ids are "<lower-cased stream name>-r<ordinal>", ordinals starting at 1.
        _nodes = new List<RaftNode>();
        for (var ordinal = 1; ordinal <= size; ordinal++)
        {
            _nodes.Add(new RaftNode($"{idPrefix}-r{ordinal}"));
        }

        // Each node receives the complete member list, itself included.
        _nodes.ForEach(member => member.ConfigureCluster(_nodes));

        Leader = ElectLeader(_nodes[0]);
    }

    /// <summary>Name of the stream this group was created for.</summary>
    public string StreamName { get; }

    /// <summary>Read-only view of the replica nodes, in creation order.</summary>
    public IReadOnlyList<RaftNode> Nodes => _nodes;

    /// <summary>The node most recently put through an election by this group.</summary>
    public RaftNode Leader { get; private set; }

    /// <summary>
    /// Forwards <paramref name="command"/> to the current leader. When the recorded leader
    /// no longer reports <c>IsLeader</c>, the next node in list order is elected first.
    /// </summary>
    /// <param name="command">Command text handed to the leader's <c>ProposeAsync</c>.</param>
    /// <param name="ct">Cancellation token passed through to the leader.</param>
    /// <returns>The value returned by the leader's <c>ProposeAsync</c>.</returns>
    public async ValueTask<long> ProposeAsync(string command, CancellationToken ct)
    {
        if (Leader.IsLeader is false)
        {
            var successor = SelectNextCandidate(Leader);
            Leader = ElectLeader(successor);
        }

        return await Leader.ProposeAsync(command, ct);
    }

    /// <summary>
    /// Asks the current leader to step down and then elects the next node in list order.
    /// The work is done synchronously; an already-completed task is returned.
    /// </summary>
    /// <param name="ct">Accepted for signature symmetry; not observed.</param>
    public Task StepDownAsync(CancellationToken ct)
    {
        _ = ct;

        var outgoing = Leader;
        outgoing.RequestStepDown();

        var successor = SelectNextCandidate(outgoing);
        Leader = ElectLeader(successor);

        return Task.CompletedTask;
    }

    /// <summary>
    /// Picks the node that follows <paramref name="currentLeader"/> in list order, wrapping
    /// at the end. Falls back to the first node for a single-node group or when the given
    /// node's id is not found in the list.
    /// </summary>
    private RaftNode SelectNextCandidate(RaftNode currentLeader)
    {
        if (_nodes.Count != 1)
        {
            var position = _nodes.FindIndex(member => member.Id == currentLeader.Id);
            if (position >= 0)
            {
                return _nodes[(position + 1) % _nodes.Count];
            }
        }

        return _nodes[0];
    }

    /// <summary>
    /// Starts an election on <paramref name="candidate"/> and feeds it the vote response of
    /// every other node in the group, then returns the same candidate instance.
    /// </summary>
    private RaftNode ElectLeader(RaftNode candidate)
    {
        candidate.StartElection(_nodes.Count);

        foreach (var peer in _nodes)
        {
            // The candidate does not request a vote from itself here.
            if (peer.Id == candidate.Id)
            {
                continue;
            }

            var ballot = peer.GrantVote(candidate.Term);
            candidate.ReceiveVote(ballot, _nodes.Count);
        }

        return candidate;
    }
}

View File

@@ -14,6 +14,8 @@ public sealed class StreamManager
private readonly JetStreamMetaGroup? _metaGroup;
private readonly ConcurrentDictionary<string, StreamHandle> _streams =
new(StringComparer.Ordinal);
// Replica group for each stream, keyed by stream name using ordinal comparison.
private readonly ConcurrentDictionary<string, StreamReplicaGroup> _replicaGroups =
new(StringComparer.Ordinal);
private readonly ConcurrentDictionary<string, List<MirrorCoordinator>> _mirrorsByOrigin =
new(StringComparer.Ordinal);
private readonly ConcurrentDictionary<string, List<SourceCoordinator>> _sourcesByOrigin =
@@ -36,6 +38,12 @@ public sealed class StreamManager
normalized.Name,
_ => new StreamHandle(normalized, new MemStore()),
(_, existing) => existing with { Config = normalized });
_replicaGroups.AddOrUpdate(
normalized.Name,
_ => new StreamReplicaGroup(normalized.Name, normalized.Replicas),
(_, existing) => existing.Nodes.Count == Math.Max(normalized.Replicas, 1)
? existing
: new StreamReplicaGroup(normalized.Name, normalized.Replicas));
RebuildReplicationCoordinators();
_metaGroup?.ProposeCreateStreamAsync(normalized, default).GetAwaiter().GetResult();
@@ -77,6 +85,9 @@ public sealed class StreamManager
if (stream == null)
return null;
if (_replicaGroups.TryGetValue(stream.Config.Name, out var replicaGroup))
_ = replicaGroup.ProposeAsync($"PUB {subject}", default).GetAwaiter().GetResult();
var seq = stream.Store.AppendAsync(subject, payload, default).GetAwaiter().GetResult();
EnforceLimits(stream);
var stored = stream.Store.LoadAsync(seq, default).GetAwaiter().GetResult();
@@ -90,6 +101,14 @@ public sealed class StreamManager
};
}
/// <summary>
/// Requests a leader step-down on the replica group registered under
/// <paramref name="stream"/>. When no group is registered for that name the call
/// does nothing and returns an already-completed task.
/// </summary>
/// <param name="stream">Stream name used as the replica-group lookup key.</param>
/// <param name="ct">Cancellation token forwarded to the replica group.</param>
/// <returns>The task returned by the group's step-down, or a completed task.</returns>
public Task StepDownStreamLeaderAsync(string stream, CancellationToken ct)
    => _replicaGroups.TryGetValue(stream, out var group)
        ? group.StepDownAsync(ct)
        : Task.CompletedTask;
private static StreamConfig NormalizeConfig(StreamConfig config)
{
var copy = new StreamConfig

View File

@@ -9,6 +9,7 @@ public sealed class RaftNode
public string Id { get; }
public int Term => TermState.CurrentTerm;
public bool IsLeader => Role == RaftRole.Leader;
public RaftRole Role { get; private set; } = RaftRole.Follower;
public RaftTermState TermState { get; } = new();
public long AppliedIndex { get; set; }
@@ -99,6 +100,8 @@ public sealed class RaftNode
/// <summary>
/// Puts this node back into the follower role and clears its election bookkeeping:
/// the received-vote counter is reset to zero and the recorded vote is cleared.
/// </summary>
public void RequestStepDown()
{
Role = RaftRole.Follower;
_votesReceived = 0;
// Clear the vote recorded on the term state.
TermState.VotedFor = null;
}
private void TryBecomeLeader(int clusterSize)

View File

@@ -0,0 +1,71 @@
using System.Text;
using NATS.Server.JetStream;
using NATS.Server.JetStream.Models;
using NATS.Server.JetStream.Publish;
namespace NATS.Server.Tests;
/// <summary>Tests for stream replica groups as driven through the stream manager.</summary>
public class JetStreamStreamReplicaGroupTests
{
    /// <summary>
    /// After the stream leader is asked to step down, a publish on a matching subject
    /// must still be acknowledged by the stream with a non-zero sequence.
    /// </summary>
    [Fact]
    public async Task Leader_stepdown_preserves_stream_write_availability_after_new_election()
    {
        // Arrange
        const string streamName = "ORDERS";
        await using var replicaFixture = await JetStreamReplicaFixture.StartAsync(nodes: 3);
        await replicaFixture.CreateStreamAsync(streamName, replicas: 3);

        // Act
        await replicaFixture.StepDownStreamLeaderAsync(streamName);
        var pubAck = await replicaFixture.PublishAndGetAckAsync("orders.created", "1");

        // Assert
        pubAck.Stream.ShouldBe(streamName);
        pubAck.Seq.ShouldBeGreaterThan(0UL);
    }
}
/// <summary>
/// Test fixture that pairs a fresh <see cref="StreamManager"/> with a
/// <see cref="JetStreamPublisher"/> built on top of it.
/// </summary>
internal sealed class JetStreamReplicaFixture : IAsyncDisposable
{
    private readonly StreamManager _streamManager;
    private readonly JetStreamPublisher _publisher;

    private JetStreamReplicaFixture(StreamManager streamManager)
    {
        _streamManager = streamManager;
        _publisher = new JetStreamPublisher(streamManager);
    }

    /// <summary>Creates a fixture around a new stream manager.</summary>
    /// <param name="nodes">Accepted by the signature but not used by this fixture.</param>
    public static Task<JetStreamReplicaFixture> StartAsync(int nodes)
    {
        _ = nodes;

        var fixture = new JetStreamReplicaFixture(new StreamManager());
        return Task.FromResult(fixture);
    }

    /// <summary>
    /// Creates or updates a stream bound to the <c>orders.*</c> subject with the given
    /// replica count.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The stream manager's response carried an error; its description is the message.
    /// </exception>
    public Task CreateStreamAsync(string name, int replicas)
    {
        var config = new StreamConfig
        {
            Name = name,
            Subjects = ["orders.*"],
            Replicas = replicas,
        };

        var result = _streamManager.CreateOrUpdate(config);
        if (result.Error is null)
        {
            return Task.CompletedTask;
        }

        throw new InvalidOperationException(result.Error.Description);
    }

    /// <summary>Asks the stream manager to step down the leader of <paramref name="stream"/>.</summary>
    public Task StepDownStreamLeaderAsync(string stream)
        => _streamManager.StepDownStreamLeaderAsync(stream, default);

    /// <summary>
    /// Publishes the UTF-8 bytes of <paramref name="payload"/> on <paramref name="subject"/>
    /// and returns the resulting acknowledgement.
    /// </summary>
    /// <exception cref="InvalidOperationException">The publisher did not capture the message.</exception>
    public Task<PubAck> PublishAndGetAckAsync(string subject, string payload)
    {
        var captured = _publisher.TryCapture(subject, Encoding.UTF8.GetBytes(payload), null, out var ack);
        if (!captured)
        {
            throw new InvalidOperationException("Publish did not match a stream.");
        }

        return Task.FromResult(ack);
    }

    /// <summary>Nothing to release; completes immediately.</summary>
    public ValueTask DisposeAsync() => ValueTask.CompletedTask;
}