Merge branch 'feature/core-lifecycle' into main

Reconcile close reason tracking: feature branch's MarkClosed() and
ShouldSkipFlush/FlushAndCloseAsync now use main's ClientClosedReason
enum. ClosedState enum retained for forward compatibility.
This commit is contained in:
Joseph Doherty
2026-02-23 00:09:30 -05:00
12 changed files with 2745 additions and 18 deletions

View File

@@ -2,9 +2,11 @@ using System.Collections.Concurrent;
using System.Net;
using System.Net.Security;
using System.Net.Sockets;
using System.Runtime.InteropServices;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using Microsoft.Extensions.Logging;
using NATS.NKeys;
using NATS.Server.Auth;
using NATS.Server.Monitoring;
using NATS.Server.Protocol;
@@ -25,6 +27,7 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
private readonly AuthService _authService;
private readonly ConcurrentDictionary<string, Account> _accounts = new(StringComparer.Ordinal);
private readonly Account _globalAccount;
private readonly Account _systemAccount;
private readonly SslServerAuthenticationOptions? _sslOptions;
private readonly TlsRateLimiter? _tlsRateLimiter;
private Socket? _listener;
@@ -32,16 +35,205 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
private ulong _nextClientId;
private long _startTimeTicks;
private readonly CancellationTokenSource _quitCts = new();
private readonly TaskCompletionSource _shutdownComplete = new(TaskCreationOptions.RunContinuationsAsynchronously);
private readonly TaskCompletionSource _acceptLoopExited = new(TaskCreationOptions.RunContinuationsAsynchronously);
private int _shutdown;
private int _activeClientCount;
private int _lameDuck;
private readonly List<PosixSignalRegistration> _signalRegistrations = [];
private string? _portsFilePath;
private static readonly TimeSpan AcceptMinSleep = TimeSpan.FromMilliseconds(10);
private static readonly TimeSpan AcceptMaxSleep = TimeSpan.FromSeconds(1);
public SubList SubList => _globalAccount.SubList;
public ServerStats Stats => _stats;
public DateTime StartTime => new(Interlocked.Read(ref _startTimeTicks), DateTimeKind.Utc);
public string ServerId => _serverInfo.ServerId;
public string ServerName => _serverInfo.ServerName;
public int ClientCount => _clients.Count;
public int Port => _options.Port;
public Account SystemAccount => _systemAccount;
public string ServerNKey { get; }
public bool IsShuttingDown => Volatile.Read(ref _shutdown) != 0;
public bool IsLameDuckMode => Volatile.Read(ref _lameDuck) != 0;
public IEnumerable<NatsClient> GetClients() => _clients.Values;
public Task WaitForReadyAsync() => _listeningStarted.Task;
public void WaitForShutdown() => _shutdownComplete.Task.GetAwaiter().GetResult();
public async Task ShutdownAsync()
{
if (Interlocked.CompareExchange(ref _shutdown, 1, 0) != 0)
return; // Already shutting down
_logger.LogInformation("Initiating Shutdown...");
// Signal all internal loops to stop
await _quitCts.CancelAsync();
// Close listener to stop accept loop
_listener?.Close();
// Wait for accept loop to exit
await _acceptLoopExited.Task.WaitAsync(TimeSpan.FromSeconds(5)).ConfigureAwait(ConfigureAwaitOptions.SuppressThrowing);
// Close all client connections — flush first, then mark closed
var flushTasks = new List<Task>();
foreach (var client in _clients.Values)
{
client.MarkClosed(ClientClosedReason.ServerShutdown);
flushTasks.Add(client.FlushAndCloseAsync(minimalFlush: true));
}
await Task.WhenAll(flushTasks).WaitAsync(TimeSpan.FromSeconds(2)).ConfigureAwait(ConfigureAwaitOptions.SuppressThrowing);
// Wait for active client tasks to drain (with timeout)
if (Volatile.Read(ref _activeClientCount) > 0)
{
using var drainCts = new CancellationTokenSource(TimeSpan.FromSeconds(10));
try
{
while (Volatile.Read(ref _activeClientCount) > 0 && !drainCts.IsCancellationRequested)
await Task.Delay(50, drainCts.Token);
}
catch (OperationCanceledException) { }
}
// Stop monitor server
if (_monitorServer != null)
await _monitorServer.DisposeAsync();
DeletePidFile();
DeletePortsFile();
_logger.LogInformation("Server Exiting..");
_shutdownComplete.TrySetResult();
}
public async Task LameDuckShutdownAsync()
{
if (IsShuttingDown || Interlocked.CompareExchange(ref _lameDuck, 1, 0) != 0)
return;
_logger.LogInformation("Entering lame duck mode, stop accepting new clients");
// Close listener to stop accepting new connections
_listener?.Close();
// Wait for accept loop to exit
await _acceptLoopExited.Task.WaitAsync(TimeSpan.FromSeconds(5)).ConfigureAwait(ConfigureAwaitOptions.SuppressThrowing);
var gracePeriod = _options.LameDuckGracePeriod;
if (gracePeriod < TimeSpan.Zero) gracePeriod = -gracePeriod;
// If no clients, go straight to shutdown
if (_clients.IsEmpty)
{
await ShutdownAsync();
return;
}
// Wait grace period for clients to drain naturally
_logger.LogInformation("Waiting {GracePeriod}ms grace period", gracePeriod.TotalMilliseconds);
try
{
await Task.Delay(gracePeriod, _quitCts.Token);
}
catch (OperationCanceledException) { return; }
if (_clients.IsEmpty)
{
await ShutdownAsync();
return;
}
// Stagger-close remaining clients
var dur = _options.LameDuckDuration - gracePeriod;
if (dur <= TimeSpan.Zero) dur = TimeSpan.FromSeconds(1);
var clients = _clients.Values.ToList();
var numClients = clients.Count;
if (numClients > 0)
{
_logger.LogInformation("Closing {Count} existing clients over {Duration}ms",
numClients, dur.TotalMilliseconds);
var sleepInterval = dur.Ticks / numClients;
if (sleepInterval < TimeSpan.TicksPerMillisecond)
sleepInterval = TimeSpan.TicksPerMillisecond;
if (sleepInterval > TimeSpan.TicksPerSecond)
sleepInterval = TimeSpan.TicksPerSecond;
for (int i = 0; i < clients.Count; i++)
{
clients[i].MarkClosed(ClientClosedReason.ServerShutdown);
await clients[i].FlushAndCloseAsync(minimalFlush: true);
if (i < clients.Count - 1)
{
var jitter = Random.Shared.NextInt64(sleepInterval / 2, sleepInterval);
try
{
await Task.Delay(TimeSpan.FromTicks(jitter), _quitCts.Token);
}
catch (OperationCanceledException) { break; }
}
}
}
await ShutdownAsync();
}
/// <summary>
/// Registers Unix signal handlers.
/// SIGTERM → shutdown, SIGUSR2 → lame duck, SIGUSR1 → log reopen (stub), SIGHUP → reload (stub).
/// </summary>
public void HandleSignals()
{
_signalRegistrations.Add(PosixSignalRegistration.Create(PosixSignal.SIGTERM, ctx =>
{
ctx.Cancel = true;
_logger.LogInformation("Trapped SIGTERM signal");
_ = Task.Run(async () => await ShutdownAsync());
}));
_signalRegistrations.Add(PosixSignalRegistration.Create(PosixSignal.SIGQUIT, ctx =>
{
ctx.Cancel = true;
_logger.LogInformation("Trapped SIGQUIT signal");
_ = Task.Run(async () => await ShutdownAsync());
}));
_signalRegistrations.Add(PosixSignalRegistration.Create(PosixSignal.SIGHUP, ctx =>
{
ctx.Cancel = true;
_logger.LogWarning("Trapped SIGHUP signal — config reload not yet supported");
}));
// SIGUSR1 and SIGUSR2 only on non-Windows
if (!OperatingSystem.IsWindows())
{
_signalRegistrations.Add(PosixSignalRegistration.Create((PosixSignal)10, ctx =>
{
ctx.Cancel = true;
_logger.LogWarning("Trapped SIGUSR1 signal — log reopen not yet supported");
}));
_signalRegistrations.Add(PosixSignalRegistration.Create((PosixSignal)12, ctx =>
{
ctx.Cancel = true;
_logger.LogInformation("Trapped SIGUSR2 signal — entering lame duck mode");
_ = Task.Run(async () => await LameDuckShutdownAsync());
}));
}
}
public NatsServer(NatsOptions options, ILoggerFactory loggerFactory)
{
_options = options;
@@ -50,6 +242,15 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
_authService = AuthService.Build(options);
_globalAccount = new Account(Account.GlobalAccountName);
_accounts[Account.GlobalAccountName] = _globalAccount;
// Create $SYS system account (stub -- no internal subscriptions yet)
_systemAccount = new Account("$SYS");
_accounts["$SYS"] = _systemAccount;
// Generate Ed25519 server NKey identity
using var serverKeyPair = KeyPair.CreatePair(PrefixByte.Server);
ServerNKey = serverKeyPair.GetPublicKey();
_serverInfo = new ServerInfo
{
ServerId = Guid.NewGuid().ToString("N")[..20].ToUpperInvariant(),
@@ -75,6 +276,8 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
public async Task StartAsync(CancellationToken ct)
{
using var linked = CancellationTokenSource.CreateLinkedTokenSource(ct, _quitCts.Token);
_listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
_listener.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.ReuseAddress, true);
_listener.Bind(new IPEndPoint(
@@ -82,23 +285,68 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
_options.Port));
Interlocked.Exchange(ref _startTimeTicks, DateTime.UtcNow.Ticks);
_listener.Listen(128);
// Resolve ephemeral port if port=0
if (_options.Port == 0)
{
var actualPort = ((IPEndPoint)_listener.LocalEndPoint!).Port;
_options.Port = actualPort;
_serverInfo.Port = actualPort;
}
_listeningStarted.TrySetResult();
_logger.LogInformation("Listening on {Host}:{Port}", _options.Host, _options.Port);
_logger.LogInformation("Listening for client connections on {Host}:{Port}", _options.Host, _options.Port);
// Warn about stub features
if (_options.ConfigFile != null)
_logger.LogWarning("Config file parsing not yet supported (file: {ConfigFile})", _options.ConfigFile);
if (_options.ProfPort > 0)
_logger.LogWarning("Profiling endpoint not yet supported (port: {ProfPort})", _options.ProfPort);
if (_options.MonitorPort > 0)
{
_monitorServer = new MonitorServer(this, _options, _stats, _loggerFactory);
await _monitorServer.StartAsync(ct);
await _monitorServer.StartAsync(linked.Token);
}
WritePidFile();
WritePortsFile();
var tmpDelay = AcceptMinSleep;
try
{
while (!ct.IsCancellationRequested)
while (!linked.Token.IsCancellationRequested)
{
var socket = await _listener.AcceptAsync(ct);
Socket socket;
try
{
socket = await _listener.AcceptAsync(linked.Token);
tmpDelay = AcceptMinSleep; // Reset on success
}
catch (OperationCanceledException)
{
break;
}
catch (ObjectDisposedException)
{
break;
}
catch (SocketException ex)
{
if (IsShuttingDown || IsLameDuckMode)
break;
// Check MaxConnections before creating the client
_logger.LogError(ex, "Temporary accept error, sleeping {Delay}ms", tmpDelay.TotalMilliseconds);
try { await Task.Delay(tmpDelay, linked.Token); }
catch (OperationCanceledException) { break; }
tmpDelay = TimeSpan.FromTicks(Math.Min(tmpDelay.Ticks * 2, AcceptMaxSleep.Ticks));
continue;
}
// Check MaxConnections
if (_options.MaxConnections > 0 && _clients.Count >= _options.MaxConnections)
{
_logger.LogWarning("Client connection rejected: maximum connections ({MaxConnections}) exceeded",
@@ -108,13 +356,13 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
var stream = new NetworkStream(socket, ownsSocket: false);
var errBytes = Encoding.ASCII.GetBytes(
$"-ERR '{NatsProtocol.ErrMaxConnectionsExceeded}'\r\n");
await stream.WriteAsync(errBytes, ct);
await stream.FlushAsync(ct);
await stream.WriteAsync(errBytes, linked.Token);
await stream.FlushAsync(linked.Token);
stream.Dispose();
}
catch (Exception ex)
catch (Exception ex2)
{
_logger.LogDebug(ex, "Failed to send -ERR to rejected client");
_logger.LogDebug(ex2, "Failed to send -ERR to rejected client");
}
finally
{
@@ -125,16 +373,21 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
var clientId = Interlocked.Increment(ref _nextClientId);
Interlocked.Increment(ref _stats.TotalConnections);
Interlocked.Increment(ref _activeClientCount);
_logger.LogDebug("Client {ClientId} connected from {RemoteEndpoint}", clientId, socket.RemoteEndPoint);
_ = AcceptClientAsync(socket, clientId, ct);
_ = AcceptClientAsync(socket, clientId, linked.Token);
}
}
catch (OperationCanceledException)
{
_logger.LogDebug("Accept loop cancelled, server shutting down");
}
finally
{
_acceptLoopExited.TrySetResult();
}
}
private async Task AcceptClientAsync(Socket socket, ulong clientId, CancellationToken ct)
@@ -217,8 +470,9 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
}
finally
{
_logger.LogDebug("Client {ClientId} disconnected", client.Id);
_logger.LogDebug("Client {ClientId} disconnected (reason: {CloseReason})", client.Id, client.CloseReason);
RemoveClient(client);
Interlocked.Decrement(ref _activeClientCount);
}
}
@@ -327,10 +581,75 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
client.Account?.RemoveClient(client.Id);
}
private void WritePidFile()
{
if (string.IsNullOrEmpty(_options.PidFile)) return;
try
{
File.WriteAllText(_options.PidFile, Environment.ProcessId.ToString());
_logger.LogDebug("Wrote PID file {PidFile}", _options.PidFile);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error writing PID file {PidFile}", _options.PidFile);
}
}
private void DeletePidFile()
{
if (string.IsNullOrEmpty(_options.PidFile)) return;
try
{
if (File.Exists(_options.PidFile))
File.Delete(_options.PidFile);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error deleting PID file {PidFile}", _options.PidFile);
}
}
private void WritePortsFile()
{
if (string.IsNullOrEmpty(_options.PortsFileDir)) return;
try
{
var exeName = Path.GetFileNameWithoutExtension(Environment.ProcessPath ?? "nats-server");
var fileName = $"{exeName}_{Environment.ProcessId}.ports";
_portsFilePath = Path.Combine(_options.PortsFileDir, fileName);
var ports = new { client = _options.Port, monitor = _options.MonitorPort > 0 ? _options.MonitorPort : (int?)null };
var json = System.Text.Json.JsonSerializer.Serialize(ports);
File.WriteAllText(_portsFilePath, json);
_logger.LogDebug("Wrote ports file {PortsFile}", _portsFilePath);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error writing ports file to {PortsFileDir}", _options.PortsFileDir);
}
}
private void DeletePortsFile()
{
if (_portsFilePath == null) return;
try
{
if (File.Exists(_portsFilePath))
File.Delete(_portsFilePath);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error deleting ports file {PortsFile}", _portsFilePath);
}
}
public void Dispose()
{
if (_monitorServer != null)
_monitorServer.DisposeAsync().AsTask().GetAwaiter().GetResult();
if (!IsShuttingDown)
ShutdownAsync().GetAwaiter().GetResult();
foreach (var reg in _signalRegistrations)
reg.Dispose();
_quitCts.Dispose();
_tlsRateLimiter?.Dispose();
_listener?.Dispose();
foreach (var client in _clients.Values)