feat: complete MQTT session flapper detection (Gap 6.6)

Add FlapperState class and per-client exponential backoff tracking to
MqttSessionStore. New TrackConnectDisconnect(string) overload returns
FlapperState with backoff level and expiry; IsFlapper, GetBackoffMs,
ClearFlapperState, and CheckAndClearStableClients give callers full
visibility and cleanup control. Legacy two-arg overload preserved for
backward compatibility. Ten unit tests cover counting, threshold,
exponential growth, 60s cap, window reset, and stable-client sweep.
This commit is contained in:
Joseph Doherty
2026-02-25 11:42:24 -05:00
parent 96db73d1c6
commit 5fea08dda0
2 changed files with 378 additions and 5 deletions

View File

@@ -9,6 +9,25 @@ using NATS.Server.JetStream.Storage;
namespace NATS.Server.Mqtt;
/// <summary>
/// Per-client flapper detection state tracking connect/disconnect cycling.
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 300.
/// </summary>
public sealed class FlapperState
{
/// <summary>Total number of connect/disconnect events tracked in the current window.</summary>
public int ConnectDisconnectCount { get; set; }
/// <summary>Start of the current detection window.</summary>
public DateTime WindowStart { get; set; }
/// <summary>When the backoff expires, or null if not currently backing off.</summary>
public DateTime? BackoffUntil { get; set; }
/// <summary>Current exponential backoff level (0 = 1s, 1 = 2s, 2 = 4s, …, capped at 60s).</summary>
public int BackoffLevel { get; set; }
}
/// <summary>
/// Will message to be published on abnormal client disconnection.
/// Go reference: server/mqtt.go mqttWill struct ~line 270.
@@ -49,6 +68,7 @@ public sealed class MqttSessionStore
{
private readonly ConcurrentDictionary<string, MqttSessionData> _sessions = new(StringComparer.Ordinal);
private readonly ConcurrentDictionary<string, List<DateTime>> _connectHistory = new(StringComparer.Ordinal);
private readonly ConcurrentDictionary<string, FlapperState> _flapperStates = new(StringComparer.Ordinal);
private readonly ConcurrentDictionary<string, WillMessage> _wills = new(StringComparer.Ordinal);
private readonly ConcurrentDictionary<string, (WillMessage Will, DateTime ScheduledAt)> _delayedWills = new(StringComparer.Ordinal);
@@ -196,28 +216,133 @@ public sealed class MqttSessionStore
_sessions.Values.ToList();
/// <summary>
/// Tracks a connect or disconnect event for flapper detection.
/// Backward-compatible overload: tracks a connect event (disconnect is ignored).
/// Delegates to <see cref="TrackConnectDisconnect(string)"/> when <paramref name="connected"/> is true.
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 300.
/// </summary>
/// <param name="clientId">The MQTT client identifier.</param>
/// <param name="connected">True for connect, false for disconnect.</param>
public void TrackConnectDisconnect(string clientId, bool connected)
{
if (connected)
TrackConnectDisconnect(clientId);
// Also maintain the legacy _connectHistory for ShouldApplyBackoff callers
if (!connected)
return;
var now = _timeProvider.GetUtcNow().UtcDateTime;
var history = _connectHistory.GetOrAdd(clientId, static _ => []);
lock (history)
{
// Prune entries outside the flap window
var cutoff = now - _flapWindow;
history.RemoveAll(t => t < cutoff);
history.Add(now);
}
}
/// <summary>
/// Tracks a connect or disconnect event for flapper detection.
/// Increments the count for the client within the detection window. If 3+ events
/// occur within 10 seconds the client is marked as a flapper and exponential
/// backoff is applied: min(2^backoffLevel * 1000, 60000) ms.
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 300.
/// </summary>
/// <param name="clientId">The MQTT client identifier.</param>
/// <returns>The updated <see cref="FlapperState"/> for the client.</returns>
public FlapperState TrackConnectDisconnect(string clientId)
{
var now = _timeProvider.GetUtcNow().UtcDateTime;
var state = _flapperStates.GetOrAdd(clientId, static _ => new FlapperState
{
WindowStart = DateTime.UtcNow,
});
lock (state)
{
// Reset window if we're past the flap window duration
if (now - state.WindowStart > _flapWindow)
{
state.ConnectDisconnectCount = 0;
state.WindowStart = now;
}
state.ConnectDisconnectCount++;
if (state.ConnectDisconnectCount >= _flapThreshold)
{
// Exponential backoff: min(2^backoffLevel * 1000 ms, 60000 ms)
var backoffMs = Math.Min((int)Math.Pow(2, state.BackoffLevel) * 1000, 60_000);
state.BackoffUntil = now + TimeSpan.FromMilliseconds(backoffMs);
state.BackoffLevel++;
}
}
return state;
}
/// <summary>
/// Returns true if the client is currently in a backoff period (is a flapper).
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 320.
/// </summary>
public bool IsFlapper(string clientId)
{
if (!_flapperStates.TryGetValue(clientId, out var state))
return false;
var now = _timeProvider.GetUtcNow().UtcDateTime;
lock (state)
{
return state.BackoffUntil.HasValue && state.BackoffUntil.Value > now;
}
}
/// <summary>
/// Returns the remaining backoff in milliseconds, or 0 if the client is not flapping.
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 325.
/// </summary>
public long GetBackoffMs(string clientId)
{
if (!_flapperStates.TryGetValue(clientId, out var state))
return 0;
var now = _timeProvider.GetUtcNow().UtcDateTime;
lock (state)
{
if (!state.BackoffUntil.HasValue || state.BackoffUntil.Value <= now)
return 0;
return (long)(state.BackoffUntil.Value - now).TotalMilliseconds;
}
}
/// <summary>
/// Removes all flapper tracking state for the given client.
/// Called when stability is restored or the client is cleanly disconnected.
/// </summary>
public void ClearFlapperState(string clientId) =>
_flapperStates.TryRemove(clientId, out _);
/// <summary>
/// Clears flapper states for clients whose <see cref="FlapperState.BackoffUntil"/> has
/// expired by at least <paramref name="stableThreshold"/> ago, indicating the client
/// has been stable for the given duration.
/// </summary>
/// <param name="stableThreshold">How long past the BackoffUntil expiry before clearing.</param>
public void CheckAndClearStableClients(TimeSpan stableThreshold)
{
var now = _timeProvider.GetUtcNow().UtcDateTime;
foreach (var (clientId, state) in _flapperStates)
{
lock (state)
{
if (state.BackoffUntil.HasValue && now - state.BackoffUntil.Value >= stableThreshold)
{
_flapperStates.TryRemove(clientId, out _);
}
}
}
}
/// <summary>
/// Returns the backoff delay if the client is flapping, otherwise <see cref="TimeSpan.Zero"/>.
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 320.