feat: complete MQTT session flapper detection (Gap 6.6)
Add FlapperState class and per-client exponential backoff tracking to MqttSessionStore. New TrackConnectDisconnect(string) overload returns FlapperState with backoff level and expiry; IsFlapper, GetBackoffMs, ClearFlapperState, and CheckAndClearStableClients give callers full visibility and cleanup control. Legacy two-arg overload preserved for backward compatibility. Ten unit tests cover counting, threshold, exponential growth, 60s cap, window reset, and stable-client sweep.
This commit is contained in:
@@ -9,6 +9,25 @@ using NATS.Server.JetStream.Storage;
|
||||
|
||||
namespace NATS.Server.Mqtt;
|
||||
|
||||
/// <summary>
|
||||
/// Per-client flapper detection state tracking connect/disconnect cycling.
|
||||
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 300.
|
||||
/// </summary>
|
||||
public sealed class FlapperState
|
||||
{
|
||||
/// <summary>Total number of connect/disconnect events tracked in the current window.</summary>
|
||||
public int ConnectDisconnectCount { get; set; }
|
||||
|
||||
/// <summary>Start of the current detection window.</summary>
|
||||
public DateTime WindowStart { get; set; }
|
||||
|
||||
/// <summary>When the backoff expires, or null if not currently backing off.</summary>
|
||||
public DateTime? BackoffUntil { get; set; }
|
||||
|
||||
/// <summary>Current exponential backoff level (0 = 1s, 1 = 2s, 2 = 4s, …, capped at 60s).</summary>
|
||||
public int BackoffLevel { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Will message to be published on abnormal client disconnection.
|
||||
/// Go reference: server/mqtt.go mqttWill struct ~line 270.
|
||||
@@ -49,6 +68,7 @@ public sealed class MqttSessionStore
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, MqttSessionData> _sessions = new(StringComparer.Ordinal);
|
||||
private readonly ConcurrentDictionary<string, List<DateTime>> _connectHistory = new(StringComparer.Ordinal);
|
||||
private readonly ConcurrentDictionary<string, FlapperState> _flapperStates = new(StringComparer.Ordinal);
|
||||
private readonly ConcurrentDictionary<string, WillMessage> _wills = new(StringComparer.Ordinal);
|
||||
private readonly ConcurrentDictionary<string, (WillMessage Will, DateTime ScheduledAt)> _delayedWills = new(StringComparer.Ordinal);
|
||||
|
||||
@@ -196,28 +216,133 @@ public sealed class MqttSessionStore
|
||||
_sessions.Values.ToList();
|
||||
|
||||
/// <summary>
|
||||
/// Tracks a connect or disconnect event for flapper detection.
|
||||
/// Backward-compatible overload: tracks a connect event (disconnect is ignored).
|
||||
/// Delegates to <see cref="TrackConnectDisconnect(string)"/> when <paramref name="connected"/> is true.
|
||||
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 300.
|
||||
/// </summary>
|
||||
/// <param name="clientId">The MQTT client identifier.</param>
|
||||
/// <param name="connected">True for connect, false for disconnect.</param>
|
||||
public void TrackConnectDisconnect(string clientId, bool connected)
|
||||
{
|
||||
if (connected)
|
||||
TrackConnectDisconnect(clientId);
|
||||
|
||||
// Also maintain the legacy _connectHistory for ShouldApplyBackoff callers
|
||||
if (!connected)
|
||||
return;
|
||||
|
||||
var now = _timeProvider.GetUtcNow().UtcDateTime;
|
||||
var history = _connectHistory.GetOrAdd(clientId, static _ => []);
|
||||
|
||||
lock (history)
|
||||
{
|
||||
// Prune entries outside the flap window
|
||||
var cutoff = now - _flapWindow;
|
||||
history.RemoveAll(t => t < cutoff);
|
||||
history.Add(now);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Tracks a connect or disconnect event for flapper detection.
|
||||
/// Increments the count for the client within the detection window. If 3+ events
|
||||
/// occur within 10 seconds the client is marked as a flapper and exponential
|
||||
/// backoff is applied: min(2^backoffLevel * 1000, 60000) ms.
|
||||
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 300.
|
||||
/// </summary>
|
||||
/// <param name="clientId">The MQTT client identifier.</param>
|
||||
/// <returns>The updated <see cref="FlapperState"/> for the client.</returns>
|
||||
public FlapperState TrackConnectDisconnect(string clientId)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow().UtcDateTime;
|
||||
|
||||
var state = _flapperStates.GetOrAdd(clientId, static _ => new FlapperState
|
||||
{
|
||||
WindowStart = DateTime.UtcNow,
|
||||
});
|
||||
|
||||
lock (state)
|
||||
{
|
||||
// Reset window if we're past the flap window duration
|
||||
if (now - state.WindowStart > _flapWindow)
|
||||
{
|
||||
state.ConnectDisconnectCount = 0;
|
||||
state.WindowStart = now;
|
||||
}
|
||||
|
||||
state.ConnectDisconnectCount++;
|
||||
|
||||
if (state.ConnectDisconnectCount >= _flapThreshold)
|
||||
{
|
||||
// Exponential backoff: min(2^backoffLevel * 1000 ms, 60000 ms)
|
||||
var backoffMs = Math.Min((int)Math.Pow(2, state.BackoffLevel) * 1000, 60_000);
|
||||
state.BackoffUntil = now + TimeSpan.FromMilliseconds(backoffMs);
|
||||
state.BackoffLevel++;
|
||||
}
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns true if the client is currently in a backoff period (is a flapper).
|
||||
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 320.
|
||||
/// </summary>
|
||||
public bool IsFlapper(string clientId)
|
||||
{
|
||||
if (!_flapperStates.TryGetValue(clientId, out var state))
|
||||
return false;
|
||||
|
||||
var now = _timeProvider.GetUtcNow().UtcDateTime;
|
||||
lock (state)
|
||||
{
|
||||
return state.BackoffUntil.HasValue && state.BackoffUntil.Value > now;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the remaining backoff in milliseconds, or 0 if the client is not flapping.
|
||||
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 325.
|
||||
/// </summary>
|
||||
public long GetBackoffMs(string clientId)
|
||||
{
|
||||
if (!_flapperStates.TryGetValue(clientId, out var state))
|
||||
return 0;
|
||||
|
||||
var now = _timeProvider.GetUtcNow().UtcDateTime;
|
||||
lock (state)
|
||||
{
|
||||
if (!state.BackoffUntil.HasValue || state.BackoffUntil.Value <= now)
|
||||
return 0;
|
||||
|
||||
return (long)(state.BackoffUntil.Value - now).TotalMilliseconds;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes all flapper tracking state for the given client.
|
||||
/// Called when stability is restored or the client is cleanly disconnected.
|
||||
/// </summary>
|
||||
public void ClearFlapperState(string clientId) =>
|
||||
_flapperStates.TryRemove(clientId, out _);
|
||||
|
||||
/// <summary>
|
||||
/// Clears flapper states for clients whose <see cref="FlapperState.BackoffUntil"/> has
|
||||
/// expired by at least <paramref name="stableThreshold"/> ago, indicating the client
|
||||
/// has been stable for the given duration.
|
||||
/// </summary>
|
||||
/// <param name="stableThreshold">How long past the BackoffUntil expiry before clearing.</param>
|
||||
public void CheckAndClearStableClients(TimeSpan stableThreshold)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow().UtcDateTime;
|
||||
foreach (var (clientId, state) in _flapperStates)
|
||||
{
|
||||
lock (state)
|
||||
{
|
||||
if (state.BackoffUntil.HasValue && now - state.BackoffUntil.Value >= stableThreshold)
|
||||
{
|
||||
_flapperStates.TryRemove(clientId, out _);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the backoff delay if the client is flapping, otherwise <see cref="TimeSpan.Zero"/>.
|
||||
/// Go reference: server/mqtt.go mqttCheckFlapper ~line 320.
|
||||
|
||||
Reference in New Issue
Block a user