fix(historian): dead-letter poison events after maxAttempts (finding 002)
This commit is contained in:
@@ -51,6 +51,9 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
public const long DefaultCapacity = 1_000_000;
|
||||
public static readonly TimeSpan DefaultDeadLetterRetention = TimeSpan.FromDays(30);
|
||||
|
||||
/// <summary>Default max delivery attempts before a perpetually-retrying (poison) row is dead-lettered.</summary>
|
||||
public const int DefaultMaxAttempts = 10;
|
||||
|
||||
private static readonly TimeSpan[] BackoffLadder =
|
||||
[
|
||||
TimeSpan.FromSeconds(1),
|
||||
@@ -66,6 +69,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
private readonly int _batchSize;
|
||||
private readonly long _capacity;
|
||||
private readonly TimeSpan _deadLetterRetention;
|
||||
private readonly int _maxAttempts;
|
||||
private readonly Func<DateTime> _clock;
|
||||
|
||||
private readonly SemaphoreSlim _drainGate = new(1, 1);
|
||||
@@ -115,6 +119,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
/// <param name="batchSize">The maximum number of rows to forward in a single batch. Defaults to 100.</param>
|
||||
/// <param name="capacity">The maximum queue capacity before evicting oldest rows. Defaults to 1,000,000.</param>
|
||||
/// <param name="deadLetterRetention">The timespan to retain dead-lettered rows before purging. Defaults to 30 days.</param>
|
||||
/// <param name="maxAttempts">The maximum number of delivery attempts before a perpetually-retrying (poison) row is dead-lettered. Defaults to 10.</param>
|
||||
/// <param name="clock">Optional clock function for testing; defaults to <see cref="DateTime.UtcNow"/>.</param>
|
||||
public SqliteStoreAndForwardSink(
|
||||
string databasePath,
|
||||
@@ -123,6 +128,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
int batchSize = 100,
|
||||
long capacity = DefaultCapacity,
|
||||
TimeSpan? deadLetterRetention = null,
|
||||
int maxAttempts = DefaultMaxAttempts,
|
||||
Func<DateTime>? clock = null)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(databasePath))
|
||||
@@ -132,6 +138,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
_batchSize = batchSize > 0 ? batchSize : throw new ArgumentOutOfRangeException(nameof(batchSize));
|
||||
_capacity = capacity > 0 ? capacity : throw new ArgumentOutOfRangeException(nameof(capacity));
|
||||
_deadLetterRetention = deadLetterRetention ?? DefaultDeadLetterRetention;
|
||||
_maxAttempts = maxAttempts > 0 ? maxAttempts : throw new ArgumentOutOfRangeException(nameof(maxAttempts));
|
||||
_clock = clock ?? (() => DateTime.UtcNow);
|
||||
// DefaultTimeout gives ADO.NET command-level retry; the PRAGMA busy_timeout
|
||||
// applied in OpenConnection backs it with SQLite's own busy-handler so an
|
||||
@@ -457,14 +464,28 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
rowsLeavingQueue++;
|
||||
break;
|
||||
case HistorianWriteOutcome.RetryPlease:
|
||||
BumpAttempt(conn, tx, rowId, "retry-please");
|
||||
// finding 002: cap retries so a perpetually-RetryPlease (poison)
|
||||
// row cannot retry forever at the 60s backoff floor. The incoming
|
||||
// AttemptCount is the count BEFORE this attempt; +1 accounts for the
|
||||
// bump this drain represents. At the cap, dead-letter instead of
|
||||
// bumping — and count it as leaving the live queue like PermanentFail.
|
||||
if (liveRows[i].AttemptCount + 1 >= _maxAttempts)
|
||||
{
|
||||
DeadLetterRow(conn, tx, rowId, $"max attempts ({_maxAttempts}) exceeded");
|
||||
rowsLeavingQueue++;
|
||||
}
|
||||
else
|
||||
{
|
||||
BumpAttempt(conn, tx, rowId, "retry-please");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
tx.Commit();
|
||||
}
|
||||
// Ack-deleted + PermanentFail-dead-lettered rows both leave the
|
||||
// non-dead-lettered queue — keep the counter aligned (Core.AlarmHistorian-008).
|
||||
// non-dead-lettered queue, as do RetryPlease rows that hit the max-attempts
|
||||
// cap (finding 002) — keep the counter aligned (Core.AlarmHistorian-008).
|
||||
if (rowsLeavingQueue > 0)
|
||||
Interlocked.Add(ref _queuedRowCount, -rowsLeavingQueue);
|
||||
|
||||
@@ -552,14 +573,14 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
/// the <see cref="RowId"/> always stays bound to its own row so outcomes can
|
||||
/// never be mapped to the wrong row.
|
||||
/// </summary>
|
||||
private readonly record struct QueueRow(long RowId, AlarmHistorianEvent? Event);
|
||||
private readonly record struct QueueRow(long RowId, AlarmHistorianEvent? Event, long AttemptCount);
|
||||
|
||||
private List<QueueRow> ReadBatch(SqliteConnection conn)
|
||||
{
|
||||
var rows = new List<QueueRow>();
|
||||
using var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT RowId, PayloadJson FROM Queue
|
||||
SELECT RowId, PayloadJson, AttemptCount FROM Queue
|
||||
WHERE DeadLettered = 0
|
||||
ORDER BY RowId ASC
|
||||
LIMIT $limit
|
||||
@@ -570,6 +591,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
{
|
||||
var rowId = reader.GetInt64(0);
|
||||
var payload = reader.GetString(1);
|
||||
var attemptCount = reader.GetInt64(2);
|
||||
AlarmHistorianEvent? evt;
|
||||
try
|
||||
{
|
||||
@@ -580,7 +602,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
// Malformed JSON — carry a null event so the caller dead-letters this row.
|
||||
evt = null;
|
||||
}
|
||||
rows.Add(new QueueRow(rowId, evt));
|
||||
rows.Add(new QueueRow(rowId, evt, attemptCount));
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
@@ -53,6 +53,10 @@ public sealed class AlarmHistorianOptions
|
||||
/// <summary>Days to retain dead-lettered rows before purge. Defaults to 30.</summary>
|
||||
public int DeadLetterRetentionDays { get; init; } = 30;
|
||||
|
||||
/// <summary>Maximum delivery attempts before a perpetually-retrying (poison) row is dead-lettered.
|
||||
/// Defaults to 10 (matches <c>SqliteStoreAndForwardSink</c>'s <c>DefaultMaxAttempts</c>).</summary>
|
||||
public int MaxAttempts { get; init; } = SqliteStoreAndForwardSink.DefaultMaxAttempts;
|
||||
|
||||
/// <summary>Returns operator-facing misconfiguration warnings for an <c>Enabled</c> historian
|
||||
/// (empty when disabled or correctly configured). Pure — the registration logs each entry.</summary>
|
||||
/// <returns>Zero or more human-readable warning messages.</returns>
|
||||
@@ -70,6 +74,8 @@ public sealed class AlarmHistorianOptions
|
||||
warnings.Add($"AlarmHistorian:Capacity is {Capacity} — must be > 0; the sink constructor will throw at startup.");
|
||||
if (DeadLetterRetentionDays <= 0)
|
||||
warnings.Add($"AlarmHistorian:DeadLetterRetentionDays is {DeadLetterRetentionDays} — must be > 0; dead-lettered rows would be purged on every drain tick.");
|
||||
if (MaxAttempts <= 0)
|
||||
warnings.Add($"AlarmHistorian:MaxAttempts is {MaxAttempts} — must be > 0; the sink constructor will throw at startup.");
|
||||
return warnings;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,7 +93,8 @@ public static class ServiceCollectionExtensions
|
||||
Serilog.Log.Logger.ForContext<SqliteStoreAndForwardSink>(),
|
||||
batchSize: opts.BatchSize,
|
||||
capacity: opts.Capacity,
|
||||
deadLetterRetention: TimeSpan.FromDays(opts.DeadLetterRetentionDays));
|
||||
deadLetterRetention: TimeSpan.FromDays(opts.DeadLetterRetentionDays),
|
||||
maxAttempts: opts.MaxAttempts);
|
||||
sink.StartDrainLoop(TimeSpan.FromSeconds(opts.DrainIntervalSeconds));
|
||||
return sink;
|
||||
});
|
||||
|
||||
+32
@@ -268,6 +268,37 @@ public sealed class SqliteStoreAndForwardSinkTests : IDisposable
|
||||
sink.CurrentBackoff.ShouldBe(TimeSpan.FromSeconds(60));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Regression for finding 002: a permanently-malformed (poison) event that the
|
||||
/// writer can only ever map to <see cref="HistorianWriteOutcome.RetryPlease"/>
|
||||
/// must NOT retry forever. After <c>maxAttempts</c> retry-please drains the row
|
||||
/// is dead-lettered so the queue head can never stall on it indefinitely.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RetryPlease_dead_letters_row_after_MaxAttempts()
|
||||
{
|
||||
var writer = new FakeWriter { DefaultOutcome = HistorianWriteOutcome.RetryPlease };
|
||||
using var sink = new SqliteStoreAndForwardSink(
|
||||
_dbPath, writer, _log, maxAttempts: 3);
|
||||
|
||||
await sink.EnqueueAsync(Event("poison"), CancellationToken.None);
|
||||
|
||||
// Drain 1 + 2: still retrying — the row stays queued, not yet dead-lettered.
|
||||
await sink.DrainOnceAsync(CancellationToken.None);
|
||||
sink.GetStatus().QueueDepth.ShouldBe(1, "after 1 attempt the row is still queued");
|
||||
sink.GetStatus().DeadLetterDepth.ShouldBe(0, "not yet dead-lettered after 1 attempt");
|
||||
|
||||
await sink.DrainOnceAsync(CancellationToken.None);
|
||||
sink.GetStatus().QueueDepth.ShouldBe(1, "after 2 attempts the row is still queued");
|
||||
sink.GetStatus().DeadLetterDepth.ShouldBe(0, "not yet dead-lettered after 2 attempts");
|
||||
|
||||
// Drain 3: the 3rd attempt hits the cap — dead-letter it.
|
||||
await sink.DrainOnceAsync(CancellationToken.None);
|
||||
var status = sink.GetStatus();
|
||||
status.QueueDepth.ShouldBe(0, "row left the live queue once max attempts exceeded");
|
||||
status.DeadLetterDepth.ShouldBe(1, "poison row dead-lettered at the max-attempts cap");
|
||||
}
|
||||
|
||||
/// <summary>Verifies that NullAlarmHistorianSink reports disabled status.</summary>
|
||||
[Fact]
|
||||
public void NullAlarmHistorianSink_reports_disabled_status()
|
||||
@@ -295,6 +326,7 @@ public sealed class SqliteStoreAndForwardSinkTests : IDisposable
|
||||
Should.Throw<ArgumentNullException>(() => new SqliteStoreAndForwardSink(_dbPath, w, null!));
|
||||
Should.Throw<ArgumentOutOfRangeException>(() => new SqliteStoreAndForwardSink(_dbPath, w, _log, batchSize: 0));
|
||||
Should.Throw<ArgumentOutOfRangeException>(() => new SqliteStoreAndForwardSink(_dbPath, w, _log, capacity: 0));
|
||||
Should.Throw<ArgumentOutOfRangeException>(() => new SqliteStoreAndForwardSink(_dbPath, w, _log, maxAttempts: 0));
|
||||
}
|
||||
|
||||
/// <summary>Verifies that a disposed sink rejects enqueue operations.</summary>
|
||||
|
||||
Reference in New Issue
Block a user