fix(historian): dead-letter poison events after maxAttempts (finding 002)

This commit is contained in:
Joseph Doherty
2026-06-16 05:25:43 -04:00
parent 5e27b5f708
commit fcb3801415
4 changed files with 67 additions and 6 deletions
@@ -51,6 +51,9 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
public const long DefaultCapacity = 1_000_000;
public static readonly TimeSpan DefaultDeadLetterRetention = TimeSpan.FromDays(30);
/// <summary>Default max delivery attempts before a perpetually-retrying (poison) row is dead-lettered.</summary>
public const int DefaultMaxAttempts = 10;
private static readonly TimeSpan[] BackoffLadder =
[
TimeSpan.FromSeconds(1),
@@ -66,6 +69,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
private readonly int _batchSize;
private readonly long _capacity;
private readonly TimeSpan _deadLetterRetention;
private readonly int _maxAttempts;
private readonly Func<DateTime> _clock;
private readonly SemaphoreSlim _drainGate = new(1, 1);
@@ -115,6 +119,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
/// <param name="batchSize">The maximum number of rows to forward in a single batch. Defaults to 100.</param>
/// <param name="capacity">The maximum queue capacity before evicting oldest rows. Defaults to 1,000,000.</param>
/// <param name="deadLetterRetention">The timespan to retain dead-lettered rows before purging. Defaults to 30 days.</param>
/// <param name="maxAttempts">The maximum number of delivery attempts before a perpetually-retrying (poison) row is dead-lettered. Must be greater than zero. Defaults to 10.</param>
/// <param name="clock">Optional clock function for testing; defaults to <see cref="DateTime.UtcNow"/>.</param>
public SqliteStoreAndForwardSink(
string databasePath,
@@ -123,6 +128,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
int batchSize = 100,
long capacity = DefaultCapacity,
TimeSpan? deadLetterRetention = null,
int maxAttempts = DefaultMaxAttempts,
Func<DateTime>? clock = null)
{
if (string.IsNullOrWhiteSpace(databasePath))
@@ -132,6 +138,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
_batchSize = batchSize > 0 ? batchSize : throw new ArgumentOutOfRangeException(nameof(batchSize));
_capacity = capacity > 0 ? capacity : throw new ArgumentOutOfRangeException(nameof(capacity));
_deadLetterRetention = deadLetterRetention ?? DefaultDeadLetterRetention;
_maxAttempts = maxAttempts > 0 ? maxAttempts : throw new ArgumentOutOfRangeException(nameof(maxAttempts));
_clock = clock ?? (() => DateTime.UtcNow);
// DefaultTimeout gives ADO.NET command-level retry; the PRAGMA busy_timeout
// applied in OpenConnection backs it with SQLite's own busy-handler so an
@@ -457,14 +464,28 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
rowsLeavingQueue++;
break;
case HistorianWriteOutcome.RetryPlease:
BumpAttempt(conn, tx, rowId, "retry-please");
// finding 002: cap retries so a perpetually-RetryPlease (poison) row
// cannot retry forever at the 60s backoff floor. The incoming
// AttemptCount is the count BEFORE this attempt, so AttemptCount + 1 is
// the number of attempts made including this one. Once that reaches
// _maxAttempts, dead-letter the row instead of bumping it, and count it
// as leaving the live queue, just like PermanentFail.
if (liveRows[i].AttemptCount + 1 >= _maxAttempts)
{
DeadLetterRow(conn, tx, rowId, $"max attempts ({_maxAttempts}) exceeded");
rowsLeavingQueue++;
}
else
{
BumpAttempt(conn, tx, rowId, "retry-please");
}
break;
}
}
tx.Commit();
}
// Ack-deleted + PermanentFail-dead-lettered rows both leave the
// non-dead-lettered queue — keep the counter aligned (Core.AlarmHistorian-008).
// non-dead-lettered queue, as do RetryPlease rows that hit the max-attempts
// cap (finding 002) — keep the counter aligned (Core.AlarmHistorian-008).
if (rowsLeavingQueue > 0)
Interlocked.Add(ref _queuedRowCount, -rowsLeavingQueue);
@@ -552,14 +573,14 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
/// the <see cref="RowId"/> always stays bound to its own row so outcomes can
/// never be mapped to the wrong row.
/// </summary>
private readonly record struct QueueRow(long RowId, AlarmHistorianEvent? Event);
private readonly record struct QueueRow(long RowId, AlarmHistorianEvent? Event, long AttemptCount);
private List<QueueRow> ReadBatch(SqliteConnection conn)
{
var rows = new List<QueueRow>();
using var cmd = conn.CreateCommand();
cmd.CommandText = """
SELECT RowId, PayloadJson FROM Queue
SELECT RowId, PayloadJson, AttemptCount FROM Queue
WHERE DeadLettered = 0
ORDER BY RowId ASC
LIMIT $limit
@@ -570,6 +591,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
{
var rowId = reader.GetInt64(0);
var payload = reader.GetString(1);
var attemptCount = reader.GetInt64(2);
AlarmHistorianEvent? evt;
try
{
@@ -580,7 +602,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
// Malformed JSON — carry a null event so the caller dead-letters this row.
evt = null;
}
rows.Add(new QueueRow(rowId, evt));
rows.Add(new QueueRow(rowId, evt, attemptCount));
}
return rows;
}