fix(alarm-historian): keep queue rows aligned to events on drain (Core.AlarmHistorian-001)

ReadBatch built two parallel lists, rowIds and events: rowIds.Add ran for
every row, but events.Add was guarded by `if (evt is not null)`. A corrupt
or null-deserializing payload therefore left events shorter than rowIds.
Because DrainOnceAsync indexes both lists by the same position, it applied
each outcome to the wrong RowId — an Ack could delete an un-sent event
(silent alarm-event data loss) and the corrupt row stalled the queue
head forever.

ReadBatch now returns a single list of QueueRow(long RowId,
AlarmHistorianEvent? Event) records so a rowId can never drift from its
event; deserialization is wrapped to yield null on JsonException.
DrainOnceAsync immediately dead-letters rows whose payload is
null/un-deserializable and forwards only well-formed events to the
writer, mapping outcomes by RowId.

Regression tests cover a corrupt row mid-batch and at the queue head.
Core.AlarmHistorian suite: 16/16 pass.

Resolves code-review finding Core.AlarmHistorian-001 (Critical).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-22 05:54:20 -04:00
parent cfb9ff1032
commit 796871c210
3 changed files with 130 additions and 13 deletions

View File

@@ -142,8 +142,34 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
_lastDrainUtc = _clock();
PurgeAgedDeadLetters();
var (rowIds, events) = ReadBatch();
if (rowIds.Count == 0)
var batch = ReadBatch();
if (batch.Count == 0)
{
_drainState = HistorianDrainState.Idle;
return;
}
// A null/un-deserializable payload can never succeed — dead-letter it
// immediately for its own RowId so it cannot stall the queue head, and
// exclude it from the batch handed to the writer.
var corruptRowIds = batch.Where(r => r.Event is null).Select(r => r.RowId).ToList();
var liveRows = batch.Where(r => r.Event is not null).ToList();
var events = liveRows.Select(r => r.Event!).ToList();
if (corruptRowIds.Count > 0)
{
using var corruptConn = new SqliteConnection(_connectionString);
corruptConn.Open();
using var corruptTx = corruptConn.BeginTransaction();
foreach (var rowId in corruptRowIds)
DeadLetterRow(corruptConn, corruptTx, rowId, $"corrupt payload at {_clock():O}");
corruptTx.Commit();
_logger.Warning(
"Dead-lettered {Count} historian queue row(s) with un-deserializable payload",
corruptRowIds.Count);
}
if (events.Count == 0)
{
_drainState = HistorianDrainState.Idle;
return;
@@ -179,7 +205,7 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
for (var i = 0; i < outcomes.Count; i++)
{
var outcome = outcomes[i];
var rowId = rowIds[i];
var rowId = liveRows[i].RowId;
switch (outcome)
{
case HistorianWriteOutcome.Ack:
@@ -252,10 +278,17 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
return cmd.ExecuteNonQuery();
}
private (List<long> rowIds, List<AlarmHistorianEvent> events) ReadBatch()
/// <summary>
/// One queued row paired with the event deserialized from its payload.
/// Carrying both in a single value keeps the <see cref="RowId"/> bound to its
/// own row, so a write outcome can never be mapped to the wrong row.
/// </summary>
/// <param name="RowId">Identifier of the queue row this entry was read from.</param>
/// <param name="Event">
/// The deserialized event, or <c>null</c> when the row's <c>PayloadJson</c> is
/// corrupt or un-deserializable. Rows with a <c>null</c> event are dead-lettered
/// by the drain instead of being handed to the writer.
/// </param>
private readonly record struct QueueRow(long RowId, AlarmHistorianEvent? Event);
private List<QueueRow> ReadBatch()
{
var rowIds = new List<long>();
var events = new List<AlarmHistorianEvent>();
var rows = new List<QueueRow>();
using var conn = new SqliteConnection(_connectionString);
conn.Open();
using var cmd = conn.CreateCommand();
@@ -269,12 +302,21 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rowIds.Add(reader.GetInt64(0));
var rowId = reader.GetInt64(0);
var payload = reader.GetString(1);
var evt = JsonSerializer.Deserialize<AlarmHistorianEvent>(payload);
if (evt is not null) events.Add(evt);
AlarmHistorianEvent? evt;
try
{
evt = JsonSerializer.Deserialize<AlarmHistorianEvent>(payload);
}
catch (JsonException)
{
// Malformed JSON — carry a null event so the caller dead-letters this row.
evt = null;
}
rows.Add(new QueueRow(rowId, evt));
}
return (rowIds, events);
return rows;
}
private static void DeleteRow(SqliteConnection conn, SqliteTransaction tx, long rowId)