fix(core-alarm-historian): resolve Low code-review findings (Core.AlarmHistorian-008,011)
- Core.AlarmHistorian-008: cache queue depth in an Interlocked counter so EnqueueAsync no longer runs COUNT(*) on every alarm; consolidate DrainOnceAsync onto a single SqliteConnection per tick (purge, batch read, dead-letter, and outcome transaction all share it). - Core.AlarmHistorian-011: confirm the stale Galaxy.Host XML doc references were already fixed under earlier commits; flip to Resolved. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -87,6 +87,25 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
// having to scrape the WARN log.
|
||||
private long _evictedCount;
|
||||
|
||||
// Core.AlarmHistorian-008: keep an approximate in-memory count of non-dead-lettered
|
||||
// rows so EnqueueAsync does not need to run a SELECT COUNT(*) on every call. The
|
||||
// counter is seeded from storage at construction, kept current by every mutation
|
||||
// (Enqueue, Drain, RetryDeadLettered, PurgeAgedDeadLetters, EnforceCapacity), and
|
||||
// periodically re-synced from storage as a safety net against drift.
|
||||
// Mutations cross threads (EnqueueAsync is called from the emitting thread, drain
|
||||
// runs on the timer / drain thread) so it is updated via Interlocked.
|
||||
private long _queuedRowCount;
|
||||
// Probe counter — incremented every time we actually issue a real COUNT(*) for
|
||||
// capacity enforcement. Exposed through DebugCapacityProbeCount for test instrumentation only.
|
||||
private long _capacityProbeCount;
|
||||
// After every Nth enqueue we resync the in-memory counter from storage to defend
|
||||
// against silent drift (e.g. an external process editing the DB).
|
||||
private const long ResyncEnqueueInterval = 10_000;
|
||||
private long _enqueuesSinceResync;
|
||||
|
||||
/// <summary>Test-only: number of real <c>COUNT(*)</c> probes issued against storage (the startup seed, each periodic resync, and each precise capacity check).</summary>
|
||||
public long DebugCapacityProbeCount => Interlocked.Read(ref _capacityProbeCount);
|
||||
|
||||
public SqliteStoreAndForwardSink(
|
||||
string databasePath,
|
||||
IAlarmHistorianWriter writer,
|
||||
@@ -115,6 +134,9 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
}.ToString();
|
||||
|
||||
InitializeSchema();
|
||||
// Core.AlarmHistorian-008: seed the in-memory counter from storage so the
|
||||
// perf-optimised EnqueueAsync path starts in sync with what's on disk.
|
||||
_queuedRowCount = ProbeQueuedRowCount();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -223,7 +245,11 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
await conn.OpenAsync(cancellationToken).ConfigureAwait(false);
|
||||
await ApplyPragmasAsync(conn, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
await EnforceCapacityAsync(conn, cancellationToken).ConfigureAwait(false);
|
||||
// Core.AlarmHistorian-008: use the in-memory counter to short-circuit the
|
||||
// capacity check on every enqueue. The bare hot path is now one INSERT — no
|
||||
// SELECT COUNT(*). We fall back to a real probe only when the cached counter
|
||||
// says we're at or above capacity, or periodically to defend against drift.
|
||||
await EnforceCapacityFastPathAsync(conn, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
using var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
@@ -234,6 +260,57 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
cmd.Parameters.AddWithValue("$enqueued", _clock().ToString("O"));
|
||||
cmd.Parameters.AddWithValue("$payload", JsonSerializer.Serialize(evt));
|
||||
await cmd.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
|
||||
Interlocked.Increment(ref _queuedRowCount);
|
||||
}
|
||||
|
||||
/// <summary>
/// Hot-path capacity gate for enqueue. Bumps the enqueues-since-resync tally,
/// reads the cached non-dead-lettered row count, and refreshes that count from
/// storage once the tally reaches <c>ResyncEnqueueInterval</c>. When the
/// (possibly refreshed) cached count is below <c>_capacity</c> nothing else
/// happens; otherwise control passes to <see cref="EnforceCapacityAsync"/>,
/// which issues its own <c>COUNT(*)</c> and evicts as needed.
/// </summary>
/// <param name="conn">Connection used for the resync query and the precise capacity check.</param>
/// <param name="ct">Cancellation token passed through to both awaited calls.</param>
private async Task EnforceCapacityFastPathAsync(SqliteConnection conn, CancellationToken ct)
{
    var sinceLastResync = Interlocked.Increment(ref _enqueuesSinceResync);
    var knownDepth = Interlocked.Read(ref _queuedRowCount);

    // Resync tick: replace the cached depth with a fresh reading from storage,
    // then restart the tally.
    if (sinceLastResync >= ResyncEnqueueInterval)
    {
        await ResyncQueuedRowCountAsync(conn, ct).ConfigureAwait(false);
        knownDepth = Interlocked.Read(ref _queuedRowCount);
        Interlocked.Exchange(ref _enqueuesSinceResync, 0);
    }

    // Only a cached depth at or beyond the capacity wall warrants the precise
    // (COUNT(*) + eviction) path; anything lower skips the probe entirely.
    if (knownDepth >= _capacity)
    {
        await EnforceCapacityAsync(conn, ct).ConfigureAwait(false);
    }
}
|
||||
|
||||
/// <summary>
/// Blocking <c>COUNT(*)</c> over rows with <c>DeadLettered = 0</c>, run on a
/// connection obtained from <c>OpenConnection()</c>. Each call also bumps the
/// probe counter. Used at startup to seed the cached queue depth.
/// </summary>
/// <returns>The row count reported by the query, or 0 when the scalar result is null.</returns>
private long ProbeQueuedRowCount()
{
    Interlocked.Increment(ref _capacityProbeCount);

    using (var conn = OpenConnection())
    using (var cmd = conn.CreateCommand())
    {
        cmd.CommandText = "SELECT COUNT(*) FROM Queue WHERE DeadLettered = 0";
        var scalar = cmd.ExecuteScalar();
        return scalar is null ? 0L : (long)scalar;
    }
}
|
||||
|
||||
/// <summary>
/// Async refresh of the cached queue depth: runs <c>COUNT(*)</c> over rows with
/// <c>DeadLettered = 0</c> on the supplied connection and overwrites
/// <c>_queuedRowCount</c> with the result. Each call also bumps the probe counter.
/// </summary>
/// <param name="conn">Connection the count query is issued on.</param>
/// <param name="ct">Cancellation token for the query.</param>
private async Task ResyncQueuedRowCountAsync(SqliteConnection conn, CancellationToken ct)
{
    Interlocked.Increment(ref _capacityProbeCount);

    long onDisk;
    using (var cmd = conn.CreateCommand())
    {
        cmd.CommandText = "SELECT COUNT(*) FROM Queue WHERE DeadLettered = 0";
        var scalar = await cmd.ExecuteScalarAsync(ct).ConfigureAwait(false);
        onDisk = scalar is null ? 0L : (long)scalar;
    }

    Interlocked.Exchange(ref _queuedRowCount, onDisk);
}
|
||||
|
||||
/// <summary>
|
||||
@@ -242,6 +319,12 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
/// on RetryPlease. Safe to call from multiple threads; the semaphore enforces
|
||||
/// serial execution.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Core.AlarmHistorian-008: every per-tick SQLite operation runs through a
|
||||
/// single shared connection (purge, read, corrupt-row dead-letter, and the
|
||||
/// outcome-applying transaction). Pre-fix the drain opened three independent
|
||||
/// connections per tick, each paying the open + PRAGMA cost.
|
||||
/// </remarks>
|
||||
public async Task DrainOnceAsync(CancellationToken ct)
|
||||
{
|
||||
if (_disposed) return;
|
||||
@@ -254,8 +337,12 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
_lastDrainUtc = _clock();
|
||||
}
|
||||
|
||||
PurgeAgedDeadLetters();
|
||||
var batch = ReadBatch();
|
||||
// One connection per drain tick — used by purge, read, corrupt-dead-letter,
|
||||
// and the outcome-applying transaction.
|
||||
using var conn = OpenConnection();
|
||||
|
||||
PurgeAgedDeadLetters(conn);
|
||||
var batch = ReadBatch(conn);
|
||||
if (batch.Count == 0)
|
||||
{
|
||||
lock (_statusLock) { _drainState = HistorianDrainState.Idle; }
|
||||
@@ -271,11 +358,13 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
|
||||
if (corruptRowIds.Count > 0)
|
||||
{
|
||||
using var corruptConn = OpenConnection();
|
||||
using var corruptTx = corruptConn.BeginTransaction();
|
||||
using var corruptTx = conn.BeginTransaction();
|
||||
foreach (var rowId in corruptRowIds)
|
||||
DeadLetterRow(corruptConn, corruptTx, rowId, $"corrupt payload at {_clock():O}");
|
||||
DeadLetterRow(conn, corruptTx, rowId, $"corrupt payload at {_clock():O}");
|
||||
corruptTx.Commit();
|
||||
// Each corrupt row leaves the non-dead-lettered queue — bookkeeping for
|
||||
// the in-memory counter (Core.AlarmHistorian-008).
|
||||
Interlocked.Add(ref _queuedRowCount, -corruptRowIds.Count);
|
||||
_logger.Warning(
|
||||
"Dead-lettered {Count} historian queue row(s) with un-deserializable payload",
|
||||
corruptRowIds.Count);
|
||||
@@ -330,26 +419,34 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
return;
|
||||
}
|
||||
|
||||
using var conn = OpenConnection();
|
||||
using var tx = conn.BeginTransaction();
|
||||
for (var i = 0; i < outcomes.Count; i++)
|
||||
int rowsLeavingQueue = 0;
|
||||
using (var tx = conn.BeginTransaction())
|
||||
{
|
||||
var outcome = outcomes[i];
|
||||
var rowId = liveRows[i].RowId;
|
||||
switch (outcome)
|
||||
for (var i = 0; i < outcomes.Count; i++)
|
||||
{
|
||||
case HistorianWriteOutcome.Ack:
|
||||
DeleteRow(conn, tx, rowId);
|
||||
break;
|
||||
case HistorianWriteOutcome.PermanentFail:
|
||||
DeadLetterRow(conn, tx, rowId, $"permanent fail at {_clock():O}");
|
||||
break;
|
||||
case HistorianWriteOutcome.RetryPlease:
|
||||
BumpAttempt(conn, tx, rowId, "retry-please");
|
||||
break;
|
||||
var outcome = outcomes[i];
|
||||
var rowId = liveRows[i].RowId;
|
||||
switch (outcome)
|
||||
{
|
||||
case HistorianWriteOutcome.Ack:
|
||||
DeleteRow(conn, tx, rowId);
|
||||
rowsLeavingQueue++;
|
||||
break;
|
||||
case HistorianWriteOutcome.PermanentFail:
|
||||
DeadLetterRow(conn, tx, rowId, $"permanent fail at {_clock():O}");
|
||||
rowsLeavingQueue++;
|
||||
break;
|
||||
case HistorianWriteOutcome.RetryPlease:
|
||||
BumpAttempt(conn, tx, rowId, "retry-please");
|
||||
break;
|
||||
}
|
||||
}
|
||||
tx.Commit();
|
||||
}
|
||||
tx.Commit();
|
||||
// Ack-deleted + PermanentFail-dead-lettered rows both leave the
|
||||
// non-dead-lettered queue — keep the counter aligned (Core.AlarmHistorian-008).
|
||||
if (rowsLeavingQueue > 0)
|
||||
Interlocked.Add(ref _queuedRowCount, -rowsLeavingQueue);
|
||||
|
||||
var acks = outcomes.Count(o => o == HistorianWriteOutcome.Ack);
|
||||
lock (_statusLock)
|
||||
@@ -375,15 +472,15 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
|
||||
public HistorianSinkStatus GetStatus()
|
||||
{
|
||||
using var conn = OpenConnection();
|
||||
// Core.AlarmHistorian-008: read the non-dead-lettered count from the in-memory
|
||||
// counter so a busy Admin UI / health probe does not hammer the DB. Dead-letter
|
||||
// depth is rare-path only (it lives in the queue until retention) so a real
|
||||
// COUNT(*) on a single combined connection is fine.
|
||||
var queued = Interlocked.Read(ref _queuedRowCount);
|
||||
if (queued < 0) queued = 0;
|
||||
|
||||
long queued;
|
||||
long deadlettered;
|
||||
using (var cmd = conn.CreateCommand())
|
||||
{
|
||||
cmd.CommandText = "SELECT COUNT(*) FROM Queue WHERE DeadLettered = 0";
|
||||
queued = (long)(cmd.ExecuteScalar() ?? 0L);
|
||||
}
|
||||
using (var conn = OpenConnection())
|
||||
using (var cmd = conn.CreateCommand())
|
||||
{
|
||||
cmd.CommandText = "SELECT COUNT(*) FROM Queue WHERE DeadLettered = 1";
|
||||
@@ -421,7 +518,11 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
using var conn = OpenConnection();
|
||||
using var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = "UPDATE Queue SET DeadLettered = 0, AttemptCount = 0, LastError = NULL WHERE DeadLettered = 1";
|
||||
return cmd.ExecuteNonQuery();
|
||||
var revived = cmd.ExecuteNonQuery();
|
||||
// Dead-lettered rows rejoin the non-dead-lettered queue — keep the in-memory
|
||||
// counter aligned (Core.AlarmHistorian-008).
|
||||
if (revived > 0) Interlocked.Add(ref _queuedRowCount, revived);
|
||||
return revived;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -432,10 +533,9 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
/// </summary>
|
||||
private readonly record struct QueueRow(long RowId, AlarmHistorianEvent? Event);
|
||||
|
||||
private List<QueueRow> ReadBatch()
|
||||
private List<QueueRow> ReadBatch(SqliteConnection conn)
|
||||
{
|
||||
var rows = new List<QueueRow>();
|
||||
using var conn = OpenConnection();
|
||||
using var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT RowId, PayloadJson FROM Queue
|
||||
@@ -501,50 +601,21 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
cmd.ExecuteNonQuery();
|
||||
}
|
||||
|
||||
private void EnforceCapacity(SqliteConnection conn)
|
||||
{
|
||||
// Count non-dead-lettered rows only — dead-lettered rows retain for
|
||||
// post-mortem per the configured retention window.
|
||||
long count;
|
||||
using (var cmd = conn.CreateCommand())
|
||||
{
|
||||
cmd.CommandText = "SELECT COUNT(*) FROM Queue WHERE DeadLettered = 0";
|
||||
count = (long)(cmd.ExecuteScalar() ?? 0L);
|
||||
}
|
||||
if (count < _capacity) return;
|
||||
|
||||
var toEvict = count - _capacity + 1;
|
||||
using (var cmd = conn.CreateCommand())
|
||||
{
|
||||
cmd.CommandText = """
|
||||
DELETE FROM Queue
|
||||
WHERE RowId IN (
|
||||
SELECT RowId FROM Queue
|
||||
WHERE DeadLettered = 0
|
||||
ORDER BY RowId ASC
|
||||
LIMIT $n
|
||||
)
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$n", toEvict);
|
||||
cmd.ExecuteNonQuery();
|
||||
}
|
||||
// Core.AlarmHistorian-009: increment the lifetime eviction counter so the
|
||||
// Admin UI / health check can report overflow without requiring log scraping.
|
||||
lock (_statusLock) { _evictedCount += toEvict; }
|
||||
_logger.Warning(
|
||||
"Historian queue at capacity {Cap} — evicted {Count} oldest row(s) to make room (lifetime evictions: {Total})",
|
||||
_capacity, toEvict, _evictedCount);
|
||||
}
|
||||
|
||||
// Async variant used by EnqueueAsync (Core.AlarmHistorian-003).
|
||||
// Core.AlarmHistorian-008: the precise path — runs COUNT(*) to compute the exact
|
||||
// number of rows to evict. Reached only from the fast-path fallback when the
|
||||
// in-memory counter says we are at or above capacity.
|
||||
private async Task EnforceCapacityAsync(SqliteConnection conn, CancellationToken ct)
|
||||
{
|
||||
Interlocked.Increment(ref _capacityProbeCount);
|
||||
long count;
|
||||
using (var cmd = conn.CreateCommand())
|
||||
{
|
||||
cmd.CommandText = "SELECT COUNT(*) FROM Queue WHERE DeadLettered = 0";
|
||||
count = (long)(await cmd.ExecuteScalarAsync(ct).ConfigureAwait(false) ?? 0L);
|
||||
}
|
||||
// Resync the in-memory counter while we have a fresh number.
|
||||
Interlocked.Exchange(ref _queuedRowCount, count);
|
||||
if (count < _capacity) return;
|
||||
|
||||
var toEvict = count - _capacity + 1;
|
||||
@@ -562,16 +633,16 @@ public sealed class SqliteStoreAndForwardSink : IAlarmHistorianSink, IDisposable
|
||||
cmd.Parameters.AddWithValue("$n", toEvict);
|
||||
await cmd.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
|
||||
}
|
||||
Interlocked.Add(ref _queuedRowCount, -toEvict);
|
||||
lock (_statusLock) { _evictedCount += toEvict; }
|
||||
_logger.Warning(
|
||||
"Historian queue at capacity {Cap} — evicted {Count} oldest row(s) to make room (lifetime evictions: {Total})",
|
||||
_capacity, toEvict, _evictedCount);
|
||||
}
|
||||
|
||||
private void PurgeAgedDeadLetters()
|
||||
private void PurgeAgedDeadLetters(SqliteConnection conn)
|
||||
{
|
||||
var cutoff = (_clock() - _deadLetterRetention).ToString("O");
|
||||
using var conn = OpenConnection();
|
||||
using var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
DELETE FROM Queue
|
||||
|
||||
Reference in New Issue
Block a user