fix(core-alarm-historian): resolve Low code-review findings (Core.AlarmHistorian-008,011)

- Core.AlarmHistorian-008: cache queue depth in an Interlocked counter so
  EnqueueAsync no longer runs COUNT(*) on every alarm; consolidate
  DrainOnceAsync onto a single SqliteConnection per tick (purge, batch
  read, dead-letter, and outcome transaction all share it).
- Core.AlarmHistorian-011: confirm the stale Galaxy.Host XML doc
  references were already fixed under earlier commits; flip to Resolved.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 05:38:26 -04:00
parent b92fea15d4
commit 0da4f3b63a
3 changed files with 269 additions and 74 deletions

View File

@@ -609,6 +609,130 @@ public sealed class SqliteStoreAndForwardSinkTests : IDisposable
}
}
/// <summary>
/// Regression for Core.AlarmHistorian-008: <c>EnqueueAsync</c> must NOT run a
/// <c>SELECT COUNT(*)</c> on every enqueue when we are far below capacity. The
/// optimisation tracks the queue depth in memory and only consults the database
/// when the cached value indicates the capacity wall is in reach. This regression
/// pins the perf characteristic: after many enqueues below capacity, the
/// capacity-probe count must stay bounded — not grow proportionally to the
/// enqueue count as the un-optimised path did.
/// </summary>
[Fact]
public async Task EnqueueAsync_does_not_count_all_rows_on_every_call_below_capacity()
{
var writer = new FakeWriter();
using var sink = new SqliteStoreAndForwardSink(
_dbPath, writer, _log, batchSize: 100, capacity: 10_000);
for (var i = 0; i < 200; i++)
await sink.EnqueueAsync(Event($"E{i}"), CancellationToken.None);
// Pre-fix: probe count == enqueue count (200). Post-fix: ≤ a handful (initial
// load + occasional periodic re-syncs). 25 is generous headroom.
sink.DebugCapacityProbeCount.ShouldBeLessThan(25,
"EnqueueAsync must not run a per-call SELECT COUNT(*) below capacity");
}
/// <summary>
/// Regression for Core.AlarmHistorian-008: across every queue mutation (enqueue,
/// Ack drain, PermanentFail drain, capacity eviction, RetryDeadLettered) the
/// queue depth reported by <see cref="SqliteStoreAndForwardSink.GetStatus"/> must
/// stay aligned with a fresh <c>COUNT(*)</c> against the database. Catches drift
/// bugs in the in-memory counter introduced by the perf optimisation.
/// </summary>
[Fact]
public async Task Enqueue_and_drain_keep_queue_depth_consistent_with_storage()
{
var writer = new FakeWriter();
using var sink = new SqliteStoreAndForwardSink(
_dbPath, writer, _log, batchSize: 5, capacity: 8);
// Burst-enqueue below capacity — the in-memory counter must stay aligned with the
// SELECT COUNT(*) that GetStatus runs.
for (var i = 0; i < 6; i++)
await sink.EnqueueAsync(Event($"burst-{i}"), CancellationToken.None);
AssertQueueDepthMatchesStorage(sink);
sink.GetStatus().QueueDepth.ShouldBe(6);
// Push past capacity — capacity must still be enforced even when EnqueueAsync no
// longer runs COUNT(*) on every call.
for (var i = 0; i < 5; i++)
await sink.EnqueueAsync(Event($"overflow-{i}"), CancellationToken.None);
sink.GetStatus().QueueDepth.ShouldBe(8, "capacity must still be honoured by the perf-optimised path");
sink.GetStatus().EvictedCount.ShouldBe(3, "eviction counter must reflect every evicted row");
AssertQueueDepthMatchesStorage(sink);
// Drain a partial batch (Ack) — the in-memory counter must follow the deletes
// applied inside the single consolidated drain transaction.
await sink.DrainOnceAsync(CancellationToken.None);
AssertQueueDepthMatchesStorage(sink);
// Add a dead-lettered row and verify the counter does NOT include it (QueueDepth
// is non-dead-lettered only).
writer.NextOutcomePerEvent.Enqueue(HistorianWriteOutcome.PermanentFail);
await sink.EnqueueAsync(Event("to-dead-letter"), CancellationToken.None);
await sink.DrainOnceAsync(CancellationToken.None);
AssertQueueDepthMatchesStorage(sink);
sink.GetStatus().DeadLetterDepth.ShouldBeGreaterThanOrEqualTo(1);
// RetryDeadLettered moves DLQ rows back into the live queue — the counter must
// pick that up.
sink.RetryDeadLettered();
AssertQueueDepthMatchesStorage(sink);
}
/// <summary>
/// Stress regression for Core.AlarmHistorian-008: interleave many enqueues and
/// drains across threads and confirm the in-memory counter stays consistent
/// with storage. Catches drift bugs in the optimised path that would only show
/// up under contention.
/// </summary>
[Fact]
public async Task Counter_remains_consistent_under_concurrent_enqueue_and_drain()
{
var writer = new FakeWriter();
using var sink = new SqliteStoreAndForwardSink(_dbPath, writer, _log);
var enqueuers = Enumerable.Range(0, 3).Select(t => Task.Run(async () =>
{
for (var i = 0; i < 60; i++)
await sink.EnqueueAsync(Event($"T{t}-{i}"), CancellationToken.None);
}));
var drainers = Enumerable.Range(0, 2).Select(_ => Task.Run(async () =>
{
for (var i = 0; i < 30; i++)
{
await sink.DrainOnceAsync(CancellationToken.None);
await Task.Delay(2);
}
}));
await Task.WhenAll(enqueuers.Concat(drainers));
// Drain anything left over.
for (var i = 0; i < 10; i++)
await sink.DrainOnceAsync(CancellationToken.None);
AssertQueueDepthMatchesStorage(sink);
sink.GetStatus().QueueDepth.ShouldBe(0, "every event drained at the end of the run");
}
/// <summary>
/// Helper that confirms the queue depth surfaced by GetStatus matches a fresh
/// COUNT(*) read directly from storage — proves the in-memory counter has not
/// drifted from the persisted truth.
/// </summary>
private void AssertQueueDepthMatchesStorage(SqliteStoreAndForwardSink sink)
{
using var conn = new SqliteConnection($"Data Source={_dbPath}");
conn.Open();
using var cmd = conn.CreateCommand();
cmd.CommandText = "SELECT COUNT(*) FROM Queue WHERE DeadLettered = 0";
var live = (long)(cmd.ExecuteScalar() ?? 0L);
sink.GetStatus().QueueDepth.ShouldBe(live, "GetStatus must agree with a fresh COUNT(*)");
}
/// <summary>Insert a queue row whose PayloadJson cannot deserialize into an AlarmHistorianEvent.</summary>
private void InsertCorruptRow(string alarmId)
{