feat(health): SiteAuditBacklog metric (count + age + bytes) (#23 M6)

This commit is contained in:
Joseph Doherty
2026-05-20 19:02:01 -04:00
parent 75b060e0a8
commit e93f655ce4
11 changed files with 511 additions and 2 deletions

View File

@@ -1,4 +1,5 @@
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Types;
namespace ScadaLink.Commons.Interfaces.Services;
@@ -70,4 +71,17 @@ public interface ISiteAuditQueue
/// are left untouched (idempotent re-call). Non-existent ids are silent no-ops.
/// </summary>
Task MarkReconciledAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default);
/// <summary>
/// M6 Bundle E (T6) health-metric surface: returns a point-in-time snapshot
/// of the site queue's pending count, oldest pending timestamp, and on-disk
/// SQLite file size. The snapshot is surfaced on
/// <see cref="ScadaLink.Commons.Messages.Health.SiteHealthReport"/> as
/// <c>SiteAuditBacklog</c> by the periodic <c>SiteAuditBacklogReporter</c>
/// hosted service so that a stuck site→central drain is visible on the
/// central health dashboard.
/// </summary>
/// <remarks>
/// Safe to call concurrently with hot-path writes: implementations are
/// expected to take the same connection lock used by the hot-path INSERT
/// batch and the drain queries.
/// </remarks>
/// <param name="ct">Cancellation token for the stats query.</param>
/// <returns>
/// A <see cref="SiteAuditBacklogSnapshot"/> describing the queue at the time
/// of the call.
/// </returns>
Task<SiteAuditBacklogSnapshot> GetBacklogStatsAsync(CancellationToken ct = default);
}

View File

@@ -1,3 +1,4 @@
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
namespace ScadaLink.Commons.Messages.Health;
@@ -32,7 +33,14 @@ public record SiteHealthReport(
// marker). Surfaces a misconfigured / catastrophic regex on
// /monitoring/health. Defaults to 0 for back-compat with existing
// producers and tests that don't construct the field.
int AuditRedactionFailure = 0);
int AuditRedactionFailure = 0,
// Audit Log (#23) M6 Bundle E (T6): point-in-time snapshot of the
// site-local SQLite audit-log queue (pending count, oldest pending row,
// on-disk bytes). Populated by the site-side SiteAuditBacklogReporter
// hosted service every 30 s. Defaults to null so existing producers /
// tests that don't refresh the snapshot stay valid; the central health
// surface treats null as "no data yet" rather than a zeroed queue.
SiteAuditBacklogSnapshot? SiteAuditBacklog = null);
/// <summary>
/// Broadcast wrapper used between central nodes to keep per-node

View File

@@ -0,0 +1,32 @@
namespace ScadaLink.Commons.Types;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T6) — point-in-time snapshot of the site-local
/// SQLite audit-log queue health, surfaced on
/// <see cref="ScadaLink.Commons.Messages.Health.SiteHealthReport"/> as
/// <c>SiteAuditBacklog</c> and refreshed periodically by the
/// <c>SiteAuditBacklogReporter</c> hosted service.
/// </summary>
/// <param name="PendingCount">
/// Number of rows currently in
/// <see cref="ScadaLink.Commons.Types.Enums.AuditForwardState.Pending"/> — i.e.
/// not yet acknowledged by central via either the push-telemetry or
/// reconciliation-pull paths. A persistently non-zero value combined with an
/// <see cref="OldestPendingUtc"/> that is not advancing (so the backlog age
/// keeps growing) indicates the site→central drain isn't keeping up.
/// </param>
/// <param name="OldestPendingUtc">
/// <see cref="ScadaLink.Commons.Entities.Audit.AuditEvent.OccurredAtUtc"/> of
/// the oldest Pending row, or <c>null</c> if the queue is empty. Lets ops
/// compute backlog age (current UTC time minus this value) without a
/// separate query.
/// </param>
/// <param name="OnDiskBytes">
/// Size of the SQLite file on disk in bytes, or <c>0</c> if the writer is
/// running against an in-memory database. Interpret it against the 7-day
/// retention invariant (alog.md §10): steady file-size growth past the
/// retention window points at a stuck purge or a stuck forwarder.
/// </param>
public sealed record SiteAuditBacklogSnapshot(
int PendingCount,
DateTime? OldestPendingUtc,
long OnDiskBytes);