feat(health): SiteAuditBacklog metric (count + age + bytes) (#23 M6)

This commit is contained in:
Joseph Doherty
2026-05-20 19:02:01 -04:00
parent 75b060e0a8
commit e93f655ce4
11 changed files with 511 additions and 2 deletions

View File

@@ -218,6 +218,14 @@ public static class ServiceCollectionExtensions
ServiceDescriptor.Singleton<IAuditWriteFailureCounter, HealthMetricsAuditWriteFailureCounter>());
services.Replace(
ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, HealthMetricsAuditRedactionFailureCounter>());
// M6 Bundle E (T6): the site-side backlog reporter polls the
// SqliteAuditWriter every 30 s and pushes the snapshot into the
// collector so the next SiteHealthReport carries a fresh
// SiteAuditBacklog field. Registered alongside the other site-only
// metric bridges so AddAuditLog (which runs on central too) stays
// free of hosted-service registrations that would resolve a missing
// ISiteHealthCollector on central.
services.AddHostedService<SiteAuditBacklogReporter>();
return services;
}

View File

@@ -0,0 +1,133 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.HealthMonitoring;
namespace ScadaLink.AuditLog.Site;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that
/// periodically pulls a backlog snapshot from <see cref="ISiteAuditQueue"/>
/// and pushes it into <see cref="ISiteHealthCollector"/> so the next
/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
/// <c>SiteAuditBacklog</c> field on the site health report.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not the report sender.</b> Querying SQLite for the
/// backlog requires the queue's write lock; doing it inline in
/// <see cref="ISiteHealthCollector.CollectReport"/> would couple the collector
/// to <see cref="ISiteAuditQueue"/> and turn an in-memory snapshot read into
/// a synchronous I/O call on the report path. The hosted-service pattern keeps
/// the report path pure and the SQL probe off the report timing budget.
/// </para>
/// <para>
/// <b>Cadence.</b> 30 s by default — coarse enough to amortise the SQL probe
/// across many reports, fine enough that the central dashboard never lags by
/// more than one health-report interval. Tunable via
/// <see cref="ScadaLink.AuditLog.Site.SqliteAuditWriterOptions"/> in a follow-up
/// if ops needs a different cadence; for M6 we hard-code the value because the
/// brief calls it out explicitly.
/// </para>
/// <para>
/// <b>Failure containment.</b> The probe call is wrapped in a try/catch so a
/// transient SQLite error never tears down the hosted service — the next tick
/// retries. Mirrors <see cref="ScadaLink.AuditLog.Central.AuditLogPartitionMaintenanceService"/>'s
/// "exception logged, not propagated" contract.
/// </para>
/// </remarks>
public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
{
/// <summary>
/// Default poll cadence. Half a typical 60 s health-report interval keeps
/// the snapshot fresh without spinning the SQL probe more often than
/// necessary.
/// </summary>
internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
private readonly ISiteAuditQueue _queue;
private readonly ISiteHealthCollector _collector;
private readonly ILogger<SiteAuditBacklogReporter> _logger;
private readonly TimeSpan _refreshInterval;
private CancellationTokenSource? _cts;
private Task? _loop;
public SiteAuditBacklogReporter(
ISiteAuditQueue queue,
ISiteHealthCollector collector,
ILogger<SiteAuditBacklogReporter> logger,
TimeSpan? refreshInterval = null)
{
_queue = queue ?? throw new ArgumentNullException(nameof(queue));
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_refreshInterval = refreshInterval ?? DefaultRefreshInterval;
}
/// <inheritdoc />
public Task StartAsync(CancellationToken ct)
{
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
// token both terminate the loop; either side firing aborts the
// pending Task.Delay.
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
return Task.CompletedTask;
}
private async Task RunLoopAsync(CancellationToken ct)
{
// First tick runs immediately so the very first health report after
// process start carries a real backlog snapshot — without this the
// dashboard would show null for the first 30 s after a deploy.
await SafeProbeAsync(ct).ConfigureAwait(false);
while (!ct.IsCancellationRequested)
{
try
{
await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
}
catch (OperationCanceledException)
{
break;
}
await SafeProbeAsync(ct).ConfigureAwait(false);
}
}
private async Task SafeProbeAsync(CancellationToken ct)
{
try
{
var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false);
_collector.UpdateSiteAuditBacklog(snapshot);
}
catch (OperationCanceledException)
{
// Shutdown — let the outer loop exit cleanly.
throw;
}
catch (Exception ex)
{
// Catch-all is deliberate: the hosted service must survive every
// class of probe failure (transient SQLite lock contention, disk
// I/O hiccup, …) so the next tick gets a chance.
_logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry.");
}
}
/// <inheritdoc />
public Task StopAsync(CancellationToken ct)
{
_cts?.Cancel();
return _loop ?? Task.CompletedTask;
}
/// <inheritdoc />
public void Dispose()
{
_cts?.Dispose();
}
}

View File

@@ -4,6 +4,7 @@ using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
namespace ScadaLink.AuditLog.Site;
@@ -484,6 +485,84 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
}
}
/// <summary>
/// M6 Bundle E (T6) health-metric surface: returns a point-in-time snapshot
/// of the site queue's pending count, the oldest pending row's
/// <see cref="AuditEvent.OccurredAtUtc"/>, and the on-disk file size. Called
/// by the site-side <c>SiteAuditBacklogReporter</c> hosted service on its
/// 30 s tick to refresh the <c>SiteHealthReport.SiteAuditBacklog</c> field.
/// </summary>
/// <remarks>
/// The pending-count + oldest-row queries run inside the same write lock as
/// the hot-path INSERT batch so the snapshot is consistent against the
/// connection's view (no torn read of an in-flight transaction). The on-disk
/// size lookup happens OUTSIDE the lock — it's a stat() call on the file
/// path and doesn't touch the connection. In-memory and missing files
/// return 0 bytes (the snapshot is for ops dashboards, not a correctness
/// invariant).
/// </remarks>
public Task<SiteAuditBacklogSnapshot> GetBacklogStatsAsync(CancellationToken ct = default)
{
int pendingCount;
DateTime? oldestPending;
lock (_writeLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
// Single round-trip — COUNT(*) + MIN(OccurredAtUtc) over the same
// index range avoids a second scan. The IX_SiteAuditLog_ForwardState_Occurred
// index makes both aggregates cheap (count is a covering scan, min
// is the first key).
using var cmd = _connection.CreateCommand();
cmd.CommandText = """
SELECT COUNT(*), MIN(OccurredAtUtc)
FROM AuditLog
WHERE ForwardState = $pending;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
using var reader = cmd.ExecuteReader();
reader.Read();
pendingCount = reader.GetInt32(0);
oldestPending = reader.IsDBNull(1)
? null
: DateTime.Parse(reader.GetString(1),
System.Globalization.CultureInfo.InvariantCulture,
System.Globalization.DateTimeStyles.RoundtripKind);
}
// File-size lookup outside the lock — the DatabasePath option is the
// canonical source. The connection-string-override branch (used by
// some tests) keeps the same DatabasePath value, so this works
// uniformly. In-memory / mode=memory paths return 0 because the file
// doesn't exist on disk.
long onDiskBytes = 0;
try
{
if (!string.IsNullOrEmpty(_options.DatabasePath) &&
!_options.DatabasePath.StartsWith(":memory:", StringComparison.Ordinal) &&
!_options.DatabasePath.Contains("mode=memory", StringComparison.OrdinalIgnoreCase) &&
File.Exists(_options.DatabasePath))
{
onDiskBytes = new FileInfo(_options.DatabasePath).Length;
}
}
catch (Exception ex)
{
// File system probe is a best-effort health-metric — never abort
// a backlog snapshot because stat() failed. Log and report 0.
_logger.LogDebug(ex,
"SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.",
_options.DatabasePath);
}
return Task.FromResult(new SiteAuditBacklogSnapshot(
PendingCount: pendingCount,
OldestPendingUtc: oldestPending,
OnDiskBytes: onDiskBytes));
}
private static DateTime EnsureUtc(DateTime value) =>
value.Kind == DateTimeKind.Utc
? value