feat(health): SiteAuditBacklog metric (count + age + bytes) (#23 M6)
This commit is contained in:
@@ -218,6 +218,14 @@ public static class ServiceCollectionExtensions
|
||||
ServiceDescriptor.Singleton<IAuditWriteFailureCounter, HealthMetricsAuditWriteFailureCounter>());
|
||||
services.Replace(
|
||||
ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, HealthMetricsAuditRedactionFailureCounter>());
|
||||
// M6 Bundle E (T6): the site-side backlog reporter polls the
|
||||
// SqliteAuditWriter every 30 s and pushes the snapshot into the
|
||||
// collector so the next SiteHealthReport carries a fresh
|
||||
// SiteAuditBacklog field. Registered alongside the other site-only
|
||||
// metric bridges so AddAuditLog (which runs on central too) stays
|
||||
// free of hosted-service registrations that would resolve a missing
|
||||
// ISiteHealthCollector on central.
|
||||
services.AddHostedService<SiteAuditBacklogReporter>();
|
||||
return services;
|
||||
}
|
||||
|
||||
|
||||
133
src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs
Normal file
133
src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs
Normal file
@@ -0,0 +1,133 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.HealthMonitoring;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that
|
||||
/// periodically pulls a backlog snapshot from <see cref="ISiteAuditQueue"/>
|
||||
/// and pushes it into <see cref="ISiteHealthCollector"/> so the next
|
||||
/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
|
||||
/// <c>SiteAuditBacklog</c> field on the site health report.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Why a hosted service, not the report sender.</b> Querying SQLite for the
|
||||
/// backlog requires the queue's write lock; doing it inline in
|
||||
/// <see cref="ISiteHealthCollector.CollectReport"/> would couple the collector
|
||||
/// to <see cref="ISiteAuditQueue"/> and turn an in-memory snapshot read into
|
||||
/// a synchronous I/O call on the report path. The hosted-service pattern keeps
|
||||
/// the report path pure and the SQL probe off the report timing budget.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Cadence.</b> 30 s by default — coarse enough to amortise the SQL probe
|
||||
/// across many reports, fine enough that the central dashboard never lags by
|
||||
/// more than one health-report interval. Tunable via
|
||||
/// <see cref="ScadaLink.AuditLog.Site.SqliteAuditWriterOptions"/> in a follow-up
|
||||
/// if ops needs a different cadence; for M6 we hard-code the value because the
|
||||
/// brief calls it out explicitly.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Failure containment.</b> The probe call is wrapped in a try/catch so a
|
||||
/// transient SQLite error never tears down the hosted service — the next tick
|
||||
/// retries. Mirrors <see cref="ScadaLink.AuditLog.Central.AuditLogPartitionMaintenanceService"/>'s
|
||||
/// "exception logged, not propagated" contract.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
|
||||
{
|
||||
/// <summary>
|
||||
/// Default poll cadence. Half a typical 60 s health-report interval keeps
|
||||
/// the snapshot fresh without spinning the SQL probe more often than
|
||||
/// necessary.
|
||||
/// </summary>
|
||||
internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
|
||||
|
||||
private readonly ISiteAuditQueue _queue;
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
private readonly ILogger<SiteAuditBacklogReporter> _logger;
|
||||
private readonly TimeSpan _refreshInterval;
|
||||
private CancellationTokenSource? _cts;
|
||||
private Task? _loop;
|
||||
|
||||
public SiteAuditBacklogReporter(
|
||||
ISiteAuditQueue queue,
|
||||
ISiteHealthCollector collector,
|
||||
ILogger<SiteAuditBacklogReporter> logger,
|
||||
TimeSpan? refreshInterval = null)
|
||||
{
|
||||
_queue = queue ?? throw new ArgumentNullException(nameof(queue));
|
||||
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_refreshInterval = refreshInterval ?? DefaultRefreshInterval;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task StartAsync(CancellationToken ct)
|
||||
{
|
||||
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
|
||||
// token both terminate the loop; either side firing aborts the
|
||||
// pending Task.Delay.
|
||||
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private async Task RunLoopAsync(CancellationToken ct)
|
||||
{
|
||||
// First tick runs immediately so the very first health report after
|
||||
// process start carries a real backlog snapshot — without this the
|
||||
// dashboard would show null for the first 30 s after a deploy.
|
||||
await SafeProbeAsync(ct).ConfigureAwait(false);
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
await SafeProbeAsync(ct).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task SafeProbeAsync(CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false);
|
||||
_collector.UpdateSiteAuditBacklog(snapshot);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Shutdown — let the outer loop exit cleanly.
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Catch-all is deliberate: the hosted service must survive every
|
||||
// class of probe failure (transient SQLite lock contention, disk
|
||||
// I/O hiccup, …) so the next tick gets a chance.
|
||||
_logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry.");
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task StopAsync(CancellationToken ct)
|
||||
{
|
||||
_cts?.Cancel();
|
||||
return _loop ?? Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public void Dispose()
|
||||
{
|
||||
_cts?.Dispose();
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,7 @@ using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.Commons.Types;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
@@ -484,6 +485,84 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// M6 Bundle E (T6) health-metric surface: returns a point-in-time snapshot
|
||||
/// of the site queue's pending count, the oldest pending row's
|
||||
/// <see cref="AuditEvent.OccurredAtUtc"/>, and the on-disk file size. Called
|
||||
/// by the site-side <c>SiteAuditBacklogReporter</c> hosted service on its
|
||||
/// 30 s tick to refresh the <c>SiteHealthReport.SiteAuditBacklog</c> field.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The pending-count + oldest-row queries run inside the same write lock as
|
||||
/// the hot-path INSERT batch so the snapshot is consistent against the
|
||||
/// connection's view (no torn read of an in-flight transaction). The on-disk
|
||||
/// size lookup happens OUTSIDE the lock — it's a stat() call on the file
|
||||
/// path and doesn't touch the connection. In-memory and missing files
|
||||
/// return 0 bytes (the snapshot is for ops dashboards, not a correctness
|
||||
/// invariant).
|
||||
/// </remarks>
|
||||
public Task<SiteAuditBacklogSnapshot> GetBacklogStatsAsync(CancellationToken ct = default)
|
||||
{
|
||||
int pendingCount;
|
||||
DateTime? oldestPending;
|
||||
|
||||
lock (_writeLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
// Single round-trip — COUNT(*) + MIN(OccurredAtUtc) over the same
|
||||
// index range avoids a second scan. The IX_SiteAuditLog_ForwardState_Occurred
|
||||
// index makes both aggregates cheap (count is a covering scan, min
|
||||
// is the first key).
|
||||
using var cmd = _connection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT COUNT(*), MIN(OccurredAtUtc)
|
||||
FROM AuditLog
|
||||
WHERE ForwardState = $pending;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
|
||||
|
||||
using var reader = cmd.ExecuteReader();
|
||||
reader.Read();
|
||||
pendingCount = reader.GetInt32(0);
|
||||
oldestPending = reader.IsDBNull(1)
|
||||
? null
|
||||
: DateTime.Parse(reader.GetString(1),
|
||||
System.Globalization.CultureInfo.InvariantCulture,
|
||||
System.Globalization.DateTimeStyles.RoundtripKind);
|
||||
}
|
||||
|
||||
// File-size lookup outside the lock — the DatabasePath option is the
|
||||
// canonical source. The connection-string-override branch (used by
|
||||
// some tests) keeps the same DatabasePath value, so this works
|
||||
// uniformly. In-memory / mode=memory paths return 0 because the file
|
||||
// doesn't exist on disk.
|
||||
long onDiskBytes = 0;
|
||||
try
|
||||
{
|
||||
if (!string.IsNullOrEmpty(_options.DatabasePath) &&
|
||||
!_options.DatabasePath.StartsWith(":memory:", StringComparison.Ordinal) &&
|
||||
!_options.DatabasePath.Contains("mode=memory", StringComparison.OrdinalIgnoreCase) &&
|
||||
File.Exists(_options.DatabasePath))
|
||||
{
|
||||
onDiskBytes = new FileInfo(_options.DatabasePath).Length;
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// File system probe is a best-effort health-metric — never abort
|
||||
// a backlog snapshot because stat() failed. Log and report 0.
|
||||
_logger.LogDebug(ex,
|
||||
"SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.",
|
||||
_options.DatabasePath);
|
||||
}
|
||||
|
||||
return Task.FromResult(new SiteAuditBacklogSnapshot(
|
||||
PendingCount: pendingCount,
|
||||
OldestPendingUtc: oldestPending,
|
||||
OnDiskBytes: onDiskBytes));
|
||||
}
|
||||
|
||||
private static DateTime EnsureUtc(DateTime value) =>
|
||||
value.Kind == DateTimeKind.Utc
|
||||
? value
|
||||
|
||||
Reference in New Issue
Block a user