feat(health): wire ISiteEventLogger.FailedWriteCount into SiteHealthReport (#30, M2.16)

Add SiteHealthReport.SiteEventLogWriteFailures (trailing optional long = 0,
additive-only), ISiteHealthCollector.SetSiteEventLogWriteFailures (default
no-op so existing fakes compile), and SiteEventLogFailureCountReporter
(hosted service in HealthMonitoring, Func<long> delegate to avoid the
HealthMonitoring → StoreAndForward → SiteEventLogging cycle).

Registration helper AddSiteEventLogHealthMetricsBridge added to
HealthMonitoring.ServiceCollectionExtensions; wired in
SiteServiceRegistration after AddSiteEventLogging.

Tests: SiteEventLogWriteFailuresMetricTests (4 collector tests) +
SiteEventLogFailureCountReporterTests (2 poller tests) in
HealthMonitoring.Tests. 79/79 HealthMonitoring.Tests green,
59/59 SiteEventLogging.Tests green, 0 warnings.
This commit is contained in:
Joseph Doherty
2026-06-16 07:14:54 -04:00
parent e1ee37e508
commit d81f747434
9 changed files with 394 additions and 6 deletions
@@ -40,7 +40,14 @@ public record SiteHealthReport(
// hosted service every 30 s. Defaults to null so existing producers /
// tests that don't refresh the snapshot stay valid; the central health
// surface treats null as "no data yet" rather than a zeroed queue.
SiteAuditBacklogSnapshot? SiteAuditBacklog = null);
SiteAuditBacklogSnapshot? SiteAuditBacklog = null,
// Site Event Logging (#12) M2.16 (#30): cumulative count of event-log write
// failures (SQLite error, disk full, bounded-queue overflow drop) since the
// logger was created. Populated by the site-side SiteEventLogFailureCountReporter
// hosted service. Point-in-time (not reset on collect) — mirrors the
// SiteAuditBacklog pattern. Defaults to 0 so existing producers / tests that
// don't wire the poller stay valid.
long SiteEventLogWriteFailures = 0);
/// <summary>
/// Broadcast wrapper used between central nodes to keep per-node
@@ -111,6 +111,23 @@ public interface ISiteHealthCollector
/// <param name="count">The number of parked messages.</param>
void SetParkedMessageCount(int count);
/// <summary>
/// Site Event Logging (#12) M2.16 (#30) — replace the latest cumulative
/// site-event-log write-failure count (SQLite error, disk full,
/// bounded-queue overflow drop) used by the next <see cref="CollectReport"/>
/// call. Refreshed periodically by the <c>SiteEventLogFailureCountReporter</c>
/// hosted service. Point-in-time: the value is NOT reset on
/// <see cref="CollectReport"/>; it carries forward until the next poller
/// refresh. Default interface implementation is a no-op so existing test
/// fakes continue to compile without per-fake updates.
/// </summary>
/// <param name="count">The cumulative failed-write count from <c>ISiteEventLogger.FailedWriteCount</c>.</param>
void SetSiteEventLogWriteFailures(long count)
{
// Default no-op so test fakes do not need to be updated. The real
// SiteHealthCollector overrides this with the Interlocked.Exchange store.
}
/// <summary>
/// Sets the hostname of this node.
/// </summary>
@@ -1,5 +1,7 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
@@ -50,6 +52,68 @@ public static class ServiceCollectionExtensions
return services;
}
/// <summary>
/// Site Event Logging (#12) M2.16 (#30) — register the
/// <see cref="SiteEventLogFailureCountReporter"/> hosted service that
/// periodically reads the cumulative event-log write-failure count and
/// pushes it into <see cref="ISiteHealthCollector"/> as a point-in-time
/// snapshot (<c>SiteEventLogWriteFailures</c> on the site health report).
/// </summary>
/// <remarks>
/// <para>
/// Must be called AFTER <see cref="AddSiteHealthMonitoring"/> (or
/// <see cref="AddHealthMonitoring"/>) which registers the
/// <see cref="ISiteHealthCollector"/> the reporter depends on.
/// </para>
/// <para>
/// <b>Why a Func&lt;long&gt; delegate instead of ISiteEventLogger.</b>
/// <c>HealthMonitoring</c> must not reference <c>SiteEventLogging</c> directly —
/// the <c>StoreAndForward → SiteEventLogging</c> edge already exists in the
/// transitive graph, and <c>HealthMonitoring → StoreAndForward</c> is an
/// existing direct reference; adding <c>HealthMonitoring → SiteEventLogging</c>
/// would complete a cycle. The <see cref="Func{TResult}"/> delegate seam keeps
/// the dependency acyclic: the caller (Host site wiring) captures
/// <c>ISiteEventLogger.FailedWriteCount</c> as a lambda and passes it here.
/// </para>
/// <para>
/// Idempotent — a sentinel check on the
/// <see cref="SiteEventLogFailureCountReporter"/> hosted-service descriptor
/// short-circuits subsequent calls so the hosted service is not
/// double-registered (AddHostedService has no TryAdd variant).
/// </para>
/// </remarks>
/// <param name="services">The service collection to register into.</param>
/// <param name="failedWriteCountProvider">
/// A factory delegate that, given the root <see cref="IServiceProvider"/>,
/// returns a <see cref="Func{TResult}"/> that reads the current cumulative
/// event-log write-failure count. Typically:
/// <c>sp => () => sp.GetRequiredService&lt;ISiteEventLogger&gt;().FailedWriteCount</c>.
/// The factory is evaluated once at hosted-service resolution time; the inner
/// <see cref="Func{TResult}"/> is called on every poll tick.
/// </param>
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
public static IServiceCollection AddSiteEventLogHealthMetricsBridge(
this IServiceCollection services,
Func<IServiceProvider, Func<long>> failedWriteCountProvider)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(failedWriteCountProvider);
// Idempotent guard — mirrors AddAuditLogHealthMetricsBridge's
// SiteAuditBacklogReporter sentinel check.
if (services.Any(d => d.ImplementationType == typeof(SiteEventLogFailureCountReporter)))
{
return services;
}
services.AddHostedService(sp => new SiteEventLogFailureCountReporter(
failedWriteCountProvider(sp),
sp.GetRequiredService<ISiteHealthCollector>(),
sp.GetRequiredService<ILogger<SiteEventLogFailureCountReporter>>()));
return services;
}
/// <summary>
/// HealthMonitoring-014: register the <see cref="HealthMonitoringOptionsValidator"/>
/// so a misconfigured <c>ScadaBridge:HealthMonitoring</c> section (zero/negative
@@ -0,0 +1,144 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
/// <summary>
/// Site Event Logging (#12) M2.16 (#30) — site-side hosted service that
/// periodically reads the cumulative event-log write-failure count and pushes
/// it into <see cref="ISiteHealthCollector"/> so the next
/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
/// <c>SiteEventLogWriteFailures</c> field on the site health report.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a Func&lt;long&gt; and not ISiteEventLogger directly.</b>
/// <c>HealthMonitoring</c> does not (and cannot) reference
/// <c>SiteEventLogging</c> — <c>HealthMonitoring → StoreAndForward →
/// SiteEventLogging</c> already exists in the transitive graph, so adding a
/// direct reference would create a cycle. The <see cref="Func{TResult}"/>
/// delegate seam breaks the coupling: the caller (Host site wiring) captures
/// <c>ISiteEventLogger.FailedWriteCount</c> as a lambda at registration
/// time, and this service reads only the numeric result. The delegate
/// approach is a standard pattern for counter bridges and keeps the
/// registration path self-documenting.
/// </para>
/// <para>
/// <b>Cadence.</b> 30 s by default — the same cadence as
/// <c>SiteAuditBacklogReporter</c>, which is coarse enough to stay within
/// the health-report interval budget while keeping the central dashboard
/// current.
/// </para>
/// <para>
/// <b>Failure containment.</b> Any unexpected exception during the probe is
/// caught and logged; the next tick retries. Mirrors
/// <c>SiteAuditBacklogReporter</c>'s "exception logged, not propagated"
/// contract.
/// </para>
/// </remarks>
public sealed class SiteEventLogFailureCountReporter : IHostedService, IDisposable
{
/// <summary>
/// Default poll cadence. Matches <c>SiteAuditBacklogReporter.DefaultRefreshInterval</c>
/// (30 s) — coarse enough to amortise the read across many reports, fine
/// enough that the central dashboard never lags by more than one
/// health-report interval.
/// </summary>
internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
private readonly Func<long> _failedWriteCountProvider;
private readonly ISiteHealthCollector _collector;
private readonly ILogger<SiteEventLogFailureCountReporter> _logger;
private readonly TimeSpan _refreshInterval;
private CancellationTokenSource? _cts;
private Task? _loop;
/// <summary>Initializes a new instance of <see cref="SiteEventLogFailureCountReporter"/>.</summary>
/// <param name="failedWriteCountProvider">
/// A delegate that returns the current cumulative event-log write-failure count.
/// Typically wired as <c>() => sp.GetRequiredService&lt;ISiteEventLogger&gt;().FailedWriteCount</c>
/// in the Host site composition root.
/// </param>
/// <param name="collector">The site health collector that receives the failure-count snapshot.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="refreshInterval">Poll interval override; defaults to <see cref="DefaultRefreshInterval"/> (30 s).</param>
public SiteEventLogFailureCountReporter(
Func<long> failedWriteCountProvider,
ISiteHealthCollector collector,
ILogger<SiteEventLogFailureCountReporter> logger,
TimeSpan? refreshInterval = null)
{
_failedWriteCountProvider = failedWriteCountProvider
?? throw new ArgumentNullException(nameof(failedWriteCountProvider));
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_refreshInterval = refreshInterval ?? DefaultRefreshInterval;
}
/// <summary>Starts the background polling loop, running an immediate first probe before entering the timed cycle.</summary>
/// <param name="ct">Cancellation token signalling host shutdown.</param>
/// <returns>A task that represents the asynchronous operation.</returns>
public Task StartAsync(CancellationToken ct)
{
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
// token both terminate the loop; either side firing aborts the
// pending Task.Delay.
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
return Task.CompletedTask;
}
private async Task RunLoopAsync(CancellationToken ct)
{
// First tick runs immediately so the very first health report after
// process start carries a real failure-count snapshot — without this
// the dashboard would show 0 for the first 30 s after a deploy even
// if failures had already accumulated.
SafeProbe();
while (!ct.IsCancellationRequested)
{
try
{
await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
}
catch (OperationCanceledException)
{
break;
}
SafeProbe();
}
}
private void SafeProbe()
{
try
{
var count = _failedWriteCountProvider();
_collector.SetSiteEventLogWriteFailures(count);
}
catch (Exception ex)
{
// Catch-all is deliberate: the hosted service must survive every
// class of probe failure so the next tick gets a chance. Mirrors
// SiteAuditBacklogReporter's "exception logged, not propagated" contract.
_logger.LogWarning(ex, "SiteEventLogFailureCountReporter probe failed; next tick will retry.");
}
}
/// <summary>Signals the polling loop to stop and waits for it to complete.</summary>
/// <param name="ct">Cancellation token (not used; the internal CTS governs shutdown).</param>
/// <returns>A task that represents the asynchronous operation.</returns>
public Task StopAsync(CancellationToken ct)
{
_cts?.Cancel();
return _loop ?? Task.CompletedTask;
}
/// <summary>Releases the internal <see cref="CancellationTokenSource"/> used to stop the polling loop.</summary>
public void Dispose()
{
_cts?.Dispose();
}
}
@@ -17,6 +17,7 @@ public class SiteHealthCollector : ISiteHealthCollector
private int _siteAuditWriteFailures;
private int _auditRedactionFailures;
private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog;
private long _siteEventLogWriteFailures;
private readonly ConcurrentDictionary<string, ConnectionHealth> _connectionStatuses = new();
private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
private readonly ConcurrentDictionary<string, string> _connectionEndpoints = new();
@@ -77,6 +78,12 @@ public class SiteHealthCollector : ISiteHealthCollector
_siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
}
/// <inheritdoc />
public void SetSiteEventLogWriteFailures(long count)
{
Interlocked.Exchange(ref _siteEventLogWriteFailures, count);
}
/// <inheritdoc />
public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
{
@@ -206,6 +213,7 @@ public class SiteHealthCollector : ISiteHealthCollector
ClusterNodes: _clusterNodes?.ToList(),
SiteAuditWriteFailures: siteAuditWriteFailures,
AuditRedactionFailure: auditRedactionFailures,
SiteAuditBacklog: _siteAuditBacklog);
SiteAuditBacklog: _siteAuditBacklog,
SiteEventLogWriteFailures: Interlocked.Read(ref _siteEventLogWriteFailures));
}
}
@@ -58,6 +58,16 @@ public static class SiteServiceRegistration
services.AddStoreAndForward();
services.AddSiteEventLogging();
// Site Event Logging (#12) M2.16 (#30) — bridge ISiteEventLogger.FailedWriteCount
// into the site health report as a point-in-time SiteEventLogWriteFailures field.
// Must come AFTER both AddSiteHealthMonitoring (registers ISiteHealthCollector) and
// AddSiteEventLogging (registers ISiteEventLogger). The outer Func<IServiceProvider, …>
// is evaluated once at hosted-service resolution time (root IServiceProvider is available);
// the inner Func<long> is called on every poll tick and reads FailedWriteCount from the
// already-resolved ISiteEventLogger singleton.
services.AddSiteEventLogHealthMetricsBridge(
sp => () => sp.GetRequiredService<ISiteEventLogger>().FailedWriteCount);
// Audit Log (#23) — site-side hot-path writer + telemetry collaborators.
// The SiteAuditTelemetryActor itself is registered by AkkaHostedService
// in the site-role block; this call wires every DI dependency it (and
@@ -32,10 +32,9 @@ public interface ISiteEventLogger
/// <summary>
/// SiteEventLogging-018: total number of event writes that have failed
/// (SQLite error, disk full, bounded-queue overflow drop, etc.) since this
/// logger was created. Available for future Health Monitoring integration —
/// promoted onto the interface so a Health consumer can read it without a
/// concrete-type downcast. Not yet polled by Health Monitoring; the wiring
/// is tracked separately.
/// logger was created. Polled by <c>SiteEventLogFailureCountReporter</c>
/// (HealthMonitoring — M2.16 / #30) every 30 s and surfaced on the site
/// health report as <c>SiteHealthReport.SiteEventLogWriteFailures</c>.
/// </summary>
long FailedWriteCount { get; }
}
@@ -0,0 +1,77 @@
using Microsoft.Extensions.Logging.Abstractions;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// M2.16 (#30) — unit tests for <see cref="SiteEventLogFailureCountReporter"/>.
/// Verifies that the poller reads the count provided by the
/// <see cref="Func{TResult}"/> delegate and pushes it into
/// <see cref="ISiteHealthCollector.SetSiteEventLogWriteFailures"/>.
/// </summary>
public class SiteEventLogFailureCountReporterTests
{
[Fact]
public async Task StartAsync_ImmediatelyProbes_FailedWriteCount()
{
// Arrange
var count = 99L;
var collector = new SiteHealthCollector();
using var reporter = new SiteEventLogFailureCountReporter(
failedWriteCountProvider: () => count,
collector: collector,
logger: NullLogger<SiteEventLogFailureCountReporter>.Instance,
refreshInterval: TimeSpan.FromHours(1)); // long interval — only immediate tick matters
// Act
await reporter.StartAsync(CancellationToken.None);
// Give the background Task a moment to execute its synchronous immediate probe.
var deadline = DateTime.UtcNow.AddSeconds(5);
while (collector.CollectReport("probe").SiteEventLogWriteFailures == 0L
&& DateTime.UtcNow < deadline)
{
await Task.Delay(10);
}
// Assert — the immediate probe before the first Delay must have fired.
var report = collector.CollectReport("site-1");
Assert.Equal(99L, report.SiteEventLogWriteFailures);
await reporter.StopAsync(CancellationToken.None);
}
[Fact]
public async Task StartAsync_PushesLatestCount_OnEachTick()
{
// Arrange — start with count 5; advance to 12 after the first tick.
var count = 5L;
var collector = new SiteHealthCollector();
using var reporter = new SiteEventLogFailureCountReporter(
failedWriteCountProvider: () => count,
collector: collector,
logger: NullLogger<SiteEventLogFailureCountReporter>.Instance,
refreshInterval: TimeSpan.FromMilliseconds(50));
await reporter.StartAsync(CancellationToken.None);
// Wait for immediate probe.
var deadline = DateTime.UtcNow.AddSeconds(5);
while (collector.CollectReport("probe").SiteEventLogWriteFailures != 5L
&& DateTime.UtcNow < deadline)
await Task.Delay(10);
Assert.Equal(5L, collector.CollectReport("site-1").SiteEventLogWriteFailures);
// Advance the counter and wait for the next tick to push the new value.
count = 12L;
deadline = DateTime.UtcNow.AddSeconds(5);
while (collector.CollectReport("probe").SiteEventLogWriteFailures != 12L
&& DateTime.UtcNow < deadline)
await Task.Delay(10);
Assert.Equal(12L, collector.CollectReport("site-1").SiteEventLogWriteFailures);
await reporter.StopAsync(CancellationToken.None);
}
}
@@ -0,0 +1,62 @@
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// M2.16 (#30) regression coverage. <see cref="ISiteEventLogger.FailedWriteCount"/>
/// is a cumulative (point-in-time) counter. A periodic
/// <c>SiteEventLogFailureCountReporter</c> hosted service polls the count and
/// pushes it into the collector via
/// <see cref="ISiteHealthCollector.SetSiteEventLogWriteFailures"/> so the next
/// <see cref="ISiteHealthCollector.CollectReport"/> includes it in the report
/// payload as <c>SiteEventLogWriteFailures</c>. Unlike the per-interval
/// SiteAuditWriteFailures counter, this value is NOT reset on collect — it
/// carries forward whatever the most recent poller push delivered.
/// </summary>
public class SiteEventLogWriteFailuresMetricTests
{
private readonly SiteHealthCollector _collector = new();
[Fact]
public void Set_Then_CollectReport_IncludesCount()
{
_collector.SetSiteEventLogWriteFailures(17L);
var report = _collector.CollectReport("site-1");
Assert.Equal(17L, report.SiteEventLogWriteFailures);
}
[Fact]
public void Report_Payload_Includes_SiteEventLogWriteFailures_AsZeroByDefault()
{
var report = _collector.CollectReport("site-1");
Assert.Equal(0L, report.SiteEventLogWriteFailures);
}
[Fact]
public void CollectReport_DoesNotReset_SiteEventLogWriteFailures()
{
// This is a point-in-time cumulative count — successive CollectReport
// calls before the next poller tick MUST carry forward the same value
// rather than resetting to zero (which would falsely indicate no failures
// between the two reports).
_collector.SetSiteEventLogWriteFailures(42L);
var first = _collector.CollectReport("site-1");
var second = _collector.CollectReport("site-1");
Assert.Equal(42L, first.SiteEventLogWriteFailures);
Assert.Equal(42L, second.SiteEventLogWriteFailures);
}
[Fact]
public void Set_Overwrites_Previous_Value()
{
_collector.SetSiteEventLogWriteFailures(5L);
_collector.SetSiteEventLogWriteFailures(9L);
var report = _collector.CollectReport("site-1");
Assert.Equal(9L, report.SiteEventLogWriteFailures);
}
}