diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Health/SiteHealthReport.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Health/SiteHealthReport.cs
index 4392fbfb..3c93879f 100644
--- a/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Health/SiteHealthReport.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Health/SiteHealthReport.cs
@@ -40,7 +40,14 @@ public record SiteHealthReport(
     // hosted service every 30 s. Defaults to null so existing producers /
     // tests that don't refresh the snapshot stay valid; the central health
     // surface treats null as "no data yet" rather than a zeroed queue.
-    SiteAuditBacklogSnapshot? SiteAuditBacklog = null);
+    SiteAuditBacklogSnapshot? SiteAuditBacklog = null,
+    // Site Event Logging (#12) M2.16 (#30): cumulative count of event-log write
+    // failures (SQLite error, disk full, bounded-queue overflow drop) since the
+    // logger was created. Populated by the site-side SiteEventLogFailureCountReporter
+    // hosted service. Point-in-time (not reset on collect) — mirrors the
+    // SiteAuditBacklog pattern. Defaults to 0 so existing producers / tests that
+    // don't wire the poller stay valid.
+    long SiteEventLogWriteFailures = 0);
 
 ///
 /// Broadcast wrapper used between central nodes to keep per-node
diff --git a/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ISiteHealthCollector.cs b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ISiteHealthCollector.cs
index 9c0c660f..7e03be2b 100644
--- a/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ISiteHealthCollector.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ISiteHealthCollector.cs
@@ -111,6 +111,23 @@ public interface ISiteHealthCollector
     /// The number of parked messages.
     void SetParkedMessageCount(int count);
 
+    /// <summary>
+    /// Site Event Logging (#12) M2.16 (#30) — replace the latest cumulative
+    /// site-event-log write-failure count (SQLite error, disk full,
+    /// bounded-queue overflow drop) used by the next
+    /// <c>CollectReport</c> call. Refreshed periodically by the
+    /// <c>SiteEventLogFailureCountReporter</c> hosted service. Point-in-time:
+    /// the value is NOT reset on <c>CollectReport</c>; it carries forward until
+    /// the next poller refresh. The default interface implementation is a no-op
+    /// so existing test fakes continue to compile without per-fake updates.
+    /// </summary>
+    /// <param name="count">The cumulative failed-write count from <c>ISiteEventLogger.FailedWriteCount</c>.</param>
+    void SetSiteEventLogWriteFailures(long count)
+    {
+        // Default no-op so test fakes do not need to be updated. The real
+        // SiteHealthCollector implements this with an Interlocked.Exchange store.
+    }
+
     ///
     /// Sets the hostname of this node.
     ///
diff --git a/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ServiceCollectionExtensions.cs
index 2bff9f09..28d49604 100644
--- a/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ServiceCollectionExtensions.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ServiceCollectionExtensions.cs
@@ -1,5 +1,7 @@
 using Microsoft.Extensions.DependencyInjection;
 using Microsoft.Extensions.DependencyInjection.Extensions;
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
 using Microsoft.Extensions.Options;
 
 namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
@@ -50,6 +52,70 @@ public static class ServiceCollectionExtensions
         return services;
     }
 
+    /// <summary>
+    /// Site Event Logging (#12) M2.16 (#30) — register the
+    /// <see cref="SiteEventLogFailureCountReporter"/> hosted service that
+    /// periodically reads the cumulative event-log write-failure count and
+    /// pushes it into <see cref="ISiteHealthCollector"/> as a point-in-time
+    /// snapshot (<c>SiteEventLogWriteFailures</c> on the site health report).
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Must be called AFTER <c>AddSiteHealthMonitoring</c> (or an equivalent
+    /// registration), which registers the <see cref="ISiteHealthCollector"/>
+    /// the reporter depends on.
+    /// </para>
+    /// <para>
+    /// <b>Why a <c>Func&lt;long&gt;</c> delegate instead of <c>ISiteEventLogger</c>.</b>
+    /// HealthMonitoring must not reference SiteEventLogging directly, so the
+    /// caller (Host site wiring) captures <c>ISiteEventLogger.FailedWriteCount</c>
+    /// as a lambda and passes it here; this assembly only ever sees a number.
+    /// NOTE(review): the original rationale cites a project-reference cycle via
+    /// HealthMonitoring → StoreAndForward → SiteEventLogging; those edges alone
+    /// do not close a cycle, so confirm which reverse edge the rule protects.
+    /// </para>
+    /// <para>
+    /// <b>Idempotent.</b> The reporter is added with <c>TryAddEnumerable</c>, which
+    /// skips the add when an <see cref="IHostedService"/> descriptor for
+    /// <see cref="SiteEventLogFailureCountReporter"/> already exists. A check on
+    /// <c>ServiceDescriptor.ImplementationType</c> cannot serve as the sentinel:
+    /// that property is null for factory-based registrations like this one.
+    /// On repeated calls the first registered provider wins.
+    /// </para>
+    /// </remarks>
+    /// <param name="services">The service collection to register into.</param>
+    /// <param name="failedWriteCountProvider">
+    /// A factory delegate that, given the <see cref="IServiceProvider"/> that
+    /// constructs the hosted service, returns a <c>Func&lt;long&gt;</c> reading the
+    /// current cumulative event-log write-failure count. Typically:
+    /// <c>sp =&gt; () =&gt; sp.GetRequiredService&lt;ISiteEventLogger&gt;().FailedWriteCount</c>.
+    /// The factory is evaluated once at hosted-service resolution time; the inner
+    /// delegate is called on every poll tick.
+    /// </param>
+    /// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="services"/> or <paramref name="failedWriteCountProvider"/> is null.
+    /// </exception>
+    public static IServiceCollection AddSiteEventLogHealthMetricsBridge(
+        this IServiceCollection services,
+        Func<IServiceProvider, Func<long>> failedWriteCountProvider)
+    {
+        ArgumentNullException.ThrowIfNull(services);
+        ArgumentNullException.ThrowIfNull(failedWriteCountProvider);
+
+        // TryAddEnumerable derives the implementation type from the factory's
+        // return type, so a second call finds the existing
+        // (IHostedService, SiteEventLogFailureCountReporter) pair and adds nothing.
+        services.TryAddEnumerable(
+            ServiceDescriptor.Singleton<IHostedService, SiteEventLogFailureCountReporter>(
+                sp => new SiteEventLogFailureCountReporter(
+                    failedWriteCountProvider(sp),
+                    sp.GetRequiredService<ISiteHealthCollector>(),
+                    sp.GetRequiredService<ILogger<SiteEventLogFailureCountReporter>>())));
+
+        return services;
+    }
+
     ///
     /// HealthMonitoring-014: register the
     /// so a misconfigured ScadaBridge:HealthMonitoring section (zero/negative
diff --git a/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/SiteEventLogFailureCountReporter.cs b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/SiteEventLogFailureCountReporter.cs
new file mode 100644
index 00000000..2076f7eb
--- /dev/null
+++ b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/SiteEventLogFailureCountReporter.cs
@@ -0,0 +1,175 @@
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Site Event Logging (#12) M2.16 (#30) — site-side hosted service that
+/// periodically reads the cumulative event-log write-failure count and pushes
+/// it into <see cref="ISiteHealthCollector"/> so the next
+/// <c>CollectReport</c> call emits a fresh
+/// <c>SiteEventLogWriteFailures</c> field on the site health report.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Why a <c>Func&lt;long&gt;</c> and not <c>ISiteEventLogger</c> directly.</b>
+/// HealthMonitoring does not reference SiteEventLogging, so the caller
+/// (Host site wiring) captures <c>ISiteEventLogger.FailedWriteCount</c> as a
+/// lambda at registration time and this service reads only the numeric result.
+/// </para>
+/// <para>
+/// <b>Cadence.</b> 30 s by default — the same cadence as
+/// <c>SiteAuditBacklogReporter</c>. The first probe runs immediately on start.
+/// </para>
+/// <para>
+/// <b>Failure containment.</b> Any exception thrown by the probe is caught
+/// and logged; the next tick retries. Mirrors
+/// <c>SiteAuditBacklogReporter</c>'s "exception logged, not propagated"
+/// contract.
+/// </para>
+/// </remarks>
+public sealed class SiteEventLogFailureCountReporter : IHostedService, IDisposable
+{
+    /// <summary>
+    /// Default poll cadence (30 s). Matches
+    /// <c>SiteAuditBacklogReporter.DefaultRefreshInterval</c>.
+    /// </summary>
+    internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
+
+    private readonly Func<long> _failedWriteCountProvider;
+    private readonly ISiteHealthCollector _collector;
+    private readonly ILogger<SiteEventLogFailureCountReporter> _logger;
+    private readonly TimeSpan _refreshInterval;
+    private CancellationTokenSource? _cts;
+    private Task? _loop;
+
+    /// <summary>Initializes a new instance of <see cref="SiteEventLogFailureCountReporter"/>.</summary>
+    /// <param name="failedWriteCountProvider">
+    /// A delegate that returns the current cumulative event-log write-failure count.
+    /// Typically wired as <c>() =&gt; sp.GetRequiredService&lt;ISiteEventLogger&gt;().FailedWriteCount</c>
+    /// in the Host site composition root.
+    /// </param>
+    /// <param name="collector">The site health collector that receives the failure-count snapshot.</param>
+    /// <param name="logger">Logger instance.</param>
+    /// <param name="refreshInterval">Poll interval override; defaults to <see cref="DefaultRefreshInterval"/> (30 s).</param>
+    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="refreshInterval"/> is zero or negative.</exception>
+    public SiteEventLogFailureCountReporter(
+        Func<long> failedWriteCountProvider,
+        ISiteHealthCollector collector,
+        ILogger<SiteEventLogFailureCountReporter> logger,
+        TimeSpan? refreshInterval = null)
+    {
+        _failedWriteCountProvider = failedWriteCountProvider
+            ?? throw new ArgumentNullException(nameof(failedWriteCountProvider));
+        _collector = collector ?? throw new ArgumentNullException(nameof(collector));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+
+        // Reject non-positive intervals up front: zero would spin the loop and a
+        // negative value would make Task.Delay throw inside the background task,
+        // where nothing observes the failure until shutdown.
+        var interval = refreshInterval ?? DefaultRefreshInterval;
+        if (interval <= TimeSpan.Zero)
+        {
+            throw new ArgumentOutOfRangeException(
+                nameof(refreshInterval), interval, "Refresh interval must be greater than zero.");
+        }
+
+        _refreshInterval = interval;
+    }
+
+    /// <summary>Starts the background polling loop; the loop performs its first probe immediately.</summary>
+    /// <param name="ct">Token passed by the host to <c>StartAsync</c>; cancelling it also stops the loop.</param>
+    /// <returns>A completed task; the loop keeps running in the background.</returns>
+    public Task StartAsync(CancellationToken ct)
+    {
+        // Linked CTS lets StopAsync/Dispose AND the caller's token terminate the
+        // loop; either side firing aborts the pending Task.Delay.
+        var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+        _cts = cts;
+
+        // Read the token before Task.Run: the delegate may execute after Dispose,
+        // and CancellationTokenSource.Token throws once the source is disposed.
+        var loopToken = cts.Token;
+        _loop = Task.Run(() => RunLoopAsync(loopToken));
+        return Task.CompletedTask;
+    }
+
+    private async Task RunLoopAsync(CancellationToken ct)
+    {
+        // First tick runs immediately so the very first health report after
+        // process start carries a real failure-count snapshot rather than the
+        // collector's initial 0.
+        SafeProbe();
+
+        while (!ct.IsCancellationRequested)
+        {
+            try
+            {
+                await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
+            }
+            catch (OperationCanceledException)
+            {
+                break;
+            }
+
+            SafeProbe();
+        }
+    }
+
+    private void SafeProbe()
+    {
+        try
+        {
+            var count = _failedWriteCountProvider();
+            _collector.SetSiteEventLogWriteFailures(count);
+        }
+        catch (Exception ex)
+        {
+            // Catch-all is deliberate: the hosted service must survive every
+            // class of probe failure so the next tick gets a chance. Mirrors
+            // SiteAuditBacklogReporter's "exception logged, not propagated" contract.
+            _logger.LogWarning(ex, "SiteEventLogFailureCountReporter probe failed; next tick will retry.");
+        }
+    }
+
+    /// <summary>Signals the polling loop to stop and waits for it to finish.</summary>
+    /// <param name="ct">Bounds the wait; when it fires the method returns without waiting further.</param>
+    /// <returns>A task that completes when the loop has exited or <paramref name="ct"/> is cancelled.</returns>
+    public async Task StopAsync(CancellationToken ct)
+    {
+        var loop = _loop;
+        if (loop is null)
+        {
+            return; // StartAsync never ran; nothing to stop.
+        }
+
+        _cts?.Cancel();
+
+        try
+        {
+            // Honour the host's shutdown deadline: a provider delegate stuck in a
+            // synchronous call must not hold up the rest of the shutdown sequence.
+            await loop.WaitAsync(ct).ConfigureAwait(false);
+        }
+        catch (OperationCanceledException) when (ct.IsCancellationRequested)
+        {
+            _logger.LogWarning("SiteEventLogFailureCountReporter did not stop before the shutdown token fired.");
+        }
+    }
+
+    /// <summary>Cancels the polling loop (if still running) and releases the internal <see cref="CancellationTokenSource"/>.</summary>
+    public void Dispose()
+    {
+        // Cancel before disposing: disposing alone would leave a loop that was
+        // never stopped via StopAsync polling forever.
+        var cts = Interlocked.Exchange(ref _cts, null);
+        if (cts is null)
+        {
+            return;
+        }
+
+        cts.Cancel();
+        cts.Dispose();
+    }
+}
diff --git a/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/SiteHealthCollector.cs b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/SiteHealthCollector.cs
index d06d62e7..14b393d0 100644
--- a/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/SiteHealthCollector.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/SiteHealthCollector.cs
@@ -17,6 +17,7 @@ public class SiteHealthCollector : ISiteHealthCollector
     private int _siteAuditWriteFailures;
     private int _auditRedactionFailures;
     private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog;
+    private long _siteEventLogWriteFailures;
     private readonly ConcurrentDictionary _connectionStatuses = new();
     private readonly ConcurrentDictionary _tagResolutionCounts = new();
     private readonly ConcurrentDictionary _connectionEndpoints = new();
@@ -77,6 +78,12 @@ public class SiteHealthCollector : ISiteHealthCollector
         _siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
     }
 
+    /// <inheritdoc/>
+    public void SetSiteEventLogWriteFailures(long count)
+    {
+        Interlocked.Exchange(ref _siteEventLogWriteFailures, count);
+    }
+
     ///
     public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
     {
@@ -206,6 +213,7 @@ public class SiteHealthCollector : ISiteHealthCollector
             ClusterNodes: _clusterNodes?.ToList(),
             SiteAuditWriteFailures: siteAuditWriteFailures,
             AuditRedactionFailure: auditRedactionFailures,
-            SiteAuditBacklog: _siteAuditBacklog);
+            SiteAuditBacklog: _siteAuditBacklog,
+            SiteEventLogWriteFailures: Interlocked.Read(ref _siteEventLogWriteFailures));
     }
 }
diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/SiteServiceRegistration.cs b/src/ZB.MOM.WW.ScadaBridge.Host/SiteServiceRegistration.cs
index 367a55ca..8929619d 100644
--- a/src/ZB.MOM.WW.ScadaBridge.Host/SiteServiceRegistration.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.Host/SiteServiceRegistration.cs
@@ -58,6 +58,16 @@ public static class SiteServiceRegistration
         services.AddStoreAndForward();
         services.AddSiteEventLogging();
 
+        // Site Event Logging (#12) M2.16 (#30) — bridge ISiteEventLogger.FailedWriteCount
+        // into the site health report as a point-in-time SiteEventLogWriteFailures field.
+        // Must come AFTER both AddSiteHealthMonitoring (registers ISiteHealthCollector) and
+        // AddSiteEventLogging (registers ISiteEventLogger). The outer Func<IServiceProvider, Func<long>>
+        // is evaluated once at hosted-service resolution time (root IServiceProvider is available);
+        // the inner Func<long> is called on every poll tick, where it resolves the
+        // ISiteEventLogger registration and reads its FailedWriteCount.
+        services.AddSiteEventLogHealthMetricsBridge(
+            sp => () => sp.GetRequiredService<ISiteEventLogger>().FailedWriteCount);
+
         // Audit Log (#23) — site-side hot-path writer + telemetry collaborators.
         // The SiteAuditTelemetryActor itself is registered by AkkaHostedService
         // in the site-role block; this call wires every DI dependency it (and
diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs
index 630822ef..a3ac9183 100644
--- a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs
@@ -32,10 +32,9 @@ public interface ISiteEventLogger
     ///
     /// SiteEventLogging-018: total number of event writes that have failed
     /// (SQLite error, disk full, bounded-queue overflow drop, etc.) since this
-    /// logger was created. Available for future Health Monitoring integration —
-    /// promoted onto the interface so a Health consumer can read it without a
-    /// concrete-type downcast. Not yet polled by Health Monitoring; the wiring
-    /// is tracked separately.
+    /// logger was created. Polled by <c>SiteEventLogFailureCountReporter</c>
+    /// (HealthMonitoring — M2.16 / #30) every 30 s and surfaced on the site
+    /// health report as <c>SiteHealthReport.SiteEventLogWriteFailures</c>.
     ///
     long FailedWriteCount { get; }
 }
diff --git a/tests/ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests/SiteEventLogFailureCountReporterTests.cs b/tests/ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests/SiteEventLogFailureCountReporterTests.cs
new file mode 100644
index 00000000..632bb60f
--- /dev/null
+++ b/tests/ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests/SiteEventLogFailureCountReporterTests.cs
@@ -0,0 +1,87 @@
+using System.Diagnostics;
+using Microsoft.Extensions.Logging.Abstractions;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// M2.16 (#30) — unit tests for <see cref="SiteEventLogFailureCountReporter"/>.
+/// Verifies that the poller reads the count provided by the
+/// <c>Func&lt;long&gt;</c> delegate and pushes it into
+/// <see cref="SiteHealthCollector"/>.
+/// </summary>
+public class SiteEventLogFailureCountReporterTests
+{
+    private static readonly TimeSpan PollTimeout = TimeSpan.FromSeconds(5);
+
+    [Fact]
+    public async Task StartAsync_ImmediatelyProbes_FailedWriteCount()
+    {
+        // Arrange
+        var collector = new SiteHealthCollector();
+        using var reporter = new SiteEventLogFailureCountReporter(
+            failedWriteCountProvider: () => 99L,
+            collector: collector,
+            logger: NullLogger<SiteEventLogFailureCountReporter>.Instance,
+            refreshInterval: TimeSpan.FromHours(1)); // long interval — only immediate tick matters
+
+        // Act
+        await reporter.StartAsync(CancellationToken.None);
+        try
+        {
+            await WaitForCountAsync(collector, 99L);
+
+            // Assert — the immediate probe before the first Delay must have fired.
+            Assert.Equal(99L, collector.CollectReport("site-1").SiteEventLogWriteFailures);
+        }
+        finally
+        {
+            await reporter.StopAsync(CancellationToken.None);
+        }
+    }
+
+    [Fact]
+    public async Task StartAsync_PushesLatestCount_OnEachTick()
+    {
+        // Arrange — start with count 5; advance to 12 after the first tick.
+        // The counter is read on the reporter's background task, so every access
+        // goes through Interlocked to guarantee cross-thread visibility.
+        var count = 5L;
+        var collector = new SiteHealthCollector();
+        using var reporter = new SiteEventLogFailureCountReporter(
+            failedWriteCountProvider: () => Interlocked.Read(ref count),
+            collector: collector,
+            logger: NullLogger<SiteEventLogFailureCountReporter>.Instance,
+            refreshInterval: TimeSpan.FromMilliseconds(50));
+
+        await reporter.StartAsync(CancellationToken.None);
+        try
+        {
+            // Wait for the immediate probe.
+            await WaitForCountAsync(collector, 5L);
+            Assert.Equal(5L, collector.CollectReport("site-1").SiteEventLogWriteFailures);
+
+            // Advance the counter and wait for the next tick to push the new value.
+            Interlocked.Exchange(ref count, 12L);
+            await WaitForCountAsync(collector, 12L);
+            Assert.Equal(12L, collector.CollectReport("site-1").SiteEventLogWriteFailures);
+        }
+        finally
+        {
+            await reporter.StopAsync(CancellationToken.None);
+        }
+    }
+
+    /// <summary>
+    /// Polls the collector until it reports <paramref name="expected"/> or
+    /// <see cref="PollTimeout"/> elapses; the caller's assertion reports a timeout.
+    /// </summary>
+    private static async Task WaitForCountAsync(SiteHealthCollector collector, long expected)
+    {
+        var elapsed = Stopwatch.StartNew();
+        while (collector.CollectReport("probe").SiteEventLogWriteFailures != expected
+               && elapsed.Elapsed < PollTimeout)
+        {
+            await Task.Delay(10);
+        }
+    }
+}
diff --git a/tests/ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests/SiteEventLogWriteFailuresMetricTests.cs b/tests/ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests/SiteEventLogWriteFailuresMetricTests.cs
new file mode 100644
index 00000000..1d3132dd
--- /dev/null
+++ b/tests/ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests/SiteEventLogWriteFailuresMetricTests.cs
@@ -0,0 +1,62 @@
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// M2.16 (#30) regression coverage.
+/// <c>ISiteEventLogger.FailedWriteCount</c> is a cumulative (point-in-time) counter. A periodic
+/// <c>SiteEventLogFailureCountReporter</c> hosted service polls the count and
+/// pushes it into the collector via
+/// <see cref="SiteHealthCollector.SetSiteEventLogWriteFailures"/> so the next
+/// <see cref="SiteHealthCollector.CollectReport"/> includes it in the report
+/// payload as <c>SiteEventLogWriteFailures</c>. Unlike the per-interval
+/// <c>SiteAuditWriteFailures</c> counter, this value is NOT reset on collect — it
+/// carries forward whatever the most recent poller push delivered.
+/// </summary>
+public class SiteEventLogWriteFailuresMetricTests
+{
+    private readonly SiteHealthCollector _collector = new();
+
+    [Fact]
+    public void Set_Then_CollectReport_IncludesCount()
+    {
+        _collector.SetSiteEventLogWriteFailures(17L);
+
+        var report = _collector.CollectReport("site-1");
+
+        Assert.Equal(17L, report.SiteEventLogWriteFailures);
+    }
+
+    [Fact]
+    public void Report_Payload_Includes_SiteEventLogWriteFailures_AsZeroByDefault()
+    {
+        var report = _collector.CollectReport("site-1");
+
+        Assert.Equal(0L, report.SiteEventLogWriteFailures);
+    }
+
+    [Fact]
+    public void CollectReport_DoesNotReset_SiteEventLogWriteFailures()
+    {
+        // This is a point-in-time cumulative count — successive CollectReport
+        // calls before the next poller tick MUST carry forward the same value
+        // rather than resetting to zero (which would falsely indicate no failures
+        // between the two reports).
+        _collector.SetSiteEventLogWriteFailures(42L);
+
+        var first = _collector.CollectReport("site-1");
+        var second = _collector.CollectReport("site-1");
+
+        Assert.Equal(42L, first.SiteEventLogWriteFailures);
+        Assert.Equal(42L, second.SiteEventLogWriteFailures);
+    }
+
+    [Fact]
+    public void Set_Overwrites_Previous_Value()
+    {
+        _collector.SetSiteEventLogWriteFailures(5L);
+        _collector.SetSiteEventLogWriteFailures(9L);
+
+        var report = _collector.CollectReport("site-1");
+
+        Assert.Equal(9L, report.SiteEventLogWriteFailures);
+    }
+}