From dd3351da930144becac180c36dc0a0608008ae8d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 13:22:25 -0400 Subject: [PATCH] feat(health): SiteAuditWriteFailures counter + AuditLog bridge (#23) Bundle G of Audit Log #23 M2. Bridges the FallbackAuditWriter primary-failure counter into the Site Health Monitoring report payload so a sustained audit-write outage surfaces on /monitoring/health instead of disappearing into a NoOp sink. - SiteHealthReport: add SiteAuditWriteFailures (defaulted, additive). - ISiteHealthCollector + SiteHealthCollector: new IncrementSiteAuditWriteFailures() counter, per-interval reset semantics matching ScriptErrorCount / DeadLetterCount. - HealthMetricsAuditWriteFailureCounter: adapter forwarding IAuditWriteFailureCounter.Increment() to the collector. - AddAuditLogHealthMetricsBridge(): swaps the NoOp default registration for the real bridge; called from SiteServiceRegistration after AddSiteHealthMonitoring + AddAuditLog. - Existing host-wiring test updated: site composition now resolves HealthMetricsAuditWriteFailureCounter (not NoOp). Tests: HealthMonitoring 60 -> 63 (3 new), AuditLog 56 -> 59 (3 new), full solution green. 
--- .../ServiceCollectionExtensions.cs | 33 ++++++++++++ .../HealthMetricsAuditWriteFailureCounter.cs | 33 ++++++++++++ .../Messages/Health/SiteHealthReport.cs | 7 ++- .../ISiteHealthCollector.cs | 7 +++ .../SiteHealthCollector.cs | 17 +++++- src/ScadaLink.Host/SiteServiceRegistration.cs | 7 +++ .../AddAuditLogTests.cs | 53 +++++++++++++++++++ ...lthMetricsAuditWriteFailureCounterTests.cs | 46 ++++++++++++++++ .../SiteAuditWriteFailuresMetricTests.cs | 52 ++++++++++++++++++ .../AkkaHostedServiceAuditWiringTests.cs | 9 +++- .../Actors/DeploymentManagerRedeployTests.cs | 1 + 11 files changed, 261 insertions(+), 4 deletions(-) create mode 100644 src/ScadaLink.AuditLog/Site/HealthMetricsAuditWriteFailureCounter.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Site/HealthMetricsAuditWriteFailureCounterTests.cs create mode 100644 tests/ScadaLink.HealthMonitoring.Tests/SiteAuditWriteFailuresMetricTests.cs diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index b8b183c..34d3a23 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -1,5 +1,6 @@ using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.DependencyInjection.Extensions; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.AuditLog.Configuration; @@ -103,4 +104,36 @@ public static class ServiceCollectionExtensions return services; } + + /// + /// Audit Log (#23) M2 Bundle G — swap the default + /// registration for the real + /// bridge so the + /// FallbackAuditWriter primary-failure counter surfaces in the site health + /// report payload as SiteHealthReport.SiteAuditWriteFailures. 
+ /// + /// + /// + /// Must be called AFTER both (registers the + /// NoOp default this method replaces) and + /// ScadaLink.HealthMonitoring.ServiceCollectionExtensions.AddHealthMonitoring + /// or AddSiteHealthMonitoring (registers the + /// the bridge depends on). Resolving + /// without the latter throws + /// at GetRequiredService + /// time — by design, since a silent NoOp would mask a misconfiguration. + /// + /// + /// Idempotent — calling twice replaces the descriptor each time without + /// piling up registrations. + /// + /// + public static IServiceCollection AddAuditLogHealthMetricsBridge(this IServiceCollection services) + { + ArgumentNullException.ThrowIfNull(services); + + services.Replace( + ServiceDescriptor.Singleton()); + return services; + } } diff --git a/src/ScadaLink.AuditLog/Site/HealthMetricsAuditWriteFailureCounter.cs b/src/ScadaLink.AuditLog/Site/HealthMetricsAuditWriteFailureCounter.cs new file mode 100644 index 0000000..7284727 --- /dev/null +++ b/src/ScadaLink.AuditLog/Site/HealthMetricsAuditWriteFailureCounter.cs @@ -0,0 +1,33 @@ +using ScadaLink.HealthMonitoring; + +namespace ScadaLink.AuditLog.Site; + +/// +/// Audit Log (#23) M2 Bundle G — bridges +/// (incremented by every time the primary +/// SQLite writer throws) into so the count +/// surfaces in the site health report payload as +/// SiteHealthReport.SiteAuditWriteFailures. +/// +/// +/// +/// Registered by ; +/// callers must register AddHealthMonitoring() first so +/// resolves. The default +/// registration keeps for nodes +/// where Site Health Monitoring is not wired (the silent-sink contract — audit +/// write failures must NEVER abort the user-facing action, alog.md §7). +/// +/// +public sealed class HealthMetricsAuditWriteFailureCounter : IAuditWriteFailureCounter +{ + private readonly ISiteHealthCollector _collector; + + public HealthMetricsAuditWriteFailureCounter(ISiteHealthCollector collector) + { + _collector = collector ?? 
throw new ArgumentNullException(nameof(collector)); + } + + /// + public void Increment() => _collector.IncrementSiteAuditWriteFailures(); +} diff --git a/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs b/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs index d61b37e..516d4f3 100644 --- a/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs +++ b/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs @@ -20,7 +20,12 @@ public record SiteHealthReport( IReadOnlyDictionary? DataConnectionEndpoints = null, IReadOnlyDictionary? DataConnectionTagQuality = null, int ParkedMessageCount = 0, - IReadOnlyList? ClusterNodes = null); + IReadOnlyList? ClusterNodes = null, + // Audit Log (#23) M2 Bundle G: per-interval count of FallbackAuditWriter + // primary failures (SQLite throws routed to the drop-oldest ring). Surfaces + // a sustained audit-write outage on /monitoring/health. Defaults to 0 so + // existing producers / tests that don't construct the field stay valid. + int SiteAuditWriteFailures = 0); /// /// Broadcast wrapper used between central nodes to keep per-node diff --git a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs index 1210833..c16c45f 100644 --- a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs @@ -12,6 +12,13 @@ public interface ISiteHealthCollector void IncrementScriptError(); void IncrementAlarmError(); void IncrementDeadLetter(); + /// + /// Audit Log (#23) Bundle G — increment the per-interval count of + /// FallbackAuditWriter primary failures. Bridged from the + /// IAuditWriteFailureCounter binding registered via + /// AddAuditLogHealthMetricsBridge(). 
+ /// + void IncrementSiteAuditWriteFailures(); void UpdateConnectionHealth(string connectionName, ConnectionHealth health); void RemoveConnection(string connectionName); void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved); diff --git a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs index ca05cf9..1a6aa48 100644 --- a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs @@ -13,6 +13,7 @@ public class SiteHealthCollector : ISiteHealthCollector private int _scriptErrorCount; private int _alarmErrorCount; private int _deadLetterCount; + private int _siteAuditWriteFailures; private readonly ConcurrentDictionary _connectionStatuses = new(); private readonly ConcurrentDictionary _tagResolutionCounts = new(); private readonly ConcurrentDictionary _connectionEndpoints = new(); @@ -61,6 +62,18 @@ public class SiteHealthCollector : ISiteHealthCollector Interlocked.Increment(ref _deadLetterCount); } + /// + /// Audit Log (#23) Bundle G — increment the per-interval count of + /// FallbackAuditWriter primary failures. Bridged from the + /// IAuditWriteFailureCounter binding registered via + /// AddAuditLogHealthMetricsBridge(); reset every interval together + /// with the other per-interval counters. + /// + public void IncrementSiteAuditWriteFailures() + { + Interlocked.Increment(ref _siteAuditWriteFailures); + } + /// /// Update the health status for a named data connection. /// Called by DCL when connection state changes. 
@@ -144,6 +157,7 @@ public class SiteHealthCollector : ISiteHealthCollector var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0); var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0); var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0); + var siteAuditWriteFailures = Interlocked.Exchange(ref _siteAuditWriteFailures, 0); // Snapshot current connection and tag resolution state var connectionStatuses = new Dictionary(_connectionStatuses); @@ -175,6 +189,7 @@ public class SiteHealthCollector : ISiteHealthCollector DataConnectionEndpoints: connectionEndpoints, DataConnectionTagQuality: tagQuality, ParkedMessageCount: Interlocked.CompareExchange(ref _parkedMessageCount, 0, 0), - ClusterNodes: _clusterNodes?.ToList()); + ClusterNodes: _clusterNodes?.ToList(), + SiteAuditWriteFailures: siteAuditWriteFailures); } } diff --git a/src/ScadaLink.Host/SiteServiceRegistration.cs b/src/ScadaLink.Host/SiteServiceRegistration.cs index e92b583..dd13484 100644 --- a/src/ScadaLink.Host/SiteServiceRegistration.cs +++ b/src/ScadaLink.Host/SiteServiceRegistration.cs @@ -51,6 +51,13 @@ public static class SiteServiceRegistration // ScriptRuntimeContext, when Bundle F lands) reaches for. services.AddAuditLog(config); + // Audit Log (#23) M2 Bundle G — bridge FallbackAuditWriter primary + // failures into the site health report payload as + // SiteAuditWriteFailures. Must come AFTER both AddSiteHealthMonitoring + // (registers ISiteHealthCollector) and AddAuditLog (registers the + // NoOp default this call replaces). 
+ services.AddAuditLogHealthMetricsBridge(); + // WP-13: Akka.NET bootstrap via hosted service services.AddSingleton(); services.AddHostedService(sp => sp.GetRequiredService()); diff --git a/tests/ScadaLink.AuditLog.Tests/AddAuditLogTests.cs b/tests/ScadaLink.AuditLog.Tests/AddAuditLogTests.cs index afe70f4..03d337a 100644 --- a/tests/ScadaLink.AuditLog.Tests/AddAuditLogTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/AddAuditLogTests.cs @@ -7,6 +7,7 @@ using ScadaLink.AuditLog.Configuration; using ScadaLink.AuditLog.Site; using ScadaLink.AuditLog.Site.Telemetry; using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.HealthMonitoring; namespace ScadaLink.AuditLog.Tests; @@ -187,4 +188,56 @@ public class AddAuditLogTests Assert.Equal(3, opts.BusyIntervalSeconds); Assert.Equal(60, opts.IdleIntervalSeconds); } + + // -- Bundle G (M2 Task G1) Site Health Monitoring bridge ---------------- + + [Fact] + public void AddAuditLogHealthMetricsBridge_Swaps_FailureCounter_To_HealthMetrics_Implementation() + { + var config = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["AuditLog:SiteWriter:DatabasePath"] = ":memory:", + }) + .Build(); + + var services = new ServiceCollection(); + services.AddSingleton(); + services.AddSingleton(typeof(ILogger<>), typeof(NullLogger<>)); + services.AddAuditLog(config); + // The bridge depends on ISiteHealthCollector; AddHealthMonitoring is + // what registers it on the site (and the central self-host). + services.AddHealthMonitoring(); + services.AddAuditLogHealthMetricsBridge(); + using var provider = services.BuildServiceProvider(); + + var counter = provider.GetRequiredService(); + + Assert.IsType(counter); + } + + [Fact] + public void AddAuditLogHealthMetricsBridge_Without_HealthMonitoring_Still_Resolves_But_Errors_On_Use() + { + // The bridge replaces the registration unconditionally; resolving the + // counter when ISiteHealthCollector is missing throws at GetRequiredService + // time. 
This documents the contract — callers must register + // AddHealthMonitoring() before the bridge. + var config = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["AuditLog:SiteWriter:DatabasePath"] = ":memory:", + }) + .Build(); + + var services = new ServiceCollection(); + services.AddSingleton(); + services.AddSingleton(typeof(ILogger<>), typeof(NullLogger<>)); + services.AddAuditLog(config); + services.AddAuditLogHealthMetricsBridge(); + using var provider = services.BuildServiceProvider(); + + Assert.Throws( + () => provider.GetRequiredService()); + } } diff --git a/tests/ScadaLink.AuditLog.Tests/Site/HealthMetricsAuditWriteFailureCounterTests.cs b/tests/ScadaLink.AuditLog.Tests/Site/HealthMetricsAuditWriteFailureCounterTests.cs new file mode 100644 index 0000000..cb8d9d2 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Site/HealthMetricsAuditWriteFailureCounterTests.cs @@ -0,0 +1,46 @@ +using NSubstitute; +using ScadaLink.AuditLog.Site; +using ScadaLink.HealthMonitoring; + +namespace ScadaLink.AuditLog.Tests.Site; + +/// +/// Bundle G (M2-T11) — the +/// adapter is the production binding for +/// on site nodes; it forwards every FallbackAuditWriter primary failure into +/// the shared so the site health report +/// surfaces the failure count as SiteAuditWriteFailures. 
+/// +public class HealthMetricsAuditWriteFailureCounterTests +{ + [Fact] + public void Increment_Routes_To_Collector_IncrementSiteAuditWriteFailures() + { + var collector = Substitute.For(); + var counter = new HealthMetricsAuditWriteFailureCounter(collector); + + counter.Increment(); + + collector.Received(1).IncrementSiteAuditWriteFailures(); + } + + [Fact] + public void Increment_Multiple_Calls_Route_To_Collector_Each_Time() + { + var collector = Substitute.For(); + var counter = new HealthMetricsAuditWriteFailureCounter(collector); + + counter.Increment(); + counter.Increment(); + counter.Increment(); + + collector.Received(3).IncrementSiteAuditWriteFailures(); + } + + [Fact] + public void Construction_With_Null_Collector_Throws_ArgumentNullException() + { + Assert.Throws( + () => new HealthMetricsAuditWriteFailureCounter(null!)); + } +} diff --git a/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditWriteFailuresMetricTests.cs b/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditWriteFailuresMetricTests.cs new file mode 100644 index 0000000..0fdb533 --- /dev/null +++ b/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditWriteFailuresMetricTests.cs @@ -0,0 +1,52 @@ +namespace ScadaLink.HealthMonitoring.Tests; + +/// +/// Bundle G (M2-T11) regression coverage. The site-side Audit Log writer chain +/// (FallbackAuditWriter) increments +/// every time the primary SQLite writer throws. Bundle G bridges that counter +/// into the Site Health Monitoring report payload as SiteAuditWriteFailures +/// so a sustained audit-write outage surfaces on /monitoring/health rather than +/// disappearing into a NoOp sink. 
+/// +public class SiteAuditWriteFailuresMetricTests +{ + private readonly SiteHealthCollector _collector = new(); + + [Fact] + public void Increment_Three_Times_Counter_Reports_3() + { + _collector.IncrementSiteAuditWriteFailures(); + _collector.IncrementSiteAuditWriteFailures(); + _collector.IncrementSiteAuditWriteFailures(); + + var report = _collector.CollectReport("site-1"); + + Assert.Equal(3, report.SiteAuditWriteFailures); + } + + [Fact] + public void Report_Payload_Includes_SiteAuditWriteFailures_AsZeroByDefault() + { + var report = _collector.CollectReport("site-1"); + + Assert.Equal(0, report.SiteAuditWriteFailures); + } + + /// + /// Mirrors the existing per-interval reset semantics for ScriptErrorCount / + /// AlarmEvaluationErrorCount / DeadLetterCount — SiteAuditWriteFailures is an + /// interval count, not a running total. + /// + [Fact] + public void CollectReport_Resets_SiteAuditWriteFailures() + { + _collector.IncrementSiteAuditWriteFailures(); + _collector.IncrementSiteAuditWriteFailures(); + + var first = _collector.CollectReport("site-1"); + Assert.Equal(2, first.SiteAuditWriteFailures); + + var second = _collector.CollectReport("site-1"); + Assert.Equal(0, second.SiteAuditWriteFailures); + } +} diff --git a/tests/ScadaLink.Host.Tests/AkkaHostedServiceAuditWiringTests.cs b/tests/ScadaLink.Host.Tests/AkkaHostedServiceAuditWiringTests.cs index ed84701..392dc38 100644 --- a/tests/ScadaLink.Host.Tests/AkkaHostedServiceAuditWiringTests.cs +++ b/tests/ScadaLink.Host.Tests/AkkaHostedServiceAuditWiringTests.cs @@ -274,11 +274,16 @@ public class SiteAuditWiringTests : IDisposable } [Fact] - public void Site_Resolves_IAuditWriteFailureCounter_AsNoOpDefault() + public void Site_Resolves_IAuditWriteFailureCounter_AsHealthMetricsBridge() { + // Bundle G (M2-T11): site composition root calls + // AddAuditLogHealthMetricsBridge() after AddAuditLog + AddSiteHealthMonitoring, + // which swaps the NoOp default for the real health-metrics bridge so + // 
FallbackAuditWriter primary failures surface in the site health + // report payload as SiteAuditWriteFailures. var counter = _host.Services.GetService(); Assert.NotNull(counter); - Assert.IsType(counter); + Assert.IsType(counter); } [Fact] diff --git a/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs index 7778c65..9548631 100644 --- a/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs +++ b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs @@ -69,6 +69,7 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable public void IncrementScriptError() { } public void IncrementAlarmError() { } public void IncrementDeadLetter() { } + public void IncrementSiteAuditWriteFailures() { } public void UpdateConnectionHealth(string connectionName, ConnectionHealth health) { } public void RemoveConnection(string connectionName) { } public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved) { }