feat(health): wire ISiteEventLogger.FailedWriteCount into SiteHealthReport (#30, M2.16)
Add SiteHealthReport.SiteEventLogWriteFailures (trailing optional long = 0, additive-only), ISiteHealthCollector.SetSiteEventLogWriteFailures (default no-op so existing fakes compile), and SiteEventLogFailureCountReporter (hosted service in HealthMonitoring, Func<long> delegate to avoid the HealthMonitoring → StoreAndForward → SiteEventLogging cycle). Registration helper AddSiteEventLogHealthMetricsBridge added to HealthMonitoring.ServiceCollectionExtensions; wired in SiteServiceRegistration after AddSiteEventLogging. Tests: SiteEventLogWriteFailuresMetricTests (4 collector tests) + SiteEventLogFailureCountReporterTests (2 poller tests) in HealthMonitoring.Tests. 79/79 HealthMonitoring.Tests green, 59/59 SiteEventLogging.Tests green, 0 warnings.
This commit is contained in:
@@ -111,6 +111,23 @@ public interface ISiteHealthCollector
|
||||
/// <param name="count">The number of parked messages.</param>
|
||||
void SetParkedMessageCount(int count);
|
||||
|
||||
/// <summary>
|
||||
/// Site Event Logging (#12) M2.16 (#30) — replace the latest cumulative
|
||||
/// site-event-log write-failure count (SQLite error, disk full,
|
||||
/// bounded-queue overflow drop) used by the next <see cref="CollectReport"/>
|
||||
/// call. Refreshed periodically by the <c>SiteEventLogFailureCountReporter</c>
|
||||
/// hosted service. Point-in-time: the value is NOT reset on
|
||||
/// <see cref="CollectReport"/>; it carries forward until the next poller
|
||||
/// refresh. Default interface implementation is a no-op so existing test
|
||||
/// fakes continue to compile without per-fake updates.
|
||||
/// </summary>
|
||||
/// <param name="count">The cumulative failed-write count from <c>ISiteEventLogger.FailedWriteCount</c>.</param>
|
||||
void SetSiteEventLogWriteFailures(long count)
|
||||
{
|
||||
// Default no-op so test fakes do not need to be updated. The real
|
||||
// SiteHealthCollector overrides this with the Interlocked.Exchange store.
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Sets the hostname of this node.
|
||||
/// </summary>
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
@@ -50,6 +52,68 @@ public static class ServiceCollectionExtensions
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Site Event Logging (#12) M2.16 (#30) — register the
|
||||
/// <see cref="SiteEventLogFailureCountReporter"/> hosted service that
|
||||
/// periodically reads the cumulative event-log write-failure count and
|
||||
/// pushes it into <see cref="ISiteHealthCollector"/> as a point-in-time
|
||||
/// snapshot (<c>SiteEventLogWriteFailures</c> on the site health report).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Must be called AFTER <see cref="AddSiteHealthMonitoring"/> (or
|
||||
/// <see cref="AddHealthMonitoring"/>) which registers the
|
||||
/// <see cref="ISiteHealthCollector"/> the reporter depends on.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Why a Func<long> delegate instead of ISiteEventLogger.</b>
|
||||
/// <c>HealthMonitoring</c> must not reference <c>SiteEventLogging</c> directly —
|
||||
/// the <c>StoreAndForward → SiteEventLogging</c> edge already exists in the
|
||||
/// transitive graph, and <c>HealthMonitoring → StoreAndForward</c> is an
|
||||
/// existing direct reference; adding <c>HealthMonitoring → SiteEventLogging</c>
|
||||
/// would complete a cycle. The <see cref="Func{TResult}"/> delegate seam keeps
|
||||
/// the dependency acyclic: the caller (Host site wiring) captures
|
||||
/// <c>ISiteEventLogger.FailedWriteCount</c> as a lambda and passes it here.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Idempotent — a sentinel check on the
|
||||
/// <see cref="SiteEventLogFailureCountReporter"/> hosted-service descriptor
|
||||
/// short-circuits subsequent calls so the hosted service is not
|
||||
/// double-registered (AddHostedService has no TryAdd variant).
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <param name="services">The service collection to register into.</param>
|
||||
/// <param name="failedWriteCountProvider">
|
||||
/// A factory delegate that, given the root <see cref="IServiceProvider"/>,
|
||||
/// returns a <see cref="Func{TResult}"/> that reads the current cumulative
|
||||
/// event-log write-failure count. Typically:
|
||||
/// <c>sp => () => sp.GetRequiredService<ISiteEventLogger>().FailedWriteCount</c>.
|
||||
/// The factory is evaluated once at hosted-service resolution time; the inner
|
||||
/// <see cref="Func{TResult}"/> is called on every poll tick.
|
||||
/// </param>
|
||||
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
|
||||
public static IServiceCollection AddSiteEventLogHealthMetricsBridge(
|
||||
this IServiceCollection services,
|
||||
Func<IServiceProvider, Func<long>> failedWriteCountProvider)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
ArgumentNullException.ThrowIfNull(failedWriteCountProvider);
|
||||
|
||||
// Idempotent guard — mirrors AddAuditLogHealthMetricsBridge's
|
||||
// SiteAuditBacklogReporter sentinel check.
|
||||
if (services.Any(d => d.ImplementationType == typeof(SiteEventLogFailureCountReporter)))
|
||||
{
|
||||
return services;
|
||||
}
|
||||
|
||||
services.AddHostedService(sp => new SiteEventLogFailureCountReporter(
|
||||
failedWriteCountProvider(sp),
|
||||
sp.GetRequiredService<ISiteHealthCollector>(),
|
||||
sp.GetRequiredService<ILogger<SiteEventLogFailureCountReporter>>()));
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// HealthMonitoring-014: register the <see cref="HealthMonitoringOptionsValidator"/>
|
||||
/// so a misconfigured <c>ScadaBridge:HealthMonitoring</c> section (zero/negative
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
|
||||
/// Site Event Logging (#12) M2.16 (#30) — site-side hosted service that
|
||||
/// periodically reads the cumulative event-log write-failure count and pushes
|
||||
/// it into <see cref="ISiteHealthCollector"/> so the next
|
||||
/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
|
||||
/// <c>SiteEventLogWriteFailures</c> field on the site health report.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Why a Func<long> and not ISiteEventLogger directly.</b>
|
||||
/// <c>HealthMonitoring</c> does not (and cannot) reference
|
||||
/// <c>SiteEventLogging</c> — <c>HealthMonitoring → StoreAndForward →
|
||||
/// SiteEventLogging</c> already exists in the transitive graph, so adding a
|
||||
/// direct reference would create a cycle. The <see cref="Func{TResult}"/>
|
||||
/// delegate seam breaks the coupling: the caller (Host site wiring) captures
|
||||
/// <c>ISiteEventLogger.FailedWriteCount</c> as a lambda at registration
|
||||
/// time, and this service reads only the numeric result. The delegate
|
||||
/// approach is a standard pattern for counter bridges and keeps the
|
||||
/// registration path self-documenting.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Cadence.</b> 30 s by default — the same cadence as
|
||||
/// <c>SiteAuditBacklogReporter</c>, which is coarse enough to stay within
|
||||
/// the health-report interval budget while keeping the central dashboard
|
||||
/// current.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Failure containment.</b> Any unexpected exception during the probe is
|
||||
/// caught and logged; the next tick retries. Mirrors
|
||||
/// <c>SiteAuditBacklogReporter</c>'s "exception logged, not propagated"
|
||||
/// contract.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class SiteEventLogFailureCountReporter : IHostedService, IDisposable
|
||||
{
|
||||
/// <summary>
|
||||
/// Default poll cadence. Matches <c>SiteAuditBacklogReporter.DefaultRefreshInterval</c>
|
||||
/// (30 s) — coarse enough to amortise the read across many reports, fine
|
||||
/// enough that the central dashboard never lags by more than one
|
||||
/// health-report interval.
|
||||
/// </summary>
|
||||
internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
|
||||
|
||||
private readonly Func<long> _failedWriteCountProvider;
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
private readonly ILogger<SiteEventLogFailureCountReporter> _logger;
|
||||
private readonly TimeSpan _refreshInterval;
|
||||
private CancellationTokenSource? _cts;
|
||||
private Task? _loop;
|
||||
|
||||
/// <summary>Initializes a new instance of <see cref="SiteEventLogFailureCountReporter"/>.</summary>
|
||||
/// <param name="failedWriteCountProvider">
|
||||
/// A delegate that returns the current cumulative event-log write-failure count.
|
||||
/// Typically wired as <c>() => sp.GetRequiredService<ISiteEventLogger>().FailedWriteCount</c>
|
||||
/// in the Host site composition root.
|
||||
/// </param>
|
||||
/// <param name="collector">The site health collector that receives the failure-count snapshot.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="refreshInterval">Poll interval override; defaults to <see cref="DefaultRefreshInterval"/> (30 s).</param>
|
||||
public SiteEventLogFailureCountReporter(
|
||||
Func<long> failedWriteCountProvider,
|
||||
ISiteHealthCollector collector,
|
||||
ILogger<SiteEventLogFailureCountReporter> logger,
|
||||
TimeSpan? refreshInterval = null)
|
||||
{
|
||||
_failedWriteCountProvider = failedWriteCountProvider
|
||||
?? throw new ArgumentNullException(nameof(failedWriteCountProvider));
|
||||
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_refreshInterval = refreshInterval ?? DefaultRefreshInterval;
|
||||
}
|
||||
|
||||
/// <summary>Starts the background polling loop, running an immediate first probe before entering the timed cycle.</summary>
|
||||
/// <param name="ct">Cancellation token signalling host shutdown.</param>
|
||||
/// <returns>A task that represents the asynchronous operation.</returns>
|
||||
public Task StartAsync(CancellationToken ct)
|
||||
{
|
||||
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
|
||||
// token both terminate the loop; either side firing aborts the
|
||||
// pending Task.Delay.
|
||||
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private async Task RunLoopAsync(CancellationToken ct)
|
||||
{
|
||||
// First tick runs immediately so the very first health report after
|
||||
// process start carries a real failure-count snapshot — without this
|
||||
// the dashboard would show 0 for the first 30 s after a deploy even
|
||||
// if failures had already accumulated.
|
||||
SafeProbe();
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
SafeProbe();
|
||||
}
|
||||
}
|
||||
|
||||
private void SafeProbe()
|
||||
{
|
||||
try
|
||||
{
|
||||
var count = _failedWriteCountProvider();
|
||||
_collector.SetSiteEventLogWriteFailures(count);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Catch-all is deliberate: the hosted service must survive every
|
||||
// class of probe failure so the next tick gets a chance. Mirrors
|
||||
// SiteAuditBacklogReporter's "exception logged, not propagated" contract.
|
||||
_logger.LogWarning(ex, "SiteEventLogFailureCountReporter probe failed; next tick will retry.");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Signals the polling loop to stop and waits for it to complete.</summary>
|
||||
/// <param name="ct">Cancellation token (not used; the internal CTS governs shutdown).</param>
|
||||
/// <returns>A task that represents the asynchronous operation.</returns>
|
||||
public Task StopAsync(CancellationToken ct)
|
||||
{
|
||||
_cts?.Cancel();
|
||||
return _loop ?? Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <summary>Releases the internal <see cref="CancellationTokenSource"/> used to stop the polling loop.</summary>
|
||||
public void Dispose()
|
||||
{
|
||||
_cts?.Dispose();
|
||||
}
|
||||
}
|
||||
@@ -17,6 +17,7 @@ public class SiteHealthCollector : ISiteHealthCollector
|
||||
private int _siteAuditWriteFailures;
|
||||
private int _auditRedactionFailures;
|
||||
private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog;
|
||||
private long _siteEventLogWriteFailures;
|
||||
private readonly ConcurrentDictionary<string, ConnectionHealth> _connectionStatuses = new();
|
||||
private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
|
||||
private readonly ConcurrentDictionary<string, string> _connectionEndpoints = new();
|
||||
@@ -77,6 +78,12 @@ public class SiteHealthCollector : ISiteHealthCollector
|
||||
_siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public void SetSiteEventLogWriteFailures(long count)
|
||||
{
|
||||
Interlocked.Exchange(ref _siteEventLogWriteFailures, count);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
|
||||
{
|
||||
@@ -206,6 +213,7 @@ public class SiteHealthCollector : ISiteHealthCollector
|
||||
ClusterNodes: _clusterNodes?.ToList(),
|
||||
SiteAuditWriteFailures: siteAuditWriteFailures,
|
||||
AuditRedactionFailure: auditRedactionFailures,
|
||||
SiteAuditBacklog: _siteAuditBacklog);
|
||||
SiteAuditBacklog: _siteAuditBacklog,
|
||||
SiteEventLogWriteFailures: Interlocked.Read(ref _siteEventLogWriteFailures));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user