fix(health-monitoring): resolve HealthMonitoring-013,014,016 — shorter-timeout cadence, options validation, injected TimeProvider; HealthMonitoring-015 left open (cross-module design decision)

This commit is contained in:
Joseph Doherty
2026-05-17 03:18:24 -04:00
parent da8c9f171b
commit eae4077414
8 changed files with 296 additions and 12 deletions

View File

@@ -191,9 +191,10 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
"Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
_options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);
// Check at half the (shorter) offline timeout interval for timely detection
var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);
using var timer = new PeriodicTimer(checkInterval);
// Check at half the shorter of the two offline timeouts so detection is
// timely for whichever site class (real or "central") has the tighter
// window — see ComputeCheckInterval.
using var timer = new PeriodicTimer(ComputeCheckInterval(_options));
while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
{
@@ -201,6 +202,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
}
}
/// <summary>
/// Computes the offline-check timer cadence: half of the <em>shorter</em> of
/// <see cref="HealthMonitoringOptions.OfflineTimeout"/> and
/// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/>, clamped into
/// the period range that <see cref="PeriodicTimer"/> accepts.
/// </summary>
/// <remarks>
/// Deriving the cadence from the shorter timeout means whichever site class
/// has the tighter window is still polled at least twice within it, so a
/// <c>CentralOfflineTimeout</c> smaller than <c>OfflineTimeout</c> does not
/// delay central offline detection by up to a full <c>OfflineTimeout / 2</c>.
/// <para>
/// <see cref="PeriodicTimer"/> truncates its period to whole milliseconds and
/// throws <see cref="ArgumentOutOfRangeException"/> unless the result lies in
/// 1 .. 2^32-2 ms. A positive timeout below 2 ms (or above roughly 99 days)
/// would therefore still crash the timer constructor; such results are
/// clamped to the nearest valid period instead.
/// </para>
/// <para>
/// Zero or negative timeouts are configuration errors and are deliberately
/// <em>not</em> clamped: the raw (non-positive) half is returned so the
/// problem still surfaces rather than silently turning into 1 ms polling.
/// </para>
/// </remarks>
/// <param name="options">Options supplying the two offline timeouts.</param>
/// <returns>The interval at which the offline check should run.</returns>
internal static TimeSpan ComputeCheckInterval(HealthMonitoringOptions options)
{
    var shorter = options.OfflineTimeout < options.CentralOfflineTimeout
        ? options.OfflineTimeout
        : options.CentralOfflineTimeout;
    var half = TimeSpan.FromMilliseconds(shorter.TotalMilliseconds / 2);

    // Do not mask a zero/negative timeout: hand the invalid value back so the
    // PeriodicTimer constructor (or an upstream validator) still rejects it.
    if (shorter <= TimeSpan.Zero)
    {
        return half;
    }

    // Bounds of the period PeriodicTimer accepts (whole milliseconds).
    var minPeriod = TimeSpan.FromMilliseconds(1);
    var maxPeriod = TimeSpan.FromMilliseconds((double)(uint.MaxValue - 1));

    if (half < minPeriod)
    {
        return minPeriod;
    }

    return half > maxPeriod ? maxPeriod : half;
}
internal void CheckForOfflineSites()
{
var now = _timeProvider.GetUtcNow();

View File

@@ -0,0 +1,59 @@
using Microsoft.Extensions.Options;
namespace ScadaLink.HealthMonitoring;
/// <summary>
/// HealthMonitoring-014: validates <see cref="HealthMonitoringOptions"/>.
/// The interval values are fed straight into <c>new PeriodicTimer(...)</c>
/// (and into a division for the offline-check cadence); a zero, negative or
/// sub-millisecond value makes <see cref="PeriodicTimer"/>'s constructor throw
/// <see cref="ArgumentOutOfRangeException"/>, crashing the
/// <see cref="HealthReportSender"/> / <see cref="CentralHealthReportLoop"/> /
/// <see cref="CentralHealthAggregator"/> hosted service with an opaque exception
/// that does not name the offending config key. This validator turns a bad
/// <c>ScadaLink:HealthMonitoring</c> section into a clear, key-naming failure.
/// </summary>
/// <remarks>
/// <para>
/// <see cref="PeriodicTimer"/> truncates its period to whole milliseconds and
/// requires at least 1 ms, so a merely "positive" duration is not sufficient:
/// <c>ReportInterval</c> must be at least 1 ms, and <c>OfflineTimeout</c> must be
/// at least 2 ms because half of it is used as the offline-check period.
/// </para>
/// <para>
/// NOTE(review): the registration in <c>ServiceCollectionExtensions</c> adds this
/// type as an <see cref="IValidateOptions{TOptions}"/> only; confirm the host
/// also enables eager validation (e.g. <c>ValidateOnStart()</c>) if failing at
/// boot — rather than on first options access — is required.
/// </para>
/// </remarks>
public sealed class HealthMonitoringOptionsValidator : IValidateOptions<HealthMonitoringOptions>
{
    /// <summary>Smallest period <see cref="PeriodicTimer"/> accepts.</summary>
    private static readonly TimeSpan MinimumTimerPeriod = TimeSpan.FromMilliseconds(1);

    /// <summary>
    /// Smallest <c>OfflineTimeout</c> whose half is still a valid timer period.
    /// </summary>
    private static readonly TimeSpan MinimumOfflineTimeout = TimeSpan.FromMilliseconds(2);

    /// <summary>
    /// Checks every interval in <paramref name="options"/> and collects one
    /// message per violated rule, so all problems are reported together.
    /// </summary>
    /// <param name="name">Options instance name; not used by these rules.</param>
    /// <param name="options">The bound options to validate.</param>
    /// <returns>
    /// <see cref="ValidateOptionsResult.Success"/> when every rule holds;
    /// otherwise a failure carrying all collected messages.
    /// </returns>
    public ValidateOptionsResult Validate(string? name, HealthMonitoringOptions options)
    {
        var failures = new List<string>();

        if (options.ReportInterval <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:ReportInterval must be a positive duration " +
                $"(was {options.ReportInterval}); it is used directly as a PeriodicTimer period.");
        }
        else if (options.ReportInterval < MinimumTimerPeriod)
        {
            // Positive but truncates to 0 ms inside PeriodicTimer, which then throws.
            failures.Add(
                $"ScadaLink:HealthMonitoring:ReportInterval must be at least 1 ms " +
                $"(was {options.ReportInterval}); it is used directly as a PeriodicTimer period, " +
                "which is truncated to whole milliseconds.");
        }

        if (options.OfflineTimeout <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:OfflineTimeout must be a positive duration " +
                $"(was {options.OfflineTimeout}); it drives the offline-check PeriodicTimer cadence.");
        }
        else if (options.OfflineTimeout < MinimumOfflineTimeout)
        {
            // Half of anything below 2 ms truncates to 0 ms as a timer period.
            failures.Add(
                $"ScadaLink:HealthMonitoring:OfflineTimeout must be at least 2 ms " +
                $"(was {options.OfflineTimeout}); half of it is used as the offline-check " +
                "PeriodicTimer period, which is truncated to whole milliseconds.");
        }

        if (options.CentralOfflineTimeout <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:CentralOfflineTimeout must be a positive duration " +
                $"(was {options.CentralOfflineTimeout}).");
        }

        // Only compare the two timeouts when both are individually positive, so a
        // single bad value does not also produce a misleading ordering message.
        if (options.OfflineTimeout > TimeSpan.Zero
            && options.CentralOfflineTimeout > TimeSpan.Zero
            && options.CentralOfflineTimeout < options.OfflineTimeout)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:CentralOfflineTimeout ({options.CentralOfflineTimeout}) " +
                $"must be >= OfflineTimeout ({options.OfflineTimeout}): the synthetic 'central' site has " +
                "no heartbeat source and is fed only by the slower self-report loop, so it needs at " +
                "least as much offline grace as a real site.");
        }

        return failures.Count > 0
            ? ValidateOptionsResult.Fail(failures)
            : ValidateOptionsResult.Success;
    }
}

View File

@@ -1,4 +1,6 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Options;
namespace ScadaLink.HealthMonitoring;
@@ -10,6 +12,7 @@ public static class ServiceCollectionExtensions
/// </summary>
public static IServiceCollection AddSiteHealthMonitoring(this IServiceCollection services)
{
AddOptionsValidation(services);
services.AddSingleton<ISiteHealthCollector, SiteHealthCollector>();
services.AddHostedService<HealthReportSender>();
return services;
@@ -21,6 +24,7 @@ public static class ServiceCollectionExtensions
/// </summary>
public static IServiceCollection AddHealthMonitoring(this IServiceCollection services)
{
    // Hook up options validation (a no-op if another Add* method already did).
    AddOptionsValidation(services);

    // AddSingleton hands back the same collection, so return it directly.
    return services.AddSingleton<ISiteHealthCollector, SiteHealthCollector>();
}
@@ -32,10 +36,27 @@ public static class ServiceCollectionExtensions
/// </summary>
public static IServiceCollection AddCentralHealthAggregation(this IServiceCollection services)
{
    // Hook up options validation (a no-op if another Add* method already did).
    AddOptionsValidation(services);

    // One CentralHealthAggregator singleton is exposed three ways: as itself,
    // as ICentralHealthAggregator, and as a hosted service. The forwarding
    // factories make all three resolve to the same instance. Registration
    // order is kept: aggregator first, then the report loop.
    return services
        .AddSingleton<CentralHealthAggregator>()
        .AddSingleton<ICentralHealthAggregator>(provider => provider.GetRequiredService<CentralHealthAggregator>())
        .AddHostedService(provider => provider.GetRequiredService<CentralHealthAggregator>())
        .AddHostedService<CentralHealthReportLoop>();
}
/// <summary>
/// HealthMonitoring-014: adds <see cref="HealthMonitoringOptionsValidator"/> as an
/// <see cref="IValidateOptions{TOptions}"/> for <see cref="HealthMonitoringOptions"/>.
/// A misconfigured <c>ScadaLink:HealthMonitoring</c> section (zero/negative
/// intervals, or a <c>CentralOfflineTimeout</c> shorter than <c>OfflineTimeout</c>)
/// is then rejected with a clear, key-naming message when the hosted services
/// resolve their options at startup, instead of failing later inside a
/// <see cref="PeriodicTimer"/> constructor with an opaque
/// <see cref="ArgumentOutOfRangeException"/>.
/// </summary>
/// <remarks>
/// Uses <c>TryAddEnumerable</c>, so calling it from several of the registration
/// methods above registers the validator only once.
/// </remarks>
/// <param name="services">The service collection to register the validator in.</param>
private static void AddOptionsValidation(IServiceCollection services)
{
    var validatorDescriptor = ServiceDescriptor
        .Singleton<IValidateOptions<HealthMonitoringOptions>, HealthMonitoringOptionsValidator>();

    services.TryAddEnumerable(validatorDescriptor);
}
}

View File

@@ -23,6 +23,18 @@ public class SiteHealthCollector : ISiteHealthCollector
private volatile string _nodeHostname = "";
private volatile IReadOnlyList<Commons.Messages.Health.NodeStatus>? _clusterNodes;
private volatile bool _isActiveNode;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Initializes the collector with the clock used to stamp each report's
/// timestamp.
/// </summary>
/// <param name="timeProvider">
/// Clock source for report timestamps. When <c>null</c> or omitted,
/// <see cref="TimeProvider.System"/> is used; tests can supply their own
/// provider to get deterministic timestamps.
/// </param>
public SiteHealthCollector(TimeProvider? timeProvider = null)
    => _timeProvider = timeProvider ?? TimeProvider.System;
/// <summary>
/// Increment the script error counter. Covers unhandled exceptions,
@@ -148,7 +160,7 @@ public class SiteHealthCollector : ISiteHealthCollector
return new SiteHealthReport(
SiteId: siteId,
SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
ReportTimestamp: DateTimeOffset.UtcNow,
ReportTimestamp: _timeProvider.GetUtcNow(),
DataConnectionStatuses: connectionStatuses,
TagResolutionCounts: tagResolution,
ScriptErrorCount: scriptErrors,