fix(health-monitoring): resolve HealthMonitoring-013,014,016 — shorter-timeout cadence, options validation, injected TimeProvider; HealthMonitoring-015 left open (cross-module design decision)
This commit is contained in:
@@ -191,9 +191,10 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
"Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
|
||||
_options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);
|
||||
|
||||
// Check at half the (shorter) offline timeout interval for timely detection
|
||||
var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);
|
||||
using var timer = new PeriodicTimer(checkInterval);
|
||||
// Check at half the shorter of the two offline timeouts so detection is
|
||||
// timely for whichever site class (real or "central") has the tighter
|
||||
// window — see ComputeCheckInterval.
|
||||
using var timer = new PeriodicTimer(ComputeCheckInterval(_options));
|
||||
|
||||
while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
|
||||
{
|
||||
@@ -201,6 +202,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Derives the offline-check polling period from the configured timeouts.
/// The tighter of <see cref="HealthMonitoringOptions.OfflineTimeout"/> and
/// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> is selected and
/// then halved, so the site class with the smaller window is still checked at
/// least twice inside that window. Should <c>CentralOfflineTimeout</c> be the
/// smaller value, central detection therefore does not lag by up to
/// <c>OfflineTimeout / 2</c>.
/// </summary>
/// <param name="options">Options supplying both offline timeouts.</param>
/// <returns>Half of the smaller of the two timeouts.</returns>
internal static TimeSpan ComputeCheckInterval(HealthMonitoringOptions options)
{
    // Start from the central timeout and only switch when the site timeout is
    // strictly smaller; on a tie both values are equal, so the result is the same.
    var tighterWindow = options.CentralOfflineTimeout;
    if (options.OfflineTimeout < tighterWindow)
    {
        tighterWindow = options.OfflineTimeout;
    }

    var halfWindowMs = tighterWindow.TotalMilliseconds / 2;
    return TimeSpan.FromMilliseconds(halfWindowMs);
}
|
||||
|
||||
internal void CheckForOfflineSites()
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ScadaLink.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// HealthMonitoring-014: validates <see cref="HealthMonitoringOptions"/>. The
/// interval values are fed straight into <c>new PeriodicTimer(...)</c> (and into
/// a division for the offline-check cadence); a value outside the range
/// <see cref="PeriodicTimer"/> accepts makes its constructor throw
/// <see cref="ArgumentOutOfRangeException"/>, crashing the
/// <see cref="HealthReportSender"/> / <see cref="CentralHealthReportLoop"/> /
/// <see cref="CentralHealthAggregator"/> hosted service with an opaque exception
/// that does not name the offending config key. This validator turns a bad
/// <c>ScadaLink:HealthMonitoring</c> section into a clear, key-naming failure.
/// </summary>
/// <remarks>
/// <para>
/// <see cref="PeriodicTimer"/> truncates the period to whole milliseconds and
/// requires the result to be between 1 and 4294967294. A positive but
/// sub-millisecond interval is therefore just as fatal as zero, and is rejected
/// here as well.
/// </para>
/// <para>
/// NOTE(review): the validator is registered as an
/// <see cref="IValidateOptions{TOptions}"/> service, so it runs whenever the
/// options instance is created. Whether <c>ValidateOnStart()</c> is also applied
/// where the section is bound is not visible from this file — confirm.
/// </para>
/// </remarks>
public sealed class HealthMonitoringOptionsValidator : IValidateOptions<HealthMonitoringOptions>
{
    /// <summary>Smallest whole-millisecond period <see cref="PeriodicTimer"/> accepts.</summary>
    private const long MinTimerPeriodMs = 1;

    /// <summary>Largest whole-millisecond period <see cref="PeriodicTimer"/> accepts.</summary>
    private const long MaxTimerPeriodMs = uint.MaxValue - 1L;

    /// <summary>
    /// Checks every interval in <paramref name="options"/> and collects one
    /// message per violated rule, so all problems are reported together.
    /// </summary>
    /// <param name="name">The options instance name; not used by the checks.</param>
    /// <param name="options">The bound options to validate.</param>
    /// <returns>
    /// <see cref="ValidateOptionsResult.Success"/> when every rule holds;
    /// otherwise a failed result listing each violation.
    /// </returns>
    public ValidateOptionsResult Validate(string? name, HealthMonitoringOptions options)
    {
        var failures = new List<string>();

        if (options.ReportInterval <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:ReportInterval must be a positive duration " +
                $"(was {options.ReportInterval}); it is used directly as a PeriodicTimer period.");
        }
        else if (!IsUsableTimerPeriod(options.ReportInterval))
        {
            // Positive, but PeriodicTimer would still reject it.
            failures.Add(
                $"ScadaLink:HealthMonitoring:ReportInterval (was {options.ReportInterval}) must be between " +
                $"{MinTimerPeriodMs} ms and {MaxTimerPeriodMs} ms; it is used directly as a PeriodicTimer period.");
        }

        if (options.OfflineTimeout <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:OfflineTimeout must be a positive duration " +
                $"(was {options.OfflineTimeout}); it drives the offline-check PeriodicTimer cadence.");
        }
        else
        {
            // The offline check polls at half the timeout, so it is the halved
            // value that has to be an acceptable timer period.
            var checkInterval = TimeSpan.FromMilliseconds(options.OfflineTimeout.TotalMilliseconds / 2);
            if (!IsUsableTimerPeriod(checkInterval))
            {
                failures.Add(
                    $"ScadaLink:HealthMonitoring:OfflineTimeout (was {options.OfflineTimeout}) yields an " +
                    $"offline-check interval of {checkInterval} (half the timeout), which must be between " +
                    $"{MinTimerPeriodMs} ms and {MaxTimerPeriodMs} ms to be a valid PeriodicTimer period.");
            }
        }

        if (options.CentralOfflineTimeout <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:CentralOfflineTimeout must be a positive duration " +
                $"(was {options.CentralOfflineTimeout}).");
        }

        // The ordering rule is only meaningful once both values are individually valid.
        if (options.OfflineTimeout > TimeSpan.Zero
            && options.CentralOfflineTimeout > TimeSpan.Zero
            && options.CentralOfflineTimeout < options.OfflineTimeout)
        {
            failures.Add(
                $"ScadaLink:HealthMonitoring:CentralOfflineTimeout ({options.CentralOfflineTimeout}) " +
                $"must be >= OfflineTimeout ({options.OfflineTimeout}): the synthetic 'central' site has " +
                "no heartbeat source and is fed only by the slower self-report loop, so it needs at " +
                "least as much offline grace as a real site.");
        }

        return failures.Count > 0
            ? ValidateOptionsResult.Fail(failures)
            : ValidateOptionsResult.Success;
    }

    /// <summary>
    /// Mirrors the range check <see cref="PeriodicTimer"/> applies to its period:
    /// the value truncated to whole milliseconds must lie within
    /// <see cref="MinTimerPeriodMs"/>..<see cref="MaxTimerPeriodMs"/>.
    /// </summary>
    private static bool IsUsableTimerPeriod(TimeSpan period)
    {
        var wholeMs = (long)period.TotalMilliseconds;
        return wholeMs >= MinTimerPeriodMs && wholeMs <= MaxTimerPeriodMs;
    }
}
|
||||
@@ -1,4 +1,6 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ScadaLink.HealthMonitoring;
|
||||
|
||||
@@ -10,6 +12,7 @@ public static class ServiceCollectionExtensions
|
||||
/// </summary>
|
||||
public static IServiceCollection AddSiteHealthMonitoring(this IServiceCollection services)
|
||||
{
|
||||
AddOptionsValidation(services);
|
||||
services.AddSingleton<ISiteHealthCollector, SiteHealthCollector>();
|
||||
services.AddHostedService<HealthReportSender>();
|
||||
return services;
|
||||
@@ -21,6 +24,7 @@ public static class ServiceCollectionExtensions
|
||||
/// </summary>
|
||||
public static IServiceCollection AddHealthMonitoring(this IServiceCollection services)
{
    // Register options validation first, then the collector singleton.
    // AddSingleton hands back the same collection, so it can be returned directly.
    AddOptionsValidation(services);
    return services.AddSingleton<ISiteHealthCollector, SiteHealthCollector>();
}
|
||||
@@ -32,10 +36,27 @@ public static class ServiceCollectionExtensions
|
||||
/// </summary>
|
||||
public static IServiceCollection AddCentralHealthAggregation(this IServiceCollection services)
{
    AddOptionsValidation(services);

    // One CentralHealthAggregator singleton backs three registrations: the
    // concrete type, the ICentralHealthAggregator interface and the hosted
    // service. Registration order is kept exactly as before.
    return services
        .AddSingleton<CentralHealthAggregator>()
        .AddSingleton<ICentralHealthAggregator>(provider => provider.GetRequiredService<CentralHealthAggregator>())
        .AddHostedService(provider => provider.GetRequiredService<CentralHealthAggregator>())
        .AddHostedService<CentralHealthReportLoop>();
}
|
||||
|
||||
/// <summary>
/// HealthMonitoring-014: adds <see cref="HealthMonitoringOptionsValidator"/> as an
/// <see cref="IValidateOptions{TOptions}"/> for <see cref="HealthMonitoringOptions"/>.
/// A misconfigured <c>ScadaLink:HealthMonitoring</c> section (zero/negative
/// intervals, or a <c>CentralOfflineTimeout</c> shorter than <c>OfflineTimeout</c>)
/// is then rejected with a clear, key-naming message when the hosted services
/// resolve their options at startup, instead of failing later inside a
/// <see cref="PeriodicTimer"/> constructor with an opaque
/// <see cref="ArgumentOutOfRangeException"/>.
/// </summary>
/// <remarks>
/// Uses <c>TryAddEnumerable</c>, so calling more than one of the registration
/// methods above does not register the validator twice.
/// </remarks>
private static void AddOptionsValidation(IServiceCollection services)
{
    var validatorDescriptor =
        ServiceDescriptor.Singleton<IValidateOptions<HealthMonitoringOptions>, HealthMonitoringOptionsValidator>();

    services.TryAddEnumerable(validatorDescriptor);
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,18 @@ public class SiteHealthCollector : ISiteHealthCollector
|
||||
private volatile string _nodeHostname = "";
|
||||
private volatile IReadOnlyList<Commons.Messages.Health.NodeStatus>? _clusterNodes;
|
||||
private volatile bool _isActiveNode;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>
/// Initializes the collector with the clock used to stamp report timestamps.
/// </summary>
/// <param name="timeProvider">
/// Clock to read the current UTC time from. When omitted or <c>null</c>,
/// <see cref="TimeProvider.System"/> is used. Tests can pass their own provider
/// to make the report timestamp deterministic.
/// </param>
public SiteHealthCollector(TimeProvider? timeProvider = null)
    => _timeProvider = timeProvider ?? TimeProvider.System;
|
||||
|
||||
/// <summary>
|
||||
/// Increment the script error counter. Covers unhandled exceptions,
|
||||
@@ -148,7 +160,7 @@ public class SiteHealthCollector : ISiteHealthCollector
|
||||
return new SiteHealthReport(
|
||||
SiteId: siteId,
|
||||
SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
|
||||
ReportTimestamp: DateTimeOffset.UtcNow,
|
||||
ReportTimestamp: _timeProvider.GetUtcNow(),
|
||||
DataConnectionStatuses: connectionStatuses,
|
||||
TagResolutionCounts: tagResolution,
|
||||
ScriptErrorCount: scriptErrors,
|
||||
|
||||
Reference in New Issue
Block a user