295150751f
The Test Run sandbox and Monaco analysis modelled a script API that had drifted from the site runtime's ScriptGlobals, so real scripts failed to compile in Test Run. Realign both to the runtime surface (Instance/Scripts/ExternalSystem/Attributes/Children/Parent) and drop the duplicate ScriptHost stub so the two cannot diverge again. - Script calls (Scripts.CallShared, Instance.CallScript, Route.To().Call) accept an anonymous object instead of a hand-built dictionary, via a shared ScriptArgs normalizer; existing dictionary calls still compile. - Test Run can optionally bind to a deployed instance, so Instance/Attributes/CallScript route to it cross-site; adds site-side RouteToGetAttributes/RouteToSetAttributes handlers. - Adds Test Run panels to the API method and template script editors. - Fixes the TestDatabaseQuery seed script, which queried a table that never existed. Also commits unrelated in-progress work already in the tree: the health monitoring report loop, site streaming changes, and the Admin/Design data-connection and SMTP page reorganization.
160 lines
5.7 KiB
C#
160 lines
5.7 KiB
C#
using System.Collections.Concurrent;
|
|
using Microsoft.Extensions.Hosting;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using ScadaLink.Commons.Messages.Health;
|
|
|
|
namespace ScadaLink.HealthMonitoring;
|
|
|
|
/// <summary>
/// Central-side aggregator that receives health reports from all sites,
/// tracks the latest metrics in memory, and detects offline sites.
/// No persistence — display-only for Central UI consumption.
/// </summary>
/// <remarks>
/// Every mutation of a <see cref="SiteHealthState"/> (report, heartbeat, offline check)
/// is serialized through a single private gate so that the "compare sequence number,
/// then overwrite" step is atomic. Lookups stay lock-free via the concurrent dictionary.
/// </remarks>
public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregator
{
    /// <summary>
    /// Lower bound for the offline-check period. Protects <see cref="PeriodicTimer"/>
    /// from a zero, negative or sub-millisecond period when the configured timeout is tiny.
    /// </summary>
    private static readonly TimeSpan MinimumCheckInterval = TimeSpan.FromMilliseconds(100);

    private readonly ConcurrentDictionary<string, SiteHealthState> _siteStates = new();

    /// <summary>Guards all writes to the state objects stored in <see cref="_siteStates"/>.</summary>
    private readonly object _stateGate = new();

    private readonly HealthMonitoringOptions _options;
    private readonly ILogger<CentralHealthAggregator> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates the aggregator.
    /// </summary>
    /// <param name="options">Health monitoring options; <c>OfflineTimeout</c> drives offline detection.</param>
    /// <param name="logger">Logger for registration, online/offline transitions and rejected reports.</param>
    /// <param name="timeProvider">Clock used for timestamps; defaults to <see cref="TimeProvider.System"/>.</param>
    public CentralHealthAggregator(
        IOptions<HealthMonitoringOptions> options,
        ILogger<CentralHealthAggregator> logger,
        TimeProvider? timeProvider = null)
    {
        _options = options.Value;
        _logger = logger;
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Process an incoming health report from a site.
    /// Only replaces stored state if the incoming sequence number is greater than the last received.
    /// Auto-marks previously offline sites as online.
    /// </summary>
    /// <param name="report">The report to apply.</param>
    /// <exception cref="ArgumentNullException"><paramref name="report"/> is null.</exception>
    public void ProcessReport(SiteHealthReport report)
    {
        ArgumentNullException.ThrowIfNull(report);

        var now = _timeProvider.GetUtcNow();

        // The sequence check and the field writes must happen as one unit; doing this
        // inside AddOrUpdate delegates is not atomic because those delegates run outside
        // the dictionary's locks and may be invoked more than once.
        lock (_stateGate)
        {
            if (!_siteStates.TryGetValue(report.SiteId, out var existing))
            {
                _siteStates[report.SiteId] = new SiteHealthState
                {
                    SiteId = report.SiteId,
                    LatestReport = report,
                    LastReportReceivedAt = now,
                    LastHeartbeatAt = now,
                    LastSequenceNumber = report.SequenceNumber,
                    IsOnline = true
                };

                _logger.LogInformation("Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
                return;
            }

            if (report.SequenceNumber <= existing.LastSequenceNumber)
            {
                _logger.LogDebug(
                    "Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
                    report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
                return;
            }

            var wasOffline = !existing.IsOnline;

            existing.LatestReport = report;
            existing.LastReportReceivedAt = now;
            existing.LastHeartbeatAt = now;
            existing.LastSequenceNumber = report.SequenceNumber;
            existing.IsOnline = true;

            if (wasOffline)
            {
                _logger.LogInformation("Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
            }
        }
    }

    /// <summary>
    /// Bumps the last-seen timestamp for a site already known via a prior
    /// SiteHealthReport. Heartbeats from sites we have not yet received a
    /// full report from are ignored — registration only happens on report.
    /// </summary>
    /// <param name="siteId">Identifier of the site the heartbeat came from.</param>
    /// <param name="receivedAt">Heartbeat timestamp; only moves the last-seen time forward, never back.</param>
    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
    {
        if (!_siteStates.TryGetValue(siteId, out var state))
        {
            return;
        }

        bool cameBackOnline;
        lock (_stateGate)
        {
            if (receivedAt > state.LastHeartbeatAt)
            {
                state.LastHeartbeatAt = receivedAt;
            }

            cameBackOnline = !state.IsOnline;
            if (cameBackOnline)
            {
                state.IsOnline = true;
            }
        }

        if (cameBackOnline)
        {
            _logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
        }
    }

    /// <summary>
    /// Get the current health state for all known sites.
    /// </summary>
    /// <returns>
    /// A new dictionary keyed by site id. The dictionary itself is a copy, but its values
    /// are the same state objects the aggregator keeps updating.
    /// </returns>
    public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
    {
        return new Dictionary<string, SiteHealthState>(_siteStates);
    }

    /// <summary>
    /// Get the current health state for a specific site, or null if unknown.
    /// </summary>
    /// <param name="siteId">Identifier of the site to look up.</param>
    public SiteHealthState? GetSiteState(string siteId)
    {
        return _siteStates.TryGetValue(siteId, out var state) ? state : null;
    }

    /// <summary>
    /// Background task that periodically checks for offline sites.
    /// </summary>
    /// <param name="stoppingToken">Cancels the wait for the next tick when the host stops.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Central health aggregator started, offline timeout {Timeout}s",
            _options.OfflineTimeout.TotalSeconds);

        // Check at half the offline timeout interval for timely detection
        var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);

        // PeriodicTimer rejects non-positive / sub-millisecond periods; a misconfigured
        // timeout should degrade to a sane polling rate instead of faulting the service.
        if (checkInterval < MinimumCheckInterval)
        {
            _logger.LogWarning(
                "Offline timeout {Timeout}s is too small for periodic checks; checking every {Interval}ms instead",
                _options.OfflineTimeout.TotalSeconds, MinimumCheckInterval.TotalMilliseconds);
            checkInterval = MinimumCheckInterval;
        }

        using var timer = new PeriodicTimer(checkInterval);

        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
        {
            CheckForOfflineSites();
        }
    }

    /// <summary>
    /// Marks every online site whose last heartbeat is older than the configured
    /// offline timeout as offline and logs a warning for each transition.
    /// </summary>
    internal void CheckForOfflineSites()
    {
        var now = _timeProvider.GetUtcNow();

        foreach (var kvp in _siteStates)
        {
            var state = kvp.Value;
            TimeSpan elapsed;

            lock (_stateGate)
            {
                if (!state.IsOnline)
                {
                    continue;
                }

                // Use LastHeartbeatAt — heartbeats arrive every ~5s from any
                // healthy site node, so OfflineTimeout only fires when no node
                // can reach central, not during single-node failovers.
                elapsed = now - state.LastHeartbeatAt;
                if (elapsed <= _options.OfflineTimeout)
                {
                    continue;
                }

                state.IsOnline = false;
            }

            _logger.LogWarning(
                "Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
                state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
        }
    }
}
|