refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,263 @@
+using System.Collections.Concurrent;
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Central-side aggregator that receives health reports from all sites,
+/// tracks latest metrics in memory, and detects offline sites.
+/// No persistence — display-only for Central UI consumption.
+/// </summary>
+public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregator
+{
+    private readonly ConcurrentDictionary<string, SiteHealthState> _siteStates = new();
+    private readonly HealthMonitoringOptions _options;
+    private readonly ILogger<CentralHealthAggregator> _logger;
+    private readonly TimeProvider _timeProvider;
+
+    /// <summary>
+    /// Builds an aggregator that starts with no tracked sites.
+    /// </summary>
+    /// <param name="options">Source of the offline timeouts; the options value is read once, here.</param>
+    /// <param name="logger">Sink for registration, stale-report and online/offline transition messages.</param>
+    /// <param name="timeProvider">Clock used for receive timestamps and offline checks; <see cref="TimeProvider.System"/> when omitted.</param>
+    public CentralHealthAggregator(
+        IOptions<HealthMonitoringOptions> options,
+        ILogger<CentralHealthAggregator> logger,
+        TimeProvider? timeProvider = null)
+        => (_options, _logger, _timeProvider) = (options.Value, logger, timeProvider ?? TimeProvider.System);
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// Optimistic update loop: read the current entry, build its replacement,
+    /// publish it with <c>TryAdd</c>/<c>TryUpdate</c>, and start over whenever
+    /// another thread changed the entry in between. A report whose sequence
+    /// number is not strictly greater than the stored one is discarded.
+    /// </remarks>
+    public void ProcessReport(SiteHealthReport report)
+    {
+        // One receive timestamp for the whole call, however many retries it takes.
+        var receivedAt = _timeProvider.GetUtcNow();
+        var siteId = report.SiteId;
+        var incomingSeq = report.SequenceNumber;
+
+        for (;;)
+        {
+            if (_siteStates.TryGetValue(siteId, out var current))
+            {
+                // Known site: anything at or below the stored sequence is stale.
+                if (incomingSeq <= current.LastSequenceNumber)
+                {
+                    _logger.LogDebug(
+                        "Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
+                        siteId, incomingSeq, current.LastSequenceNumber);
+                    return;
+                }
+
+                var refreshed = current with
+                {
+                    LatestReport = report,
+                    LastReportReceivedAt = receivedAt,
+                    LastHeartbeatAt = receivedAt,
+                    LastSequenceNumber = incomingSeq,
+                    IsOnline = true
+                };
+
+                if (!_siteStates.TryUpdate(siteId, refreshed, current))
+                {
+                    // The entry changed under us; go round again with the fresh value.
+                    continue;
+                }
+
+                if (!current.IsOnline)
+                {
+                    _logger.LogInformation(
+                        "Site {SiteId} is back online (seq #{Seq})", siteId, incomingSeq);
+                }
+
+                return;
+            }
+
+            // First contact with this site: try to insert a brand-new entry.
+            var firstSeen = new SiteHealthState
+            {
+                SiteId = siteId,
+                LatestReport = report,
+                LastReportReceivedAt = receivedAt,
+                LastHeartbeatAt = receivedAt,
+                LastSequenceNumber = incomingSeq,
+                IsOnline = true
+            };
+
+            if (_siteStates.TryAdd(siteId, firstSeen))
+            {
+                _logger.LogInformation(
+                    "Site {SiteId} registered with sequence #{Seq}", siteId, incomingSeq);
+                return;
+            }
+
+            // Another thread registered the site first; the next pass treats
+            // this report as an update to that entry.
+        }
+    }
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// Uses the same optimistic read/replace/retry loop as
+    /// <see cref="ProcessReport"/>. The stored heartbeat never moves backwards,
+    /// and an offline site is only brought back online with a heartbeat that is
+    /// at least as recent as the aggregator's own clock.
+    /// </remarks>
+    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
+    {
+        for (;;)
+        {
+            if (!_siteStates.TryGetValue(siteId, out var current))
+            {
+                // Unknown site: register it as online with no report yet.
+                // LatestReport and LastReportReceivedAt are left null until
+                // ProcessReport runs, so "no report yet" is an explicit state
+                // rather than a sentinel date the UI would have to special-case.
+                var placeholder = new SiteHealthState
+                {
+                    SiteId = siteId,
+                    LatestReport = null,
+                    LastReportReceivedAt = null,
+                    LastHeartbeatAt = receivedAt,
+                    LastSequenceNumber = 0,
+                    IsOnline = true
+                };
+
+                if (!_siteStates.TryAdd(siteId, placeholder))
+                {
+                    // Someone else registered it first; handle this heartbeat as an update.
+                    continue;
+                }
+
+                _logger.LogInformation(
+                    "Site {SiteId} registered online via heartbeat (awaiting first report)", siteId);
+                return;
+            }
+
+            var wasOnline = current.IsOnline;
+
+            // HealthMonitoring-020: the new heartbeat is the later of receivedAt
+            // and a "floor". For an online site the floor is the stored
+            // heartbeat, so a late or out-of-order heartbeat cannot rewind it.
+            // For an offline site the floor is the current clock reading: an
+            // offline-to-online transition backed by a stale timestamp would be
+            // flipped straight back to offline by the next CheckForOfflineSites.
+            DateTimeOffset floor = wasOnline ? current.LastHeartbeatAt : _timeProvider.GetUtcNow();
+            var heartbeat = receivedAt > floor ? receivedAt : floor;
+
+            // Already online with the same heartbeat: skip the pointless swap.
+            if (wasOnline && heartbeat == current.LastHeartbeatAt)
+            {
+                return;
+            }
+
+            var refreshed = current with
+            {
+                LastHeartbeatAt = heartbeat,
+                IsOnline = true
+            };
+
+            if (!_siteStates.TryUpdate(siteId, refreshed, current))
+            {
+                // Entry changed concurrently; retry against the fresh value.
+                continue;
+            }
+
+            if (!wasOnline)
+            {
+                _logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
+            }
+
+            return;
+        }
+    }
+
+    /// <inheritdoc />
+    /// <remarks>The returned dictionary is a detached copy; later updates to the aggregator do not show up in it.</remarks>
+    public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
+    {
+        var snapshot = new Dictionary<string, SiteHealthState>();
+        foreach (var (siteId, state) in _siteStates)
+        {
+            snapshot.Add(siteId, state);
+        }
+
+        return snapshot;
+    }
+
+    /// <inheritdoc />
+    public SiteHealthState? GetSiteState(string siteId)
+        => _siteStates.TryGetValue(siteId, out var tracked) ? tracked : null;
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// Runs <see cref="CheckForOfflineSites"/> on every tick of a timer whose
+    /// period comes from <see cref="ComputeCheckInterval"/>, until the timer
+    /// stops or <paramref name="stoppingToken"/> is cancelled.
+    /// </remarks>
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        var siteTimeout = _options.OfflineTimeout;
+        var centralTimeout = _options.CentralOfflineTimeout;
+        _logger.LogInformation(
+            "Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
+            siteTimeout.TotalSeconds, centralTimeout.TotalSeconds);
+
+        // Half the shorter of the two timeouts, so whichever site class has the
+        // tighter window is still checked in good time.
+        var checkInterval = ComputeCheckInterval(_options);
+        using var ticker = new PeriodicTimer(checkInterval);
+
+        while (true)
+        {
+            var ticked = await ticker.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false);
+            if (!ticked)
+            {
+                return;
+            }
+
+            CheckForOfflineSites();
+        }
+    }
+
+    /// <summary>
+    /// Returns the period of the offline-check timer: one half of whichever is
+    /// smaller, <see cref="HealthMonitoringOptions.OfflineTimeout"/> or
+    /// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/>. Basing the
+    /// cadence on the smaller value means the site class with the tighter window
+    /// is polled at least twice inside it, so a <c>CentralOfflineTimeout</c>
+    /// configured below <c>OfflineTimeout</c> does not have its detection delayed
+    /// by up to <c>OfflineTimeout / 2</c>.
+    /// </summary>
+    /// <param name="options">Options carrying the two timeouts.</param>
+    /// <returns>Half of the smaller timeout; the value is not clamped or otherwise validated here.</returns>
+    internal static TimeSpan ComputeCheckInterval(HealthMonitoringOptions options)
+    {
+        var siteTimeout = options.OfflineTimeout;
+        var centralTimeout = options.CentralOfflineTimeout;
+
+        var tighter = centralTimeout <= siteTimeout ? centralTimeout : siteTimeout;
+        var halfInMilliseconds = tighter.TotalMilliseconds / 2;
+
+        return TimeSpan.FromMilliseconds(halfInMilliseconds);
+    }
+
+    /// <summary>
+    /// Walks every tracked site and flips to offline each online site whose last
+    /// heartbeat is older than the timeout that applies to it.
+    /// </summary>
+    internal void CheckForOfflineSites()
+    {
+        var checkedAt = _timeProvider.GetUtcNow();
+
+        foreach (var (siteId, snapshot) in _siteStates)
+        {
+            if (!snapshot.IsOnline)
+            {
+                continue;
+            }
+
+            // Silence is measured from LastHeartbeatAt. Real sites refresh it
+            // with heartbeats every ~5s (cadence owned by Cluster Infrastructure /
+            // SiteCommunicationActor — CommunicationOptions.TransportHeartbeatInterval),
+            // so the 60s OfflineTimeout rides out several missed heartbeats and
+            // trips only when no node can reach central, not on a single-node
+            // failover. The synthetic "central" site has no heartbeat source —
+            // only the 30s CentralHealthReportLoop self-report — and is therefore
+            // given the longer CentralOfflineTimeout so one skipped or late
+            // self-report does not take it offline.
+            TimeSpan allowedSilence;
+            if (siteId == CentralHealthReportLoop.CentralSiteId)
+            {
+                allowedSilence = _options.CentralOfflineTimeout;
+            }
+            else
+            {
+                allowedSilence = _options.OfflineTimeout;
+            }
+
+            var silence = checkedAt - snapshot.LastHeartbeatAt;
+
+            // Swap in an offline copy only if the entry is still the one we
+            // examined. Losing the swap means a report or heartbeat arrived in
+            // the meantime, so leaving the site online is right — no retry.
+            if (silence > allowedSilence
+                && _siteStates.TryUpdate(siteId, snapshot with { IsOnline = false }, snapshot))
+            {
+                _logger.LogWarning(
+                    "Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
+                    snapshot.SiteId, silence.TotalSeconds, allowedSilence.TotalSeconds);
+            }
+        }
+    }
+}
@@ -0,0 +1,144 @@
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Central-side counterpart to <see cref="HealthReportSender"/>.
+/// Periodically builds a SiteHealthReport for the central cluster itself
+/// (siteId = <see cref="CentralSiteId"/>) and feeds it into the local
+/// CentralHealthAggregator so the UI can render central as another card
+/// on /monitoring/health. Only the cluster leader (Primary) generates
+/// reports — the standby's aggregator catches up on failover when it
+/// becomes Primary and starts its own loop.
+/// </summary>
+public class CentralHealthReportLoop : BackgroundService
+{
+    /// <summary>
+    /// Reserved siteId used to represent the central cluster in the
+    /// shared CentralHealthAggregator keyspace.
+    ///
+    /// HealthMonitoring-021: the value is prefixed with <c>$</c> — a character
+    /// that is forbidden in real site identifiers (the configuration /
+    /// repository layer only permits Sites whose <c>SiteIdentifier</c> is a
+    /// plain identifier) — so the synthetic central entry cannot collide with
+    /// a real site whose operator-set identifier happened to be the bare word
+    /// "central". A collision would have caused the two reports to clobber
+    /// each other in the aggregator keyspace via the sequence-number guard,
+    /// and the real site would inherit the longer
+    /// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> grace and
+    /// stay falsely-online for an extra two minutes after going down.
+    /// Consumers (<see cref="CentralHealthAggregator.CheckForOfflineSites"/>,
+    /// the Central UI health dashboard) reference this constant rather than
+    /// the literal string, so the change is local.
+    /// </summary>
+    public const string CentralSiteId = "$central";
+
+    private readonly ISiteHealthCollector _collector;
+    private readonly ICentralHealthAggregator _aggregator;
+    private readonly IClusterNodeProvider _clusterNodeProvider;
+    private readonly HealthMonitoringOptions _options;
+    private readonly ILogger<CentralHealthReportLoop> _logger;
+
+    // Seeded with Unix-ms so reports from a newly-elected central leader
+    // always sort after reports from any prior leader for the reserved
+    // CentralSiteId ("$central"). The clock is read through the injected
+    // TimeProvider so the seeding is deterministically testable.
+    private long _sequenceNumber;
+
+    /// <summary>
+    /// Wires up the loop's collaborators and seeds the sequence counter with the
+    /// current Unix time in milliseconds.
+    /// </summary>
+    /// <param name="collector">Collector that produces the central node's health report.</param>
+    /// <param name="aggregator">Local aggregator each self-report is handed to.</param>
+    /// <param name="clusterNodeProvider">Tells whether this node is primary and lists the cluster nodes.</param>
+    /// <param name="options">Health monitoring options; the report interval is the loop period.</param>
+    /// <param name="logger">Logger for loop diagnostics.</param>
+    /// <param name="timeProvider">Clock read once for the sequence seed; <see cref="TimeProvider.System"/> when omitted.</param>
+    public CentralHealthReportLoop(
+        ISiteHealthCollector collector,
+        ICentralHealthAggregator aggregator,
+        IClusterNodeProvider clusterNodeProvider,
+        IOptions<HealthMonitoringOptions> options,
+        ILogger<CentralHealthReportLoop> logger,
+        TimeProvider? timeProvider = null)
+    {
+        (_collector, _aggregator, _clusterNodeProvider) = (collector, aggregator, clusterNodeProvider);
+        _options = options.Value;
+        _logger = logger;
+
+        var clock = timeProvider ?? TimeProvider.System;
+        _sequenceNumber = clock.GetUtcNow().ToUnixTimeMilliseconds();
+    }
+
+    /// <summary>
+    /// Atomically reads the sequence counter's current value (exposed for testing).
+    /// </summary>
+    public long CurrentSequenceNumber
+    {
+        get { return Interlocked.Read(ref _sequenceNumber); }
+    }
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// On every tick of a timer with period
+    /// <see cref="HealthMonitoringOptions.ReportInterval"/> the loop publishes the
+    /// node's primary/standby status to the collector and, when primary, collects
+    /// a report under <see cref="CentralSiteId"/>, stamps it with the next
+    /// sequence number and hands it to the aggregator. A failure in one tick is
+    /// logged and the loop carries on with the next tick.
+    /// </remarks>
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        _logger.LogInformation(
+            "Central health report loop starting, interval {Interval}s",
+            _options.ReportInterval.TotalSeconds);
+
+        using var timer = new PeriodicTimer(_options.ReportInterval);
+
+        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
+        {
+            try
+            {
+                var isPrimary = _clusterNodeProvider.SelfIsPrimary;
+                _collector.SetActiveNode(isPrimary);
+
+                // Only the primary generates the central self-report.
+                if (!isPrimary)
+                    continue;
+
+                try
+                {
+                    _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());
+                }
+                catch (Exception ex)
+                {
+                    // Non-fatal, matching HealthReportSender: the report ships
+                    // with the previous cluster node list instead of being
+                    // skipped for this tick. Logged so a persistent failure is
+                    // diagnosable.
+                    _logger.LogWarning(ex,
+                        "Failed to refresh cluster nodes for central health report; using stale list");
+                }
+
+                var seq = Interlocked.Increment(ref _sequenceNumber);
+
+                // HealthMonitoring-018: CollectReport atomically read-and-resets
+                // the per-interval error counters via Interlocked.Exchange. If
+                // ProcessReport throws (or any other failure occurs between the
+                // collect and the publish), those counts would otherwise be
+                // lost — neither in the un-published report nor in the
+                // now-zeroed collector. Snapshot the freshly-collected report
+                // so that on a publish failure we can atomically restore the
+                // counts back into the shared SiteHealthCollector via
+                // Interlocked.Add. Concurrent increments arriving during the
+                // ProcessReport call are preserved on the counter; the restore
+                // Add safely sums with any such concurrent increments. Same
+                // shape as the HealthMonitoring-017 fix in HealthReportSender.
+                var report = _collector.CollectReport(CentralSiteId);
+                var reportWithSeq = report with { SequenceNumber = seq };
+
+                try
+                {
+                    _aggregator.ProcessReport(reportWithSeq);
+                }
+                catch
+                {
+                    // Restore the captured per-interval counters atomically so
+                    // they roll forward into the next report — see
+                    // HealthMonitoring-018.
+                    _collector.AddIntervalCounters(
+                        scriptErrors: report.ScriptErrorCount,
+                        alarmErrors: report.AlarmEvaluationErrorCount,
+                        deadLetters: report.DeadLetterCount,
+                        siteAuditWriteFailures: report.SiteAuditWriteFailures,
+                        auditRedactionFailures: report.AuditRedactionFailure);
+                    throw;
+                }
+
+                _logger.LogDebug("Generated central health report #{Seq}", seq);
+            }
+            catch (Exception ex)
+            {
+                // Keep the loop alive; the next tick tries again.
+                _logger.LogError(ex, "Failed to generate central health report");
+            }
+        }
+    }
+}
@@ -0,0 +1,22 @@
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+public class HealthMonitoringOptions
+{
+    /// <summary>Interval at which sites emit health reports to the central cluster; also the period of the central self-report loop. Default: 30 seconds.</summary>
+    public TimeSpan ReportInterval { get; set; } = TimeSpan.FromSeconds(30);
+    /// <summary>Duration of silence after which a site is classified as offline. Default: 1 minute.</summary>
+    public TimeSpan OfflineTimeout { get; set; } = TimeSpan.FromMinutes(1);
+
+    /// <summary>
+    /// Offline timeout applied to the synthetic "central" site only. Real sites
+    /// emit frequent heartbeats that keep <c>LastHeartbeatAt</c> fresh, so the
+    /// normal <see cref="OfflineTimeout"/> only fires on genuine total loss. The
+    /// "central" self-report has no heartbeat source — its only signal is the
+    /// 30s <see cref="CentralHealthReportLoop"/>, so a single skipped/late
+    /// self-report (leader GC pause, brief stall, mid-failover before the new
+    /// leader's loop spins up) would flap it offline under the 60s site timeout.
+    /// A longer central grace gives the equivalent of "one missed report" that
+    /// the design doc grants real sites. Default: 3 minutes — 6x the default 30s report interval and 3x the default <see cref="OfflineTimeout"/>.
+    /// </summary>
+    public TimeSpan CentralOfflineTimeout { get; set; } = TimeSpan.FromMinutes(3);
+}
@@ -0,0 +1,64 @@
+using Microsoft.Extensions.Options;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// HealthMonitoring-014: validates <see cref="HealthMonitoringOptions"/> at
+/// startup. The interval values are fed straight into <c>new PeriodicTimer(...)</c>
+/// (and into a division for the offline-check cadence). <see cref="PeriodicTimer"/>'s
+/// constructor throws <see cref="ArgumentOutOfRangeException"/> for any period
+/// whose whole-millisecond value lies outside 1..4294967294 — zero, negative,
+/// sub-millisecond, or longer than roughly 49.7 days — crashing the
+/// <see cref="HealthReportSender"/> / <see cref="CentralHealthReportLoop"/> /
+/// <see cref="CentralHealthAggregator"/> hosted service with an opaque exception
+/// that does not name the offending config key. Registered with
+/// <c>ValidateOnStart()</c> so a bad <c>ScadaBridge:HealthMonitoring</c> section
+/// fails fast at boot with a clear, key-naming message.
+/// </summary>
+public sealed class HealthMonitoringOptionsValidator : IValidateOptions<HealthMonitoringOptions>
+{
+    // Inclusive bounds, in whole milliseconds, of the periods PeriodicTimer accepts.
+    private const long MinTimerPeriodMs = 1;
+    private const long MaxTimerPeriodMs = uint.MaxValue - 1;
+
+    /// <summary>
+    /// Validates the health monitoring options. Fails when any duration is
+    /// non-positive, when <c>CentralOfflineTimeout</c> is shorter than
+    /// <c>OfflineTimeout</c>, or when a value that ends up as a
+    /// <see cref="PeriodicTimer"/> period (the report interval, or half the
+    /// shorter offline timeout) is outside the range the timer supports.
+    /// </summary>
+    /// <param name="name">Named options instance name (unused).</param>
+    /// <param name="options">The health monitoring options to validate.</param>
+    /// <returns>
+    /// <see cref="ValidateOptionsResult.Success"/> when every check passes;
+    /// otherwise a failure result carrying one message per violated rule.
+    /// </returns>
+    public ValidateOptionsResult Validate(string? name, HealthMonitoringOptions options)
+    {
+        var failures = new List<string>();
+
+        if (options.ReportInterval <= TimeSpan.Zero)
+        {
+            failures.Add(
+                $"ScadaBridge:HealthMonitoring:ReportInterval must be a positive duration " +
+                $"(was {options.ReportInterval}); it is used directly as a PeriodicTimer period.");
+        }
+        else if (!IsSupportedTimerPeriod(options.ReportInterval))
+        {
+            // Positive, but still a value the PeriodicTimer constructor rejects.
+            failures.Add(
+                $"ScadaBridge:HealthMonitoring:ReportInterval (was {options.ReportInterval}) must be between " +
+                $"{MinTimerPeriodMs} ms and {MaxTimerPeriodMs} ms; it is used directly as a PeriodicTimer period.");
+        }
+
+        if (options.OfflineTimeout <= TimeSpan.Zero)
+        {
+            failures.Add(
+                $"ScadaBridge:HealthMonitoring:OfflineTimeout must be a positive duration " +
+                $"(was {options.OfflineTimeout}); it drives the offline-check PeriodicTimer cadence.");
+        }
+
+        if (options.CentralOfflineTimeout <= TimeSpan.Zero)
+        {
+            failures.Add(
+                $"ScadaBridge:HealthMonitoring:CentralOfflineTimeout must be a positive duration " +
+                $"(was {options.CentralOfflineTimeout}).");
+        }
+
+        // The remaining rules compare or combine the two timeouts, so they only
+        // make sense once both are individually positive.
+        if (options.OfflineTimeout > TimeSpan.Zero && options.CentralOfflineTimeout > TimeSpan.Zero)
+        {
+            if (options.CentralOfflineTimeout < options.OfflineTimeout)
+            {
+                failures.Add(
+                    $"ScadaBridge:HealthMonitoring:CentralOfflineTimeout ({options.CentralOfflineTimeout}) " +
+                    $"must be >= OfflineTimeout ({options.OfflineTimeout}): the synthetic 'central' site has " +
+                    "no heartbeat source and is fed only by the slower self-report loop, so it needs at " +
+                    "least as much offline grace as a real site.");
+            }
+
+            // Same arithmetic as CentralHealthAggregator.ComputeCheckInterval:
+            // the offline-check timer runs at half the shorter timeout, and that
+            // derived value is what PeriodicTimer actually receives.
+            var shorter = options.OfflineTimeout < options.CentralOfflineTimeout
+                ? options.OfflineTimeout
+                : options.CentralOfflineTimeout;
+            var checkInterval = TimeSpan.FromMilliseconds(shorter.TotalMilliseconds / 2);
+
+            if (!IsSupportedTimerPeriod(checkInterval))
+            {
+                failures.Add(
+                    $"ScadaBridge:HealthMonitoring:OfflineTimeout ({options.OfflineTimeout}) and " +
+                    $"CentralOfflineTimeout ({options.CentralOfflineTimeout}) yield an offline-check " +
+                    $"period of {checkInterval} (half the shorter timeout), which must be between " +
+                    $"{MinTimerPeriodMs} ms and {MaxTimerPeriodMs} ms to be usable as a PeriodicTimer period.");
+            }
+        }
+
+        return failures.Count > 0
+            ? ValidateOptionsResult.Fail(failures)
+            : ValidateOptionsResult.Success;
+    }
+
+    /// <summary>
+    /// Tells whether <paramref name="period"/> can be passed to the
+    /// <see cref="PeriodicTimer"/> constructor: truncated to whole milliseconds
+    /// it must fall within the inclusive range 1..4294967294.
+    /// </summary>
+    private static bool IsSupportedTimerPeriod(TimeSpan period)
+    {
+        var wholeMilliseconds = (long)period.TotalMilliseconds;
+        return wholeMilliseconds >= MinTimerPeriodMs && wholeMilliseconds <= MaxTimerPeriodMs;
+    }
+}
@@ -0,0 +1,187 @@
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.StoreAndForward;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Periodically collects a SiteHealthReport and sends it to central via Akka remoting.
+/// Sequence numbers are monotonic and reset on service restart. They are <b>not</b>
+/// zero/one-based: the per-process counter is seeded with the current Unix epoch
+/// (milliseconds) at construction so that, after a failover, reports from a
+/// freshly-active node always sort after reports from any prior active node for the
+/// same site — otherwise the central aggregator's sequence-number guard would
+/// silently reject the new active's first reports as stale.
+/// </summary>
+public class HealthReportSender : BackgroundService
+{
+    private readonly ISiteHealthCollector _collector;
+    private readonly IHealthReportTransport _transport;
+    private readonly HealthMonitoringOptions _options;
+    private readonly ILogger<HealthReportSender> _logger;
+    private readonly string _siteId;
+    private readonly StoreAndForwardStorage? _sfStorage;
+    private readonly IClusterNodeProvider? _clusterNodeProvider;
+
+    // Seeded with Unix-ms at construction so reports from a freshly-active
+    // node always sort after reports from any prior active node for the same
+    // site. Without this seeding, failover would silently drop the new
+    // active's first reports because their per-process counter starts below
+    // the prior active's last sequence number. The clock is read through the
+    // injected TimeProvider so the seeding is deterministically testable.
+    private long _sequenceNumber;
+
+    /// <summary>
+    /// Captures the sender's collaborators and the site identifier, and seeds the
+    /// sequence counter with the current Unix time in milliseconds.
+    /// </summary>
+    /// <param name="collector">Collector that produces the report payload.</param>
+    /// <param name="transport">Transport that delivers each report to central.</param>
+    /// <param name="options">Health monitoring options; the report interval is the send period.</param>
+    /// <param name="logger">Logger for sender diagnostics.</param>
+    /// <param name="siteIdentityProvider">Source of the site identifier stamped on every report; read once, here.</param>
+    /// <param name="sfStorage">Optional store-and-forward storage queried for parked-message count and buffer depths.</param>
+    /// <param name="clusterNodeProvider">Optional provider of the cluster node list included in reports.</param>
+    /// <param name="timeProvider">Clock read once for the sequence seed; <see cref="TimeProvider.System"/> when omitted.</param>
+    public HealthReportSender(
+        ISiteHealthCollector collector,
+        IHealthReportTransport transport,
+        IOptions<HealthMonitoringOptions> options,
+        ILogger<HealthReportSender> logger,
+        ISiteIdentityProvider siteIdentityProvider,
+        StoreAndForwardStorage? sfStorage = null,
+        IClusterNodeProvider? clusterNodeProvider = null,
+        TimeProvider? timeProvider = null)
+    {
+        (_collector, _transport) = (collector, transport);
+        _options = options.Value;
+        _logger = logger;
+        _siteId = siteIdentityProvider.SiteId;
+        (_sfStorage, _clusterNodeProvider) = (sfStorage, clusterNodeProvider);
+
+        var clock = timeProvider ?? TimeProvider.System;
+        _sequenceNumber = clock.GetUtcNow().ToUnixTimeMilliseconds();
+    }
+
+    /// <summary>
+    /// Atomically reads the sequence counter's current value (exposed for testing).
+    /// </summary>
+    public long CurrentSequenceNumber
+    {
+        get { return Interlocked.Read(ref _sequenceNumber); }
+    }
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// On every tick of a timer with period
+    /// <see cref="HealthMonitoringOptions.ReportInterval"/>, and only while the
+    /// collector reports this node as active, the sender refreshes the optional
+    /// cluster-node and store-and-forward metrics, then collects and sends one
+    /// report. A failure in one tick is logged and the loop carries on.
+    /// </remarks>
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        _logger.LogInformation(
+            "Health report sender starting for site {SiteId}, interval {Interval}s",
+            _siteId, _options.ReportInterval.TotalSeconds);
+
+        using var timer = new PeriodicTimer(_options.ReportInterval);
+
+        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
+        {
+            try
+            {
+                // Only the active node (running the DeploymentManager singleton) sends health reports.
+                // The standby node has no instance/connection data and would overwrite the active's report.
+                if (!_collector.IsActiveNode)
+                    continue;
+
+                RefreshClusterNodes();
+                await RefreshStoreAndForwardMetricsAsync().ConfigureAwait(false);
+
+                var seq = SendNextReport();
+
+                _logger.LogInformation("Sent health report #{Seq} for site {SiteId}", seq, _siteId);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex, "Failed to send health report for site {SiteId}", _siteId);
+                // Continue sending — don't let a single failure stop reporting
+            }
+        }
+    }
+
+    /// <summary>Pushes the current cluster node list into the collector; does nothing when no provider was supplied.</summary>
+    private void RefreshClusterNodes()
+    {
+        if (_clusterNodeProvider == null)
+            return;
+
+        try
+        {
+            _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());
+        }
+        catch (Exception ex)
+        {
+            // Non-fatal — the report ships with the previous cluster
+            // node list. Logged so a persistent failure is diagnosable.
+            _logger.LogWarning(ex,
+                "Failed to refresh cluster nodes for health report (site {SiteId}); using stale list",
+                _siteId);
+        }
+    }
+
+    /// <summary>Pushes the parked-message count and per-category buffer depths into the collector; does nothing when no storage was supplied.</summary>
+    private async Task RefreshStoreAndForwardMetricsAsync()
+    {
+        if (_sfStorage == null)
+            return;
+
+        // The two queries are guarded separately so one failing does not
+        // prevent the other metric from being refreshed.
+        try
+        {
+            var parkedCount = await _sfStorage.GetParkedMessageCountAsync().ConfigureAwait(false);
+            _collector.SetParkedMessageCount(parkedCount);
+        }
+        catch (Exception ex)
+        {
+            // Non-fatal — the report is still sent, without a refreshed parked count.
+            _logger.LogWarning(ex,
+                "Failed to query parked message count for health report (site {SiteId})",
+                _siteId);
+        }
+
+        try
+        {
+            // Per-category pending-message buffer depths (the documented
+            // "store-and-forward buffer depth" triage metric). Keyed by
+            // StoreAndForwardCategory name so the central dashboard can
+            // render external/notification/DB-write depths separately.
+            var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync().ConfigureAwait(false);
+            var depths = depthsByCategory.ToDictionary(
+                kvp => kvp.Key.ToString(),
+                kvp => kvp.Value);
+            _collector.SetStoreAndForwardDepths(depths);
+        }
+        catch (Exception ex)
+        {
+            // Non-fatal — the report is still sent, without refreshed buffer depths.
+            _logger.LogWarning(ex,
+                "Failed to query store-and-forward buffer depths for health report (site {SiteId})",
+                _siteId);
+        }
+    }
+
+    /// <summary>Collects one report, stamps it with the next sequence number, sends it, and returns that sequence number.</summary>
+    private long SendNextReport()
+    {
+        var seq = Interlocked.Increment(ref _sequenceNumber);
+
+        // HealthMonitoring-017: CollectReport atomically read-and-resets
+        // the per-interval error counters via Interlocked.Exchange. If
+        // the Send below throws, those counts are otherwise lost
+        // forever — neither in the un-sent report nor in the now-zeroed
+        // collector. Snapshot the freshly-collected report so that on a
+        // transport failure we can atomically restore the counts back
+        // into the collector via Interlocked.Add, so the next
+        // successful report includes them. Concurrent increments
+        // arriving during the Send are preserved on the counter (they
+        // accumulate against zero); the restore Add safely sums with
+        // any such concurrent increments.
+        var report = _collector.CollectReport(_siteId);
+
+        // Replace the placeholder sequence number with our monotonic one
+        var reportWithSeq = report with { SequenceNumber = seq };
+
+        try
+        {
+            _transport.Send(reportWithSeq);
+        }
+        catch
+        {
+            // Restore the captured per-interval counters atomically so
+            // they roll forward into the next report — see
+            // HealthMonitoring-017. Any concurrent increment that
+            // arrived during the failed Send remains on the counter;
+            // Interlocked.Add sums correctly with it.
+            _collector.AddIntervalCounters(
+                scriptErrors: report.ScriptErrorCount,
+                alarmErrors: report.AlarmEvaluationErrorCount,
+                deadLetters: report.DeadLetterCount,
+                siteAuditWriteFailures: report.SiteAuditWriteFailures,
+                auditRedactionFailures: report.AuditRedactionFailure);
+            throw;
+        }
+
+        return seq;
+    }
+}
@@ -0,0 +1,40 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Central-side health aggregation contract: accepts site health reports and
+/// heartbeats, and exposes the resulting per-site state. The Central UI reads
+/// it to render the site health dashboards.
+/// </summary>
+public interface ICentralHealthAggregator
+{
+    /// <summary>
+    /// Folds a health report received from a site into the aggregated state.
+    /// </summary>
+    /// <param name="report">The health report that arrived from the site.</param>
+    void ProcessReport(SiteHealthReport report);
+
+    /// <summary>
+    /// Advances the last-seen timestamp of a site so that it stays marked
+    /// online in the gap between full 30s reports while heartbeats keep
+    /// arriving; this stops the offline threshold from firing merely because a
+    /// report was transiently delayed.
+    /// <para>
+    /// The heartbeat cadence is not owned by this module. It belongs to the
+    /// Cluster Infrastructure / <c>SiteCommunicationActor</c>, which sends the
+    /// application-level heartbeat to central every
+    /// <c>CommunicationOptions.TransportHeartbeatInterval</c> (5s by default).
+    /// The 60s <see cref="HealthMonitoringOptions.OfflineTimeout"/> therefore
+    /// tolerates several missed heartbeats.
+    /// </para>
+    /// <para>
+    /// When the aggregator holds no state for the site yet (for example right
+    /// after a central restart or failover), the heartbeat registers the site
+    /// as online with no <see cref="SiteHealthState.LatestReport"/>, so
+    /// reachable sites are not shown as "unknown" during the failover window.
+    /// </para>
+    /// </summary>
+    /// <param name="siteId">Identifier of the site the heartbeat came from.</param>
+    /// <param name="receivedAt">UTC time at which the heartbeat was received.</param>
+    void MarkHeartbeat(string siteId, DateTimeOffset receivedAt);
+
+    /// <summary>Gets a snapshot of the health state of every site currently tracked.</summary>
+    /// <returns>The tracked site states, keyed by string.</returns>
+    IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates();
+
+    /// <summary>Looks up the current health state of a single site.</summary>
+    /// <param name="siteId">Identifier of the site to look up.</param>
+    /// <returns>The state of the site, or <c>null</c> when the site is not tracked.</returns>
+    SiteHealthState? GetSiteState(string siteId);
+}
@@ -0,0 +1,20 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Source of cluster node status for health reporting. The implementation
+/// lives in the Host project, which has access to the Akka.NET actor system.
+/// </summary>
+public interface IClusterNodeProvider
+{
+    /// <summary>Gets the current status of every cluster node within the provider's role scope.</summary>
+    /// <returns>The node statuses for the provider's role scope.</returns>
+    IReadOnlyList<NodeStatus> GetClusterNodes();
+
+    /// <summary>
+    /// Gets a value indicating whether this node is, right now, the cluster
+    /// leader (Primary) for the provider's role scope. The central report loop
+    /// consults it to decide which node generates the "central" health report.
+    /// </summary>
+    bool SelfIsPrimary { get; }
+}
@@ -0,0 +1,16 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Seam through which site health reports are delivered to central. The
+/// production implementation uses Akka remoting (<c>Tell</c>, fire-and-forget).
+/// </summary>
+public interface IHealthReportTransport
+{
+    /// <summary>
+    /// Hands a health report to the transport for fire-and-forget delivery to central.
+    /// </summary>
+    /// <param name="report">The site health report to deliver.</param>
+    void Send(SiteHealthReport report);
+}
@@ -0,0 +1,172 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Site-side health metric collection contract. Site Runtime actors use it to
+/// record errors and DCL uses it to record connection health; the accumulated
+/// values are turned into a report by <see cref="CollectReport"/>.
+/// </summary>
+public interface ISiteHealthCollector
+{
+    /// <summary>
+    /// Adds one to the script error count.
+    /// </summary>
+    void IncrementScriptError();
+
+    /// <summary>
+    /// Adds one to the alarm error count.
+    /// </summary>
+    void IncrementAlarmError();
+
+    /// <summary>
+    /// Adds one to the dead letter count.
+    /// </summary>
+    void IncrementDeadLetter();
+
+    /// <summary>
+    /// Audit Log (#23) Bundle G — adds one to the per-interval count of
+    /// <c>FallbackAuditWriter</c> primary failures. Fed by the
+    /// <c>IAuditWriteFailureCounter</c> binding that
+    /// <c>AddAuditLogHealthMetricsBridge()</c> registers.
+    /// </summary>
+    void IncrementSiteAuditWriteFailures();
+
+    /// <summary>
+    /// Audit Log (#23) M5 Bundle C — adds one to the per-interval count of
+    /// payload-filter redactor over-redactions, i.e. header / body / SQL
+    /// parameter stage throws that were routed to the
+    /// <c>&lt;redacted: redactor error&gt;</c> marker. Fed by the
+    /// <c>IAuditRedactionFailureCounter</c> binding that
+    /// <c>AddAuditLogHealthMetricsBridge()</c> registers.
+    /// </summary>
+    void IncrementAuditRedactionFailure();
+
+    /// <summary>
+    /// Audit Log (#23) M6 Bundle E (T6) — replaces the most recent site-local
+    /// audit-queue backlog snapshot (pending count, oldest pending row,
+    /// on-disk file bytes). The next <see cref="CollectReport"/> call carries
+    /// whatever snapshot is current. The <c>SiteAuditBacklogReporter</c> hosted
+    /// service refreshes it periodically so every report holds a recent
+    /// point-in-time view of the site→central drain health.
+    /// </summary>
+    /// <param name="snapshot">The new audit backlog snapshot.</param>
+    void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot);
+
+    /// <summary>
+    /// Records the current health status of a data connection.
+    /// </summary>
+    /// <param name="connectionName">Name of the connection.</param>
+    /// <param name="health">Health status to record for the connection.</param>
+    void UpdateConnectionHealth(string connectionName, ConnectionHealth health);
+
+    /// <summary>
+    /// Stops tracking health for a connection.
+    /// </summary>
+    /// <param name="connectionName">Name of the connection to drop.</param>
+    void RemoveConnection(string connectionName);
+
+    /// <summary>
+    /// Records the tag resolution metrics of a connection.
+    /// </summary>
+    /// <param name="connectionName">Name of the connection.</param>
+    /// <param name="totalSubscribed">Total number of tags subscribed.</param>
+    /// <param name="successfullyResolved">Number of tags resolved successfully.</param>
+    void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved);
+
+    /// <summary>
+    /// Records the endpoint of a connection.
+    /// </summary>
+    /// <param name="connectionName">Name of the connection.</param>
+    /// <param name="endpoint">Endpoint the connection uses.</param>
+    void UpdateConnectionEndpoint(string connectionName, string endpoint);
+
+    /// <summary>
+    /// Records the tag quality metrics of a connection.
+    /// </summary>
+    /// <param name="connectionName">Name of the connection.</param>
+    /// <param name="good">Number of tags with good quality.</param>
+    /// <param name="bad">Number of tags with bad quality.</param>
+    /// <param name="uncertain">Number of tags with uncertain quality.</param>
+    void UpdateTagQuality(string connectionName, int good, int bad, int uncertain);
+
+    /// <summary>
+    /// Replaces the store-and-forward buffer depths for all categories.
+    /// </summary>
+    /// <param name="depths">Buffer depth per category name.</param>
+    void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths);
+
+    /// <summary>
+    /// Replaces the number of instances in each state.
+    /// </summary>
+    /// <param name="deployed">Number of deployed instances.</param>
+    /// <param name="enabled">Number of enabled instances.</param>
+    /// <param name="disabled">Number of disabled instances.</param>
+    void SetInstanceCounts(int deployed, int enabled, int disabled);
+
+    /// <summary>
+    /// Replaces the parked message count.
+    /// </summary>
+    /// <param name="count">Number of parked messages.</param>
+    void SetParkedMessageCount(int count);
+
+    /// <summary>
+    /// Records the hostname of this node.
+    /// </summary>
+    /// <param name="hostname">Hostname of the node.</param>
+    void SetNodeHostname(string hostname);
+
+    /// <summary>
+    /// Replaces the list of cluster nodes.
+    /// </summary>
+    /// <param name="nodes">Status of each cluster node.</param>
+    void SetClusterNodes(IReadOnlyList<Commons.Messages.Health.NodeStatus> nodes);
+
+    /// <summary>
+    /// Records whether this node is the active node of the cluster.
+    /// </summary>
+    /// <param name="isActive"><c>true</c> when this node is active; otherwise <c>false</c>.</param>
+    void SetActiveNode(bool isActive);
+
+    /// <summary>
+    /// Gets a value indicating whether this node is the active node of the cluster.
+    /// </summary>
+    bool IsActiveNode { get; }
+
+    /// <summary>
+    /// Builds a health report for a site from the metrics collected so far.
+    /// In <see cref="SiteHealthCollector"/> this call also drains the
+    /// per-interval error counters (they restart from zero), and the returned
+    /// report carries a placeholder sequence number of 0 that the caller is
+    /// expected to replace.
+    /// </summary>
+    /// <param name="siteId">Identifier of the site the report is for.</param>
+    /// <returns>The health report for the given site.</returns>
+    SiteHealthReport CollectReport(string siteId);
+
+    /// <summary>
+    /// HealthMonitoring-017: atomically puts the given per-interval error
+    /// counts back into the collector's accumulators. The report sender calls
+    /// this when transport delivery of a freshly collected report fails, so
+    /// that the counts <see cref="CollectReport"/> has already drained roll
+    /// forward into the next report instead of being silently lost. Increments
+    /// that arrive between the failed Send and this restore are kept —
+    /// <c>Interlocked.Add</c> sums correctly with them.
+    /// <para>
+    /// The default interface implementation does nothing. That lets existing
+    /// test fakes (the only implementations besides
+    /// <see cref="SiteHealthCollector"/>) keep compiling without per-fake
+    /// changes; production callers get the real behaviour from the concrete
+    /// class.
+    /// </para>
+    /// </summary>
+    /// <param name="scriptErrors">Script error count to put back.</param>
+    /// <param name="alarmErrors">Alarm evaluation error count to put back.</param>
+    /// <param name="deadLetters">Dead letter count to put back.</param>
+    /// <param name="siteAuditWriteFailures">Site audit write failure count to put back.</param>
+    /// <param name="auditRedactionFailures">Audit redaction failure count to put back.</param>
+    void AddIntervalCounters(
+        int scriptErrors,
+        int alarmErrors,
+        int deadLetters,
+        int siteAuditWriteFailures,
+        int auditRedactionFailures)
+    {
+        // Deliberately empty: test fakes inherit this no-op and need no
+        // changes, while SiteHealthCollector supplies the Interlocked.Add
+        // based restore.
+    }
+}
@@ -0,0 +1,11 @@
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Supplies the identity of the site this process belongs to. The Host
+/// component implements it to expose the configuration-driven site ID.
+/// </summary>
+public interface ISiteIdentityProvider
+{
+    /// <summary>Gets the unique identifier of this site node.</summary>
+    string SiteId { get; }
+}
@@ -0,0 +1,65 @@
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.DependencyInjection.Extensions;
+using Microsoft.Extensions.Options;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Dependency-injection registration helpers for the health monitoring module.
+/// Every registration is idempotent, so the methods can be combined (or called
+/// more than once) without leaving duplicate service descriptors behind.
+/// </summary>
+public static class ServiceCollectionExtensions
+{
+    /// <summary>
+    /// Register site-side health monitoring services (metric collection + periodic reporting).
+    /// Call this on site nodes only. For central, call AddCentralHealthAggregation() instead.
+    /// </summary>
+    /// <param name="services">The DI service collection to register into.</param>
+    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
+    public static IServiceCollection AddSiteHealthMonitoring(this IServiceCollection services)
+    {
+        ArgumentNullException.ThrowIfNull(services);
+
+        AddOptionsValidation(services);
+        // TryAdd: AddHealthMonitoring() registers the same collector, and a
+        // site host may call both methods. A plain AddSingleton would leave two
+        // descriptors for ISiteHealthCollector in the collection.
+        services.TryAddSingleton<ISiteHealthCollector, SiteHealthCollector>();
+        services.AddHostedService<HealthReportSender>();
+        return services;
+    }
+
+    /// <summary>
+    /// Register shared health monitoring services (safe for both central and site).
+    /// Does not start the HealthReportSender — call AddSiteHealthMonitoring() on site nodes for that.
+    /// </summary>
+    /// <param name="services">The DI service collection to register into.</param>
+    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
+    public static IServiceCollection AddHealthMonitoring(this IServiceCollection services)
+    {
+        ArgumentNullException.ThrowIfNull(services);
+
+        AddOptionsValidation(services);
+        // TryAdd keeps this idempotent with AddSiteHealthMonitoring().
+        services.TryAddSingleton<ISiteHealthCollector, SiteHealthCollector>();
+        return services;
+    }
+
+    /// <summary>
+    /// Register central-side health aggregation services. Includes the
+    /// <see cref="CentralHealthReportLoop"/> that generates a self-report
+    /// for the central cluster so it appears on /monitoring/health.
+    /// </summary>
+    /// <param name="services">The DI service collection to register into.</param>
+    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
+    public static IServiceCollection AddCentralHealthAggregation(this IServiceCollection services)
+    {
+        ArgumentNullException.ThrowIfNull(services);
+
+        AddOptionsValidation(services);
+        // One aggregator instance backs all three registrations: the concrete
+        // type, the ICentralHealthAggregator interface, and the hosted service.
+        // The interface and hosted-service factories both resolve the concrete
+        // singleton rather than constructing their own.
+        services.TryAddSingleton<CentralHealthAggregator>();
+        services.TryAddSingleton<ICentralHealthAggregator>(sp => sp.GetRequiredService<CentralHealthAggregator>());
+        services.AddHostedService(sp => sp.GetRequiredService<CentralHealthAggregator>());
+        services.AddHostedService<CentralHealthReportLoop>();
+        return services;
+    }
+
+    /// <summary>
+    /// HealthMonitoring-014: register the <see cref="HealthMonitoringOptionsValidator"/>
+    /// so a misconfigured <c>ScadaBridge:HealthMonitoring</c> section (zero/negative
+    /// intervals, or a <c>CentralOfflineTimeout</c> shorter than
+    /// <c>OfflineTimeout</c>) is rejected with a clear, key-naming message when the
+    /// hosted services resolve their options at startup — rather than crashing
+    /// later inside a <see cref="PeriodicTimer"/> constructor with an opaque
+    /// <see cref="ArgumentOutOfRangeException"/>. Idempotent so it is safe when
+    /// more than one of the registration methods above is called.
+    /// </summary>
+    /// <param name="services">The DI service collection to register into.</param>
+    private static void AddOptionsValidation(IServiceCollection services)
+    {
+        services.TryAddEnumerable(
+            ServiceDescriptor.Singleton<IValidateOptions<HealthMonitoringOptions>, HealthMonitoringOptionsValidator>());
+    }
+}
@@ -0,0 +1,211 @@
+using System.Collections.Concurrent;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Collects health metrics from all site subsystems and turns them into a
+/// <see cref="SiteHealthReport"/> on demand.
+/// Thread-safe: the integer counters and gauges are accessed through
+/// <see cref="Interlocked"/> / <see cref="Volatile"/>, per-connection data is
+/// held in <see cref="ConcurrentDictionary{TKey,TValue}"/> instances, and the
+/// remaining reference and boolean fields are <c>volatile</c>.
+/// </summary>
+public class SiteHealthCollector : ISiteHealthCollector
+{
+    // Per-interval counters: drained to zero by CollectReport and optionally
+    // put back by AddIntervalCounters.
+    private int _scriptErrorCount;
+    private int _alarmErrorCount;
+    private int _deadLetterCount;
+    private int _siteAuditWriteFailures;
+    private int _auditRedactionFailures;
+    private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog;
+
+    // Per-connection state, keyed by connection name.
+    private readonly ConcurrentDictionary<string, ConnectionHealth> _connectionStatuses = new();
+    private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
+    private readonly ConcurrentDictionary<string, string> _connectionEndpoints = new();
+    private readonly ConcurrentDictionary<string, TagQualityCounts> _tagQualityCounts = new();
+
+    // volatile like the other cross-thread reference fields, so the reference
+    // stored by SetStoreAndForwardDepths is visible to the reporting thread.
+    private volatile IReadOnlyDictionary<string, int> _sfBufferDepths = new Dictionary<string, int>();
+
+    // Gauges: overwritten by their setters and only read by CollectReport.
+    private int _deployedInstanceCount, _enabledInstanceCount, _disabledInstanceCount;
+    private int _parkedMessageCount;
+    private volatile string _nodeHostname = "";
+    private volatile IReadOnlyList<Commons.Messages.Health.NodeStatus>? _clusterNodes;
+    private volatile bool _isActiveNode;
+    private readonly TimeProvider _timeProvider;
+
+    /// <summary>
+    /// Creates a collector. The <paramref name="timeProvider"/> stamps each
+    /// report's timestamp; it defaults to <see cref="TimeProvider.System"/> and
+    /// is injectable so the report timestamp is deterministically testable —
+    /// consistent with the rest of the module's time-dependent classes.
+    /// </summary>
+    /// <param name="timeProvider">Optional custom time provider; defaults to system time.</param>
+    public SiteHealthCollector(TimeProvider? timeProvider = null)
+    {
+        _timeProvider = timeProvider ?? TimeProvider.System;
+    }
+
+    /// <inheritdoc />
+    public void IncrementScriptError()
+    {
+        Interlocked.Increment(ref _scriptErrorCount);
+    }
+
+    /// <inheritdoc />
+    public void IncrementAlarmError()
+    {
+        Interlocked.Increment(ref _alarmErrorCount);
+    }
+
+    /// <inheritdoc />
+    public void IncrementDeadLetter()
+    {
+        Interlocked.Increment(ref _deadLetterCount);
+    }
+
+    /// <inheritdoc />
+    public void IncrementSiteAuditWriteFailures()
+    {
+        Interlocked.Increment(ref _siteAuditWriteFailures);
+    }
+
+    /// <inheritdoc />
+    public void IncrementAuditRedactionFailure()
+    {
+        Interlocked.Increment(ref _auditRedactionFailures);
+    }
+
+    /// <inheritdoc />
+    /// <exception cref="ArgumentNullException"><paramref name="snapshot"/> is <c>null</c>.</exception>
+    public void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot)
+    {
+        _siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
+    }
+
+    /// <inheritdoc />
+    public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
+    {
+        _connectionStatuses[connectionName] = health;
+    }
+
+    /// <inheritdoc />
+    public void RemoveConnection(string connectionName)
+    {
+        // Drop the connection from every per-connection map so no stale entry
+        // for it shows up in later reports.
+        _connectionStatuses.TryRemove(connectionName, out _);
+        _tagResolutionCounts.TryRemove(connectionName, out _);
+        _connectionEndpoints.TryRemove(connectionName, out _);
+        _tagQualityCounts.TryRemove(connectionName, out _);
+    }
+
+    /// <inheritdoc />
+    public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved)
+    {
+        _tagResolutionCounts[connectionName] = new TagResolutionStatus(totalSubscribed, successfullyResolved);
+    }
+
+    /// <inheritdoc />
+    public void UpdateConnectionEndpoint(string connectionName, string endpoint)
+    {
+        _connectionEndpoints[connectionName] = endpoint;
+    }
+
+    /// <inheritdoc />
+    public void UpdateTagQuality(string connectionName, int good, int bad, int uncertain)
+    {
+        _tagQualityCounts[connectionName] = new TagQualityCounts(good, bad, uncertain);
+    }
+
+    /// <inheritdoc />
+    public void SetParkedMessageCount(int count)
+    {
+        Interlocked.Exchange(ref _parkedMessageCount, count);
+    }
+
+    /// <inheritdoc />
+    public void SetNodeHostname(string hostname) => _nodeHostname = hostname;
+
+    /// <inheritdoc />
+    public void SetClusterNodes(IReadOnlyList<Commons.Messages.Health.NodeStatus> nodes) => _clusterNodes = nodes;
+
+    /// <inheritdoc />
+    /// <exception cref="ArgumentNullException"><paramref name="depths"/> is <c>null</c>.</exception>
+    public void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths)
+    {
+        // Reject null here, at the call site that caused it. Otherwise the
+        // failure would only surface later, inside CollectReport on the
+        // reporting thread.
+        _sfBufferDepths = depths ?? throw new ArgumentNullException(nameof(depths));
+    }
+
+    /// <inheritdoc />
+    public void SetInstanceCounts(int deployed, int enabled, int disabled)
+    {
+        // Three independent atomic writes; the trio as a whole is not updated
+        // atomically with respect to a concurrent CollectReport.
+        Interlocked.Exchange(ref _deployedInstanceCount, deployed);
+        Interlocked.Exchange(ref _enabledInstanceCount, enabled);
+        Interlocked.Exchange(ref _disabledInstanceCount, disabled);
+    }
+
+    /// <inheritdoc />
+    public void SetActiveNode(bool isActive) => _isActiveNode = isActive;
+
+    /// <inheritdoc />
+    public bool IsActiveNode => _isActiveNode;
+
+    /// <inheritdoc />
+    public void AddIntervalCounters(
+        int scriptErrors,
+        int alarmErrors,
+        int deadLetters,
+        int siteAuditWriteFailures,
+        int auditRedactionFailures)
+    {
+        // HealthMonitoring-017: each counter is restored atomically via
+        // Interlocked.Add so an increment that arrived during the failed Send
+        // (and therefore accumulated against the zero left by CollectReport's
+        // Exchange) is correctly summed with the values being put back. No
+        // ordering between the five Adds is required — they target independent
+        // fields.
+        if (scriptErrors != 0) Interlocked.Add(ref _scriptErrorCount, scriptErrors);
+        if (alarmErrors != 0) Interlocked.Add(ref _alarmErrorCount, alarmErrors);
+        if (deadLetters != 0) Interlocked.Add(ref _deadLetterCount, deadLetters);
+        if (siteAuditWriteFailures != 0) Interlocked.Add(ref _siteAuditWriteFailures, siteAuditWriteFailures);
+        if (auditRedactionFailures != 0) Interlocked.Add(ref _auditRedactionFailures, auditRedactionFailures);
+    }
+
+    /// <inheritdoc />
+    public SiteHealthReport CollectReport(string siteId)
+    {
+        // Take every snapshot that could throw BEFORE draining the counters.
+        // If one of the copies below fails (for example because a caller is
+        // mutating the collection it handed to a setter), the per-interval
+        // counts stay on the counters and are reported next time. Draining
+        // first would lose them.
+        var connectionStatuses = new Dictionary<string, ConnectionHealth>(_connectionStatuses);
+        var tagResolution = new Dictionary<string, TagResolutionStatus>(_tagResolutionCounts);
+        var connectionEndpoints = new Dictionary<string, string>(_connectionEndpoints);
+        var tagQuality = new Dictionary<string, TagQualityCounts>(_tagQualityCounts);
+        var sfBufferDepths = new Dictionary<string, int>(_sfBufferDepths);
+        var clusterNodes = _clusterNodes?.ToList();
+        var reportTimestamp = _timeProvider.GetUtcNow();
+
+        // Determine node role from active/standby state
+        var nodeRole = _isActiveNode ? "Active" : "Standby";
+
+        // Atomically read and reset the per-interval counters. Increments that
+        // race with an Exchange land either in this report or in the next one.
+        var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0);
+        var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0);
+        var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0);
+        var siteAuditWriteFailures = Interlocked.Exchange(ref _siteAuditWriteFailures, 0);
+        var auditRedactionFailures = Interlocked.Exchange(ref _auditRedactionFailures, 0);
+
+        return new SiteHealthReport(
+            SiteId: siteId,
+            SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
+            ReportTimestamp: reportTimestamp,
+            DataConnectionStatuses: connectionStatuses,
+            TagResolutionCounts: tagResolution,
+            ScriptErrorCount: scriptErrors,
+            AlarmEvaluationErrorCount: alarmErrors,
+            StoreAndForwardBufferDepths: sfBufferDepths,
+            DeadLetterCount: deadLetters,
+            // Gauges are written with Interlocked.Exchange; Volatile.Read is
+            // the matching read so the latest published value is observed.
+            DeployedInstanceCount: Volatile.Read(ref _deployedInstanceCount),
+            EnabledInstanceCount: Volatile.Read(ref _enabledInstanceCount),
+            DisabledInstanceCount: Volatile.Read(ref _disabledInstanceCount),
+            NodeRole: nodeRole,
+            NodeHostname: _nodeHostname,
+            DataConnectionEndpoints: connectionEndpoints,
+            DataConnectionTagQuality: tagQuality,
+            ParkedMessageCount: Volatile.Read(ref _parkedMessageCount),
+            ClusterNodes: clusterNodes,
+            SiteAuditWriteFailures: siteAuditWriteFailures,
+            AuditRedactionFailure: auditRedactionFailures,
+            SiteAuditBacklog: _siteAuditBacklog);
+    }
+}
@@ -0,0 +1,47 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// The central aggregator's in-memory view of one site's health.
+/// Instances are immutable: each state transition creates a new instance, which
+/// the aggregator installs into its <c>ConcurrentDictionary</c> with an atomic
+/// compare-and-swap. Handing the reference directly to UI callers is therefore
+/// safe — a consumer can never see a torn or half-applied update.
+/// </summary>
+public sealed record SiteHealthState
+{
+    /// <summary>Gets the unique identifier of the site described by this record.</summary>
+    public required string SiteId { get; init; }
+
+    /// <summary>
+    /// Gets the most recent full <see cref="SiteHealthReport"/> received for
+    /// the site. <c>null</c> means the site is known only through heartbeats
+    /// and has not sent a report yet.
+    /// </summary>
+    public SiteHealthReport? LatestReport { get; init; }
+
+    /// <summary>
+    /// Gets the time at which the most recent full
+    /// <see cref="SiteHealthReport"/> was processed. <c>null</c> means the site
+    /// is known only through heartbeats and has not sent a report yet. The UI
+    /// uses this to surface report staleness during failover and must render
+    /// <c>null</c> as "no report yet" rather than as a timestamp (a
+    /// <c>default</c> sentinel would display as year-0001).
+    /// </summary>
+    public DateTimeOffset? LastReportReceivedAt { get; init; }
+
+    /// <summary>
+    /// Gets the time at which the latest signal of any kind — a full report OR
+    /// a heartbeat — was received. Offline detection is driven by this value:
+    /// heartbeats from the standby keep the site marked online even while the
+    /// active node cannot produce a report (mid-failover, brief stalls). The
+    /// heartbeat cadence belongs to the Cluster Infrastructure /
+    /// SiteCommunicationActor (every
+    /// CommunicationOptions.TransportHeartbeatInterval — 5s by default).
+    /// </summary>
+    public DateTimeOffset LastHeartbeatAt { get; init; }
+
+    /// <summary>
+    /// Gets the sequence number of the last health report that was accepted;
+    /// used to reject out-of-order duplicates.
+    /// </summary>
+    public long LastSequenceNumber { get; init; }
+
+    /// <summary>Gets a value indicating whether the site currently counts as online.</summary>
+    public bool IsOnline { get; init; }
+}
@@ -0,0 +1,27 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <!-- Compiler settings: targets net10.0 with implicit usings and nullable
+       reference types enabled; every warning is treated as a build error. -->
+  <PropertyGroup>
+    <TargetFramework>net10.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
+  </PropertyGroup>
+
+  <!-- Microsoft.Extensions packages used for DI, hosting, logging and options.
+       No Version attributes are given here; presumably versions are supplied
+       centrally (e.g. Directory.Packages.props) - TODO confirm. -->
+  <ItemGroup>
+    <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
+    <PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
+    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
+    <PackageReference Include="Microsoft.Extensions.Options" />
+  </ItemGroup>
+
+  <!-- Sibling projects this module builds against.
+       NOTE(review): the files in this change import Commons namespaces; no use
+       of StoreAndForward is visible in them - confirm that reference is needed. -->
+  <ItemGroup>
+    <ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
+    <ProjectReference Include="../ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj" />
+  </ItemGroup>
+
+  <!-- Grants the listed test assemblies access to this assembly's internal members. -->
+  <ItemGroup>
+    <InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests" />
+    <InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.IntegrationTests" />
+  </ItemGroup>
+
+</Project>