ScadaBridge/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/CentralHealthReportLoop.cs

using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;

/// <summary>
/// Central-side counterpart to <see cref="HealthReportSender"/>.
/// Periodically builds a SiteHealthReport for the central cluster itself
/// (siteId = <see cref="CentralSiteId"/>) and feeds it into the local
/// CentralHealthAggregator so the UI can render central as another card
/// on /monitoring/health. Only the cluster leader (Primary) generates
/// reports — the standby's aggregator catches up on failover when it
/// becomes Primary and starts its own loop.
/// </summary>
public class CentralHealthReportLoop : BackgroundService
{
    /// <summary>
    /// Reserved siteId used to represent the central cluster in the
    /// shared CentralHealthAggregator keyspace.
    ///
    /// HealthMonitoring-021: the value is prefixed with <c>$</c> — a character
    /// that is forbidden in real site identifiers (the configuration /
    /// repository layer only permits Sites whose <c>SiteIdentifier</c> is a
    /// plain identifier) — so the synthetic central entry cannot collide with
    /// a real site whose operator-set identifier happened to be the bare word
    /// "central". A collision would have caused the two reports to clobber
    /// each other in the aggregator keyspace via the sequence-number guard,
    /// and the real site would inherit the longer
    /// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> grace and
    /// stay falsely-online for an extra two minutes after going down.
    /// Consumers (<see cref="CentralHealthAggregator.CheckForOfflineSites"/>,
    /// the Central UI health dashboard) reference this constant rather than
    /// the literal string, so the change is local.
    /// </summary>
    public const string CentralSiteId = "$central";

    private readonly ISiteHealthCollector _collector;
    private readonly ICentralHealthAggregator _aggregator;
    private readonly IClusterNodeProvider _clusterNodeProvider;
    private readonly HealthMonitoringOptions _options;
    private readonly ILogger<CentralHealthReportLoop> _logger;

    // Seeded with Unix-ms so reports from a newly-elected central leader
    // always sort after reports from any prior leader for siteId="central".
    // The clock is read through the injected TimeProvider so the seeding is
    // deterministically testable.
    private long _sequenceNumber;

    /// <summary>
    /// Initializes the central health report loop.
    /// </summary>
    /// <param name="collector">Local health metrics collector for the central node.</param>
    /// <param name="aggregator">Aggregator that stores reports for the Central UI health dashboard.</param>
    /// <param name="clusterNodeProvider">Provider used to determine whether this node is primary.</param>
    /// <param name="options">Health monitoring configuration (report interval, offline threshold).</param>
    /// <param name="logger">Logger for diagnostics.</param>
    /// <param name="timeProvider">Optional time provider; defaults to <see cref="TimeProvider.System"/>.</param>
    public CentralHealthReportLoop(
        ISiteHealthCollector collector,
        ICentralHealthAggregator aggregator,
        IClusterNodeProvider clusterNodeProvider,
        IOptions<HealthMonitoringOptions> options,
        ILogger<CentralHealthReportLoop> logger,
        TimeProvider? timeProvider = null)
    {
        _collector = collector;
        _aggregator = aggregator;
        _clusterNodeProvider = clusterNodeProvider;
        _options = options.Value;
        _logger = logger;
        _sequenceNumber = (timeProvider ?? TimeProvider.System).GetUtcNow().ToUnixTimeMilliseconds();
    }

    /// <summary>
    /// Current sequence number (for testing).
    /// </summary>
    public long CurrentSequenceNumber => Interlocked.Read(ref _sequenceNumber);

    /// <inheritdoc />
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Central health report loop starting, interval {Interval}s",
            _options.ReportInterval.TotalSeconds);

        using var timer = new PeriodicTimer(_options.ReportInterval);

        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
        {
            try
            {
                var isPrimary = _clusterNodeProvider.SelfIsPrimary;
                _collector.SetActiveNode(isPrimary);

                if (!isPrimary)
                    continue;

                _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());

                var seq = Interlocked.Increment(ref _sequenceNumber);

                // HealthMonitoring-018: CollectReport atomically read-and-resets
                // the per-interval error counters via Interlocked.Exchange. If
                // ProcessReport throws (or any other failure occurs between the
                // collect and the publish), those counts would otherwise be
                // lost — neither in the un-published report nor in the
                // now-zeroed collector. Snapshot the freshly-collected report
                // so that on a publish failure we can atomically restore the
                // counts back into the shared SiteHealthCollector via
                // Interlocked.Add. Concurrent increments arriving during the
                // ProcessReport call are preserved on the counter; the restore
                // Add safely sums with any such concurrent increments. Same
                // shape as the HealthMonitoring-017 fix in HealthReportSender.
                var report = _collector.CollectReport(CentralSiteId);
                var reportWithSeq = report with { SequenceNumber = seq };

                try
                {
                    _aggregator.ProcessReport(reportWithSeq);
                }
                catch
                {
                    // Restore the captured per-interval counters atomically so
                    // they roll forward into the next report — see
                    // HealthMonitoring-018.
                    _collector.AddIntervalCounters(
                        scriptErrors: report.ScriptErrorCount,
                        alarmErrors: report.AlarmEvaluationErrorCount,
                        deadLetters: report.DeadLetterCount,
                        siteAuditWriteFailures: report.SiteAuditWriteFailures,
                        auditRedactionFailures: report.AuditRedactionFailure);
                    throw;
                }

                _logger.LogDebug("Generated central health report #{Seq}", seq);
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to generate central health report");
            }
        }
    }
}