ScadaBridge/src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/HealthReportSender.cs

using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
using ZB.MOM.WW.ScadaBridge.StoreAndForward;

namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;

/// <summary>
/// Periodically collects a SiteHealthReport and sends it to central via Akka remoting.
/// Sequence numbers are monotonic and reset on service restart. They are <b>not</b>
/// zero/one-based: the per-process counter is seeded with the current Unix epoch
/// (milliseconds) at construction, and is raised to the current Unix epoch again
/// whenever this node is observed going from standby to active. The goal is that,
/// after a failover, reports from a freshly-active node sort after reports from any
/// prior active node for the same site — otherwise the central aggregator's
/// sequence-number guard would silently reject the new active's first reports as stale.
/// </summary>
/// <remarks>
/// Seeding at construction alone is not sufficient: this service keeps ticking while
/// the node is standby, so its construction-time seed can be older than the active
/// node's seed, or can be overtaken by the active node's counter (which grows by one
/// per report). Re-seeding on the standby-to-active transition closes that gap.
/// NOTE(review): this relies on the nodes' clocks being roughly in sync — confirm.
/// </remarks>
public class HealthReportSender : BackgroundService
{
    private readonly ISiteHealthCollector _collector;
    private readonly IHealthReportTransport _transport;
    private readonly HealthMonitoringOptions _options;
    private readonly ILogger<HealthReportSender> _logger;
    private readonly string _siteId;
    private readonly StoreAndForwardStorage? _sfStorage;
    private readonly IClusterNodeProvider? _clusterNodeProvider;

    // Clock used both for the initial seed and for re-seeding on failover.
    // Injected so the seeding is deterministically testable.
    private readonly TimeProvider _timeProvider;

    // Seeded with Unix-ms at construction and raised to Unix-ms again on every
    // observed standby -> active transition, so reports from a freshly-active
    // node sort after reports from any prior active node for the same site.
    // Without this, failover could silently drop the new active's first
    // reports because its per-process counter sits below the prior active's
    // last sequence number.
    private long _sequenceNumber;

    // True once a tick has seen this node as standby. Only touched by the
    // ExecuteAsync loop, so it needs no synchronization.
    private bool _observedStandby;

    /// <summary>Initializes the sender, seeds the monotonic sequence number from the current Unix timestamp.</summary>
    /// <param name="collector">Site health metric collector supplying the report payload.</param>
    /// <param name="transport">Transport used to send the health report to central.</param>
    /// <param name="options">Health monitoring options including the report interval.</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="siteIdentityProvider">Provides the site identifier embedded in each report.</param>
    /// <param name="sfStorage">Optional store-and-forward storage for queue depth metrics.</param>
    /// <param name="clusterNodeProvider">Optional cluster node provider for active-node detection.</param>
    /// <param name="timeProvider">Optional time provider; defaults to <see cref="TimeProvider.System"/>.</param>
    public HealthReportSender(
        ISiteHealthCollector collector,
        IHealthReportTransport transport,
        IOptions<HealthMonitoringOptions> options,
        ILogger<HealthReportSender> logger,
        ISiteIdentityProvider siteIdentityProvider,
        StoreAndForwardStorage? sfStorage = null,
        IClusterNodeProvider? clusterNodeProvider = null,
        TimeProvider? timeProvider = null)
    {
        _collector = collector;
        _transport = transport;
        _options = options.Value;
        _logger = logger;
        _siteId = siteIdentityProvider.SiteId;
        _sfStorage = sfStorage;
        _clusterNodeProvider = clusterNodeProvider;
        _timeProvider = timeProvider ?? TimeProvider.System;
        _sequenceNumber = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds();
    }

    /// <summary>
    /// Current sequence number (for testing).
    /// </summary>
    public long CurrentSequenceNumber => Interlocked.Read(ref _sequenceNumber);

    /// <summary>
    /// Runs the reporting loop: on every <see cref="HealthMonitoringOptions.ReportInterval"/>
    /// tick, and only while this node is active, refreshes the collector's cluster-node
    /// and store-and-forward metrics, collects a report, stamps it with the next
    /// sequence number and sends it. A failure in one tick is logged and does not
    /// stop subsequent ticks.
    /// </summary>
    /// <param name="stoppingToken">Signals that the host is shutting down; ends the timer wait.</param>
    /// <returns>A task that completes when the loop ends.</returns>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Health report sender starting for site {SiteId}, interval {Interval}s",
            _siteId, _options.ReportInterval.TotalSeconds);

        using var timer = new PeriodicTimer(_options.ReportInterval);

        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
        {
            try
            {
                // Only the active node (running the DeploymentManager singleton) sends health reports.
                // The standby node has no instance/connection data and would overwrite the active's report.
                if (!_collector.IsActiveNode)
                {
                    // Remember the standby period so the counter is re-seeded
                    // when this node takes over.
                    _observedStandby = true;
                    continue;
                }

                if (_observedStandby)
                {
                    // Standby -> active transition: lift the counter to "now"
                    // so it is not below the prior active node's counter.
                    RaiseSequenceNumberToNow();
                    _observedStandby = false;
                }

                if (_clusterNodeProvider != null)
                {
                    try
                    {
                        _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());
                    }
                    catch (Exception ex)
                    {
                        // Non-fatal — the report ships with the previous cluster
                        // node list. Logged so a persistent failure is diagnosable.
                        _logger.LogWarning(ex,
                            "Failed to refresh cluster nodes for health report (site {SiteId}); using stale list",
                            _siteId);
                    }
                }

                if (_sfStorage != null)
                {
                    try
                    {
                        var parkedCount = await _sfStorage.GetParkedMessageCountAsync().ConfigureAwait(false);
                        _collector.SetParkedMessageCount(parkedCount);
                    }
                    catch (Exception ex)
                    {
                        // Non-fatal — the parked count is simply not refreshed for this report.
                        _logger.LogWarning(ex,
                            "Failed to query parked message count for health report (site {SiteId})",
                            _siteId);
                    }

                    try
                    {
                        // Per-category pending-message buffer depths (the documented
                        // "store-and-forward buffer depth" triage metric). Keyed by
                        // StoreAndForwardCategory name so the central dashboard can
                        // render external/notification/DB-write depths separately.
                        var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync().ConfigureAwait(false);
                        var depths = depthsByCategory.ToDictionary(
                            kvp => kvp.Key.ToString(),
                            kvp => kvp.Value);
                        _collector.SetStoreAndForwardDepths(depths);
                    }
                    catch (Exception ex)
                    {
                        // Non-fatal — the buffer depths are simply not refreshed for this report.
                        _logger.LogWarning(ex,
                            "Failed to query store-and-forward buffer depths for health report (site {SiteId})",
                            _siteId);
                    }
                }

                var seq = Interlocked.Increment(ref _sequenceNumber);

                // HealthMonitoring-017: CollectReport atomically read-and-resets
                // the per-interval error counters via Interlocked.Exchange. If
                // the Send below throws, those counts are otherwise lost
                // forever — neither in the un-sent report nor in the now-zeroed
                // collector. Snapshot the freshly-collected report so that on a
                // transport failure we can atomically restore the counts back
                // into the collector via Interlocked.Add, so the next
                // successful report includes them. Concurrent increments
                // arriving during the Send are preserved on the counter (they
                // accumulate against zero); the restore Add safely sums with
                // any such concurrent increments.
                var report = _collector.CollectReport(_siteId);

                // Replace the placeholder sequence number with our monotonic one
                var reportWithSeq = report with { SequenceNumber = seq };

                try
                {
                    _transport.Send(reportWithSeq);
                }
                catch
                {
                    // Restore the captured per-interval counters atomically so
                    // they roll forward into the next report — see
                    // HealthMonitoring-017. Any concurrent increment that
                    // arrived during the failed Send remains on the counter;
                    // Interlocked.Add sums correctly with it.
                    _collector.AddIntervalCounters(
                        scriptErrors: report.ScriptErrorCount,
                        alarmErrors: report.AlarmEvaluationErrorCount,
                        deadLetters: report.DeadLetterCount,
                        siteAuditWriteFailures: report.SiteAuditWriteFailures,
                        auditRedactionFailures: report.AuditRedactionFailure);
                    throw;
                }

                _logger.LogInformation("Sent health report #{Seq} for site {SiteId}", seq, _siteId);
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to send health report for site {SiteId}", _siteId);
                // Continue sending — don't let a single failure stop reporting
            }
        }
    }

    /// <summary>
    /// Raises the sequence counter to the current Unix time in milliseconds if that
    /// is greater than the counter's present value. The counter is never lowered,
    /// so sequence numbers stay monotonic even if the clock reads earlier than the
    /// counter.
    /// </summary>
    private void RaiseSequenceNumberToNow()
    {
        var nowMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds();

        long current;
        do
        {
            current = Interlocked.Read(ref _sequenceNumber);
            if (nowMs <= current)
            {
                return;
            }
        }
        while (Interlocked.CompareExchange(ref _sequenceNumber, nowMs, current) != current);
    }
}