lmxopcua/src/ZB.MOM.WW.OtOpcUa.Server/HostStatusPublisher.cs

using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Core.Hosting;

namespace ZB.MOM.WW.OtOpcUa.Server;

/// <summary>
///     Walks every registered driver once per heartbeat interval, asks each
///     <see cref="IHostConnectivityProbe"/>-capable driver for its current
///     <see cref="HostConnectivityStatus"/> list, and upserts one
///     <see cref="DriverHostStatus"/> row per (NodeId, DriverInstanceId, HostName) into the
///     central config DB. Powers the Admin UI's per-host drill-down page (LMX follow-up #7).
/// </summary>
/// <remarks>
///     <para>
///         Polling rather than event-driven: simpler, and matches the cadence the Admin UI
///         consumes. An event-subscription optimization (push on <c>OnHostStatusChanged</c> for
///         immediate reflection) is a straightforward follow-up but adds lifecycle complexity
///         — drivers can be registered after the publisher starts, and subscribing to each
///         one's event on register + unsubscribing on unregister requires DriverHost to expose
///         lifecycle events it doesn't today.
///     </para>
///     <para>
///         <see cref="DriverHostStatus.LastSeenUtc"/> advances every heartbeat so the Admin UI
///         can flag stale rows from a crashed Server process independent of
///         <see cref="DriverHostStatus.State"/> — a Faulted publisher that stops heartbeating
///         stays Faulted in the DB but its LastSeenUtc ages out, which is the signal
///         operators actually want.
///     </para>
///     <para>
///         If the DB is unreachable on a given tick, the publisher logs and moves on — it
///         does not retry or buffer. The next heartbeat picks up the current-state snapshot,
///         which is more useful than replaying stale transitions after a long outage.
///     </para>
///     <para>
///         Only cancellation of the host's stopping token ends the loop. A cancellation
///         raised for any other reason during a tick is treated like every other tick
///         failure: logged, then retried on the next heartbeat.
///     </para>
/// </remarks>
public sealed class HostStatusPublisher(
    DriverHost driverHost,
    NodeOptions nodeOptions,
    IServiceScopeFactory scopeFactory,
    ILogger<HostStatusPublisher> logger) : BackgroundService
{
    /// <summary>Delay between the end of one publish tick and the start of the next.</summary>
    internal static readonly TimeSpan HeartbeatInterval = TimeSpan.FromSeconds(10);

    /// <summary>Grace period before the first tick so startup driver registration can land.</summary>
    private static readonly TimeSpan StartupDelay = TimeSpan.FromSeconds(2);

    /// <summary>
    ///     Runs the publish loop: one short startup delay, then a tick followed by a
    ///     <see cref="HeartbeatInterval"/> wait, repeated until <paramref name="stoppingToken"/>
    ///     is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signals host shutdown; cancelling it ends the loop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Wait a short moment at startup so NodeBootstrap's RegisterAsync calls have had a
        // chance to land. First tick runs immediately after so a freshly-started Server
        // surfaces its host topology in the Admin UI without waiting a full interval.
        try { await Task.Delay(StartupDelay, stoppingToken); }
        catch (OperationCanceledException) { return; }

        while (!stoppingToken.IsCancellationRequested)
        {
            try { await PublishOnceAsync(stoppingToken); }
            // Exit only when shutdown was actually requested. A cancellation that did not
            // originate from stoppingToken must not silently end the publisher for the rest
            // of the process lifetime, so it falls through to the generic handler below.
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) { return; }
            catch (Exception ex)
            {
                // Never take down the Server on a publisher failure. Log and continue —
                // stale-row detection on the Admin side will surface the outage.
                logger.LogWarning(ex, "Host-status publisher tick failed — will retry next heartbeat");
            }

            try { await Task.Delay(HeartbeatInterval, stoppingToken); }
            catch (OperationCanceledException) { return; }
        }
    }

    /// <summary>
    ///     Performs a single publish tick: collects host statuses from every registered
    ///     driver that implements <see cref="IHostConnectivityProbe"/> and saves the resulting
    ///     inserts and updates with one <c>SaveChangesAsync</c> call.
    /// </summary>
    /// <param name="ct">Cancellation token passed to the database calls.</param>
    /// <remarks>
    ///     A driver whose <c>GetHostStatuses</c> call throws is logged and skipped for this
    ///     tick; the remaining drivers are still processed. Returns without touching the
    ///     database when no drivers are registered.
    /// </remarks>
    internal async Task PublishOnceAsync(CancellationToken ct)
    {
        var driverIds = driverHost.RegisteredDriverIds;
        if (driverIds.Count == 0) return;

        // One timestamp for the whole tick so every row written together shares LastSeenUtc.
        var now = DateTime.UtcNow;

        // Fresh scope (and therefore a fresh context) per tick; disposed asynchronously.
        await using var scope = scopeFactory.CreateAsyncScope();
        var db = scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>();

        foreach (var driverId in driverIds)
        {
            var driver = driverHost.GetDriver(driverId);
            if (driver is not IHostConnectivityProbe probe) continue;

            IReadOnlyList<HostConnectivityStatus> statuses;
            try { statuses = probe.GetHostStatuses(); }
            catch (Exception ex)
            {
                logger.LogWarning(ex, "Driver {DriverId} GetHostStatuses threw — skipping this tick", driverId);
                continue;
            }

            // Collapse exact-duplicate host names (last entry wins). Without this, two
            // entries for a host that has no row yet would each miss the lookup — the first
            // insert is still pending — and queue two inserts for the same key, failing the
            // save for every driver in this tick.
            var latestByHost = new Dictionary<string, HostConnectivityStatus>(StringComparer.Ordinal);
            foreach (var status in statuses)
            {
                latestByHost[status.HostName] = status;
            }

            if (latestByHost.Count != statuses.Count)
            {
                logger.LogWarning(
                    "Driver {DriverId} reported {Reported} host statuses but only {Distinct} distinct host names — keeping the last entry per host",
                    driverId, statuses.Count, latestByHost.Count);
            }

            foreach (var status in latestByHost.Values)
            {
                await UpsertAsync(db, driverId, status, now, ct);
            }
        }

        await db.SaveChangesAsync(ct);
    }

    /// <summary>
    ///     Stages an insert or update of the <see cref="DriverHostStatus"/> row identified by
    ///     this node's id, <paramref name="driverId"/> and the status's host name. Nothing is
    ///     written until the caller saves the context.
    /// </summary>
    /// <param name="db">Context the change is staged on.</param>
    /// <param name="driverId">Driver instance the status belongs to.</param>
    /// <param name="status">Status reported by the driver for one host.</param>
    /// <param name="now">Timestamp stored as <see cref="DriverHostStatus.LastSeenUtc"/>.</param>
    /// <param name="ct">Cancellation token for the lookup query.</param>
    private async Task UpsertAsync(OtOpcUaConfigDbContext db, string driverId,
        HostConnectivityStatus status, DateTime now, CancellationToken ct)
    {
        var mapped = MapState(status.State);
        var existing = await db.DriverHostStatuses.SingleOrDefaultAsync(r =>
            r.NodeId == nodeOptions.NodeId
            && r.DriverInstanceId == driverId
            && r.HostName == status.HostName, ct);

        if (existing is null)
        {
            db.DriverHostStatuses.Add(new DriverHostStatus
            {
                NodeId = nodeOptions.NodeId,
                DriverInstanceId = driverId,
                HostName = status.HostName,
                State = mapped,
                StateChangedUtc = status.LastChangedUtc,
                LastSeenUtc = now,
            });
            return;
        }

        // LastSeenUtc always advances; State and StateChangedUtc move only on a transition.
        existing.LastSeenUtc = now;
        if (existing.State != mapped)
        {
            existing.State = mapped;
            existing.StateChangedUtc = status.LastChangedUtc;
        }
    }

    /// <summary>
    ///     Translates a driver-reported <see cref="HostState"/> into the persisted
    ///     <see cref="DriverHostState"/>. Any value other than Running, Stopped or Faulted
    ///     maps to <see cref="DriverHostState.Unknown"/>.
    /// </summary>
    /// <param name="state">State reported by the driver.</param>
    /// <returns>The matching persisted state.</returns>
    internal static DriverHostState MapState(HostState state) => state switch
    {
        HostState.Running => DriverHostState.Running,
        HostState.Stopped => DriverHostState.Stopped,
        HostState.Faulted => DriverHostState.Faulted,
        _ => DriverHostState.Unknown,
    };
}