Files
lmxopcua/src/ZB.MOM.WW.OtOpcUa.Admin/Services/HostStatusService.cs
Joseph Doherty d06cc01a48

Admin /hosts red-badge + resilience columns + Polly telemetry observer. Closes task #164 (the remaining slice of Phase 6.1 Stream E.3 after the earlier publisher + hub PR). Three cooperating pieces wired together so the operator-facing /hosts table actually reflects the live Polly counters that the pipeline builder is producing.

DriverResiliencePipelineBuilder gains an optional DriverResilienceStatusTracker ctor param — when non-null, every built pipeline wires Polly's OnRetry/OnOpened/OnClosed strategy-options callbacks into the tracker: OnRetry → tracker.RecordFailure (so ConsecutiveFailures climbs per retry), OnOpened → tracker.RecordBreakerOpen (stamps LastCircuitBreakerOpenUtc), OnClosed → tracker.RecordSuccess (resets the failure counter once the target recovers). Absent tracker = silent, preserving the unit-test constructor path + any deployment that doesn't care about resilience observability. Cancellation stays excluded from the failure count via the existing ShouldHandle predicate.

HostStatusService.HostStatusRow extends with four new fields — ConsecutiveFailures, LastCircuitBreakerOpenUtc, CurrentBulkheadDepth, LastRecycleUtc — populated via a second LEFT JOIN onto DriverInstanceResilienceStatuses keyed on (DriverInstanceId, HostName). LEFT JOIN because brand-new hosts haven't been sampled yet; a missing row means zero failures + never-opened breaker, which is the correct default. New FailureFlagThreshold constant (= 3, matching plan decision #143's conservative half-of-breaker convention) + IsFlagged predicate so the UI can pre-warn before the breaker actually trips.

Hosts.razor paints three new columns between State and Last-transition — Fail# (bold red when flagged), In-flight (bulkhead-depth proxy), Breaker-opened (relative age) — plus a per-row "Flagged" red badge alongside State when IsFlagged is true.
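The builder's callback wiring can be sketched as below, with the tracker calls reduced to plain delegates. This is a sketch against the Polly v8 API, not the real DriverResiliencePipelineBuilder: the method shape, delegate parameters, and the retry-count/delay values are illustrative assumptions.

```csharp
using System;
using System.Threading.Tasks;
using Polly;
using Polly.CircuitBreaker;
using Polly.Retry;

static class PipelineWiringSketch
{
    // recordFailure / recordBreakerOpen / recordSuccess stand in for the tracker's
    // RecordFailure / RecordBreakerOpen / RecordSuccess. Null delegates model the
    // "absent tracker = silent" path.
    public static ResiliencePipeline Build(
        Action? recordFailure, Action? recordBreakerOpen, Action? recordSuccess)
    {
        return new ResiliencePipelineBuilder()
            .AddRetry(new RetryStrategyOptions
            {
                MaxRetryAttempts = 3,          // illustrative values, not the real config
                Delay = TimeSpan.Zero,
                // Cancellation stays excluded from the failure count.
                ShouldHandle = args => new ValueTask<bool>(
                    args.Outcome.Exception is not null and not OperationCanceledException),
                OnRetry = _ =>
                {
                    recordFailure?.Invoke();   // ConsecutiveFailures climbs per retry
                    return default;
                }
            })
            .AddCircuitBreaker(new CircuitBreakerStrategyOptions
            {
                // stamp LastCircuitBreakerOpenUtc when the breaker trips
                OnOpened = _ => { recordBreakerOpen?.Invoke(); return default; },
                // target recovered: reset the failure counter
                OnClosed = _ => { recordSuccess?.Invoke(); return default; }
            })
            .Build();
    }
}
```

A transient-forever callback then drives one `recordFailure` per retry attempt, which is exactly the counter behaviour the /hosts Fail# column surfaces.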
Above the first cluster table, a red alert banner summarises the flagged-host count when ≥1 host is flagged, so operators see the problem before scanning rows.

Three new tests in DriverResiliencePipelineBuilderTests: Tracker_RecordsFailure_OnEveryRetry verifies ConsecutiveFailures reaches RetryCount after a transient-forever operation; Tracker_StampsBreakerOpen_WhenBreakerTrips verifies LastCircuitBreakerOpenUtc is set after threshold failures on a Write pipeline; Tracker_IsolatesCounters_PerHost verifies one dead host does not leak failure counts into a healthy sibling.

Full suite: Core.Tests 14/14 resilience-builder tests passing (11 existing + 3 new), Admin.Tests 72/72 passing, Admin project builds with 0 errors.

SignalR live push of status changes + browser visual review are deliberately left to a follow-up — this PR keeps the structural change minimal (polling refresh already exists in the page's 10s timer; SignalR would be a structural add that touches hub registration + client subscription).
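The counter semantics those three tests pin down — climb per retry, reset once the target recovers, per-(DriverInstanceId, HostName) isolation — can be sketched with a minimal in-memory tracker. The class and member names below are assumptions modelled on the commit description, not the real DriverResilienceStatusTracker.

```csharp
using System;
using System.Collections.Concurrent;

// Minimal stand-in for the tracker, keyed per (DriverInstanceId, HostName) so one
// dead host cannot leak failure counts into a healthy sibling.
public sealed class ResilienceCounterSketch
{
    private sealed class Entry
    {
        public int ConsecutiveFailures;
        public DateTime? LastCircuitBreakerOpenUtc;
    }

    private readonly ConcurrentDictionary<(string Instance, string Host), Entry> _entries = new();

    private Entry For(string instance, string host) =>
        _entries.GetOrAdd((instance, host), _ => new Entry());

    // OnRetry callback target: one increment per retry attempt.
    public void RecordFailure(string instance, string host) =>
        For(instance, host).ConsecutiveFailures++;

    // OnOpened callback target: stamp when the breaker tripped.
    public void RecordBreakerOpen(string instance, string host) =>
        For(instance, host).LastCircuitBreakerOpenUtc = DateTime.UtcNow;

    // OnClosed callback target: the target recovered, so the counter resets.
    public void RecordSuccess(string instance, string host) =>
        For(instance, host).ConsecutiveFailures = 0;

    public int Failures(string instance, string host) =>
        For(instance, host).ConsecutiveFailures;

    public DateTime? BreakerOpenedUtc(string instance, string host) =>
        For(instance, host).LastCircuitBreakerOpenUtc;
}
```

A missing key behaves like the LEFT JOIN's missing row: zero failures and a never-opened breaker.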
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 21:35:54 -04:00


using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;

namespace ZB.MOM.WW.OtOpcUa.Admin.Services;

/// <summary>
/// One row per <see cref="DriverHostStatus"/> record, enriched with the owning
/// <c>ClusterNode.ClusterId</c> (left-join) + the per-<c>(DriverInstanceId, HostName)</c>
/// <see cref="DriverInstanceResilienceStatus"/> counters (also left-join) so the Admin
/// <c>/hosts</c> page renders the resilience surface inline with host state.
/// </summary>
public sealed record HostStatusRow(
    string NodeId,
    string? ClusterId,
    string DriverInstanceId,
    string HostName,
    DriverHostState State,
    DateTime StateChangedUtc,
    DateTime LastSeenUtc,
    string? Detail,
    int ConsecutiveFailures,
    DateTime? LastCircuitBreakerOpenUtc,
    int CurrentBulkheadDepth,
    DateTime? LastRecycleUtc);

/// <summary>
/// Read-side service for the Admin UI's per-host drill-down. Loads
/// <see cref="DriverHostStatus"/> rows (written by the Server process's
/// <c>HostStatusPublisher</c>) and left-joins <c>ClusterNode</c> so each row knows which
/// cluster it belongs to — the Admin UI groups by cluster for the fleet-wide view.
/// </summary>
/// <remarks>
/// The publisher heartbeat is 10s (<c>HostStatusPublisher.HeartbeatInterval</c>). The
/// Admin page also polls every ~10s and treats rows with <c>LastSeenUtc</c> older than
/// <c>StaleThreshold</c> (30s) as stale — covers a missed heartbeat tolerance plus
/// a generous buffer for clock skew and publisher GC pauses.
/// </remarks>
public sealed class HostStatusService(OtOpcUaConfigDbContext db)
{
    public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);

    /// <summary>Consecutive-failure threshold at which <see cref="IsFlagged"/> returns <c>true</c>
    /// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative
    /// half-of-breaker-threshold convention — flags before the breaker actually opens.</summary>
    public const int FailureFlagThreshold = 3;

    public async Task<IReadOnlyList<HostStatusRow>> ListAsync(CancellationToken ct = default)
    {
        // Two LEFT JOINs:
        //   1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row
        //      hasn't been created yet (first-boot bootstrap case).
        //   2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience
        //      counters haven't been sampled yet for brand-new hosts, so a missing row means
        //      zero failures + never-opened breaker.
        var rows = await (from s in db.DriverHostStatuses.AsNoTracking()
                          join n in db.ClusterNodes.AsNoTracking()
                              on s.NodeId equals n.NodeId into nodeJoin
                          from n in nodeJoin.DefaultIfEmpty()
                          join r in db.DriverInstanceResilienceStatuses.AsNoTracking()
                              on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin
                          from r in resilJoin.DefaultIfEmpty()
                          orderby s.NodeId, s.DriverInstanceId, s.HostName
                          select new HostStatusRow(
                              s.NodeId,
                              n != null ? n.ClusterId : null,
                              s.DriverInstanceId,
                              s.HostName,
                              s.State,
                              s.StateChangedUtc,
                              s.LastSeenUtc,
                              s.Detail,
                              r != null ? r.ConsecutiveFailures : 0,
                              r != null ? r.LastCircuitBreakerOpenUtc : null,
                              r != null ? r.CurrentBulkheadDepth : 0,
                              r != null ? r.LastRecycleUtc : null)).ToListAsync(ct);
        return rows;
    }

    public static bool IsStale(HostStatusRow row) =>
        DateTime.UtcNow - row.LastSeenUtc > StaleThreshold;

    /// <summary>
    /// Red-badge predicate — <c>true</c> when the host has accumulated enough consecutive
    /// failures that an operator should take notice before the breaker trips.
    /// </summary>
    public static bool IsFlagged(HostStatusRow row) =>
        row.ConsecutiveFailures >= FailureFlagThreshold;
}