using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
///
/// One row per record, enriched with the owning
/// ClusterNode.ClusterId (left-join) + the per-(DriverInstanceId, HostName)
/// counters (also left-join) so the Admin
/// /hosts page renders the resilience surface inline with host state.
///
public sealed record HostStatusRow(
string NodeId,
string? ClusterId,
string DriverInstanceId,
string HostName,
DriverHostState State,
DateTime StateChangedUtc,
DateTime LastSeenUtc,
string? Detail,
int ConsecutiveFailures,
DateTime? LastCircuitBreakerOpenUtc,
int CurrentBulkheadDepth,
DateTime? LastRecycleUtc);
///
/// Read-side service for the Admin UI's per-host drill-down. Loads
/// rows (written by the Server process's
/// HostStatusPublisher) and left-joins ClusterNode so each row knows which
/// cluster it belongs to — the Admin UI groups by cluster for the fleet-wide view.
///
///
/// The publisher heartbeat is 10s (HostStatusPublisher.HeartbeatInterval). The
/// Admin page also polls every ~10s and treats rows with LastSeenUtc older than
/// StaleThreshold (30s) as stale — covers a missed heartbeat tolerance plus
/// a generous buffer for clock skew and publisher GC pauses.
///
public sealed class HostStatusService(OtOpcUaConfigDbContext db)
{
public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);
/// Consecutive-failure threshold at which returns true
/// so the Admin UI can paint a red badge. Matches Phase 6.1 decision #143's conservative
/// half-of-breaker-threshold convention — flags before the breaker actually opens.
public const int FailureFlagThreshold = 3;
public async Task> ListAsync(CancellationToken ct = default)
{
// Two LEFT JOINs:
// 1. ClusterNodes on NodeId — row persists even when its owning ClusterNode row
// hasn't been created yet (first-boot bootstrap case).
// 2. DriverInstanceResilienceStatuses on (DriverInstanceId, HostName) — resilience
// counters haven't been sampled yet for brand-new hosts, so a missing row means
// zero failures + never-opened breaker.
var rows = await (from s in db.DriverHostStatuses.AsNoTracking()
join n in db.ClusterNodes.AsNoTracking()
on s.NodeId equals n.NodeId into nodeJoin
from n in nodeJoin.DefaultIfEmpty()
join r in db.DriverInstanceResilienceStatuses.AsNoTracking()
on new { s.DriverInstanceId, s.HostName } equals new { r.DriverInstanceId, r.HostName } into resilJoin
from r in resilJoin.DefaultIfEmpty()
orderby s.NodeId, s.DriverInstanceId, s.HostName
select new HostStatusRow(
s.NodeId,
n != null ? n.ClusterId : null,
s.DriverInstanceId,
s.HostName,
s.State,
s.StateChangedUtc,
s.LastSeenUtc,
s.Detail,
r != null ? r.ConsecutiveFailures : 0,
r != null ? r.LastCircuitBreakerOpenUtc : null,
r != null ? r.CurrentBulkheadDepth : 0,
r != null ? r.LastRecycleUtc : null)).ToListAsync(ct);
return rows;
}
public static bool IsStale(HostStatusRow row) =>
DateTime.UtcNow - row.LastSeenUtc > StaleThreshold;
///
/// Red-badge predicate — true when the host has accumulated enough consecutive
/// failures that an operator should take notice before the breaker trips.
///
public static bool IsFlagged(HostStatusRow row) =>
row.ConsecutiveFailures >= FailureFlagThreshold;
}