91 lines
4.4 KiB
C#
91 lines
4.4 KiB
C#
using Microsoft.EntityFrameworkCore;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
|
|
|
|
/// <summary>
/// Flattened projection of a single <see cref="DriverHostStatus"/> record for the Admin
/// <c>/hosts</c> page. In addition to the host state itself, each row carries the owning
/// <c>ClusterNode.ClusterId</c> and the <see cref="DriverInstanceResilienceStatus"/>
/// counters keyed by <c>(DriverInstanceId, HostName)</c>. Both are obtained through
/// left-joins, so the resilience surface can be rendered inline with host state even when
/// the joined rows do not exist.
/// </summary>
/// <param name="NodeId">Node identifier taken from the host-status record.</param>
/// <param name="ClusterId">
/// Cluster the node belongs to; <c>null</c> when no matching <c>ClusterNode</c> row was found.
/// </param>
/// <param name="DriverInstanceId">Driver instance identifier taken from the host-status record.</param>
/// <param name="HostName">Host name taken from the host-status record.</param>
/// <param name="State">Host state as recorded on the host-status record.</param>
/// <param name="StateChangedUtc">State-change timestamp copied from the host-status record.</param>
/// <param name="LastSeenUtc">
/// Last-seen timestamp copied from the host-status record; the value staleness checks compare against.
/// </param>
/// <param name="Detail">Optional detail text copied from the host-status record.</param>
/// <param name="ConsecutiveFailures">
/// Consecutive-failure counter from the resilience row; <c>0</c> when no resilience row exists.
/// </param>
/// <param name="LastCircuitBreakerOpenUtc">
/// Value copied from the resilience row; <c>null</c> when no resilience row exists.
/// </param>
/// <param name="CurrentBulkheadDepth">
/// Bulkhead-depth counter from the resilience row; <c>0</c> when no resilience row exists.
/// </param>
/// <param name="LastRecycleUtc">
/// Value copied from the resilience row; <c>null</c> when no resilience row exists.
/// </param>
public sealed record HostStatusRow(
    string NodeId,
    string? ClusterId,
    string DriverInstanceId,
    string HostName,
    DriverHostState State,
    DateTime StateChangedUtc,
    DateTime LastSeenUtc,
    string? Detail,
    int ConsecutiveFailures,
    DateTime? LastCircuitBreakerOpenUtc,
    int CurrentBulkheadDepth,
    DateTime? LastRecycleUtc);
|
|
|
|
/// <summary>
/// Query service behind the Admin UI's per-host drill-down. It reads
/// <see cref="DriverHostStatus"/> rows (written by the Server process's
/// <c>HostStatusPublisher</c>) and left-joins <c>ClusterNode</c> so every row carries the
/// cluster it belongs to — the Admin UI groups by cluster for the fleet-wide view.
/// </summary>
/// <remarks>
/// The publisher heartbeat is 10s (<c>HostStatusPublisher.HeartbeatInterval</c>) and the
/// Admin page polls at roughly the same cadence. Rows whose <c>LastSeenUtc</c> is older
/// than <see cref="StaleThreshold"/> (30s) are treated as stale; that window tolerates a
/// missed heartbeat and leaves a generous buffer for clock skew and publisher GC pauses.
/// </remarks>
public sealed class HostStatusService(OtOpcUaConfigDbContext db)
{
    /// <summary>
    /// Maximum age of <see cref="HostStatusRow.LastSeenUtc"/> before <see cref="IsStale"/>
    /// reports the row as stale.
    /// </summary>
    public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Number of consecutive failures at which <see cref="IsFlagged"/> starts returning
    /// <c>true</c>, letting the Admin UI paint a red badge. Follows Phase 6.1 decision #143's
    /// conservative half-of-breaker-threshold convention, so the flag is raised before the
    /// breaker actually opens.
    /// </summary>
    public const int FailureFlagThreshold = 3;

    /// <summary>
    /// Loads every host-status record, enriched with cluster and resilience data, ordered by
    /// node id, then driver instance id, then host name.
    /// </summary>
    /// <param name="ct">Token used to cancel the database query.</param>
    /// <returns>One <see cref="HostStatusRow"/> per host-status record.</returns>
    public async Task<IReadOnlyList<HostStatusRow>> ListAsync(CancellationToken ct = default)
    {
        // Both joins are LEFT JOINs, so a host-status record is never dropped:
        //   * ClusterNodes, matched on NodeId — the owning ClusterNode row may not exist yet
        //     (first-boot bootstrap case), in which case ClusterId comes back null.
        //   * DriverInstanceResilienceStatuses, matched on (DriverInstanceId, HostName) —
        //     brand-new hosts have no sampled counters yet, so an absent row is reported as
        //     zero failures and a never-opened breaker.
        // Null checks stay as `!= null` conditionals because this is translated as an
        // expression tree, where pattern matching and `?.` are not available.
        var query =
            from status in db.DriverHostStatuses.AsNoTracking()
            join node in db.ClusterNodes.AsNoTracking()
                on status.NodeId equals node.NodeId into owningNodes
            from node in owningNodes.DefaultIfEmpty()
            join resilience in db.DriverInstanceResilienceStatuses.AsNoTracking()
                on new { status.DriverInstanceId, status.HostName }
                equals new { resilience.DriverInstanceId, resilience.HostName } into resilienceRows
            from resilience in resilienceRows.DefaultIfEmpty()
            orderby status.NodeId, status.DriverInstanceId, status.HostName
            select new HostStatusRow(
                status.NodeId,
                node != null ? node.ClusterId : null,
                status.DriverInstanceId,
                status.HostName,
                status.State,
                status.StateChangedUtc,
                status.LastSeenUtc,
                status.Detail,
                resilience != null ? resilience.ConsecutiveFailures : 0,
                resilience != null ? resilience.LastCircuitBreakerOpenUtc : null,
                resilience != null ? resilience.CurrentBulkheadDepth : 0,
                resilience != null ? resilience.LastRecycleUtc : null);

        return await query.ToListAsync(ct);
    }

    /// <summary>
    /// Staleness predicate — <c>true</c> when the time elapsed between the row's
    /// <c>LastSeenUtc</c> and the current UTC time exceeds <see cref="StaleThreshold"/>.
    /// </summary>
    /// <param name="row">Row to evaluate.</param>
    public static bool IsStale(HostStatusRow row)
    {
        TimeSpan sinceLastSeen = DateTime.UtcNow - row.LastSeenUtc;
        return sinceLastSeen > StaleThreshold;
    }

    /// <summary>
    /// Red-badge predicate — <c>true</c> once the host's consecutive-failure count has reached
    /// <see cref="FailureFlagThreshold"/>, so an operator can take notice before the breaker trips.
    /// </summary>
    /// <param name="row">Row to evaluate.</param>
    public static bool IsFlagged(HostStatusRow row)
    {
        int failures = row.ConsecutiveFailures;
        return failures >= FailureFlagThreshold;
    }
}
|