2b33b64a58
- Admin-010: vendor Bootstrap 5.3.3 (CSS + JS bundle + maps + provenance README) under wwwroot/lib/bootstrap and reference local paths from App.razor — Admin no longer pulls Bootstrap from jsDelivr. - Admin-011: swap FleetStatusPoller's three plain dictionaries for ConcurrentDictionary so ResetCache can't race a poll tick. - Admin-012: drop the EquipmentId column from EquipmentCsvImporter (per admin-ui.md — equipment id is system-derived from EquipmentUuid); EquipmentImportBatchService and the textarea placeholder updated to match. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
197 lines
8.6 KiB
C#
197 lines
8.6 KiB
C#
using System.Collections.Concurrent;
|
|
using Microsoft.AspNetCore.SignalR;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using ZB.MOM.WW.OtOpcUa.Admin.Services;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
|
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Admin.Hubs;
|
|
|
|
/// <summary>
|
|
/// Polls <c>ClusterNodeGenerationState</c> every <see cref="PollInterval"/> and publishes
|
|
/// per-node deltas to <see cref="FleetStatusHub"/>. Also raises sticky
|
|
/// <see cref="AlertMessage"/>s on transitions into <c>Failed</c>.
|
|
/// </summary>
|
|
public sealed class FleetStatusPoller(
|
|
IServiceScopeFactory scopeFactory,
|
|
IHubContext<FleetStatusHub> fleetHub,
|
|
IHubContext<AlertHub> alertHub,
|
|
ILogger<FleetStatusPoller> logger,
|
|
RedundancyMetrics redundancyMetrics) : BackgroundService
|
|
{
|
|
/// <summary>Delay between two poll ticks; defaults to five seconds.</summary>
public TimeSpan PollInterval { get; init; } = TimeSpan.FromSeconds(5);

// Admin-011: these caches are ConcurrentDictionary so the steady-state poll path and
// ResetCache() (exposed `internal` for tests) can race without throwing
// InvalidOperationException or corrupting state. A plain Dictionary<,> is not safe
// under concurrent clear + mutate.
// All three are keyed by string and compared ordinally; the comparer is spelled out on
// each of them so the declarations agree with one another.

/// <summary>Last published node-state snapshot, keyed by node id.</summary>
private readonly ConcurrentDictionary<string, NodeStateSnapshot> _last = new(StringComparer.Ordinal);

/// <summary>Last observed redundancy role per key.</summary>
private readonly ConcurrentDictionary<string, RedundancyRole> _lastRole = new(StringComparer.Ordinal);

/// <summary>Last observed resilience snapshot per key.</summary>
private readonly ConcurrentDictionary<string, ResilienceSnapshot> _lastResilience = new(StringComparer.Ordinal);
|
|
|
|
/// <summary>
/// Background loop: runs one <see cref="PollOnceAsync"/> tick, waits
/// <see cref="PollInterval"/>, and repeats until <paramref name="stoppingToken"/> is cancelled.
/// </summary>
/// <remarks>
/// A failing tick is logged at warning level and the loop continues with the next tick.
/// An <see cref="OperationCanceledException"/> is only allowed to escape when
/// <paramref name="stoppingToken"/> has actually been cancelled; a cancellation that
/// originates anywhere else is treated like any other tick failure, so it cannot
/// silently end polling.
/// </remarks>
/// <param name="stoppingToken">Token that signals the loop to stop.</param>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    logger.LogInformation("FleetStatusPoller starting — interval {Interval}s", PollInterval.TotalSeconds);

    while (!stoppingToken.IsCancellationRequested)
    {
        try
        {
            await PollOnceAsync(stoppingToken);
        }
        // Let cancellation through only when shutdown was requested; everything else
        // (including a stray OperationCanceledException) is logged and retried next tick.
        catch (Exception ex) when (ex is not OperationCanceledException || !stoppingToken.IsCancellationRequested)
        {
            logger.LogWarning(ex, "FleetStatusPoller tick failed");
        }

        try
        {
            await Task.Delay(PollInterval, stoppingToken);
        }
        catch (OperationCanceledException)
        {
            // Cancelled while waiting for the next tick — leave the loop quietly.
            break;
        }
    }
}
|
|
|
|
/// <summary>
/// Runs a single poll tick inside a fresh DI scope.
/// </summary>
/// <remarks>
/// Order of work: load the cluster nodes, run the role pass, refresh the cluster gauges,
/// run the resilience pass, then compare every generation-state row against the cached
/// snapshot for its node. A row that is new or differs from the cache is stored and sent
/// as <c>NodeStateChanged</c> to the node's cluster group and to the fleet group. When the
/// new status is <c>Failed</c> and the previous one was not (or there was none), an
/// <c>AlertRaised</c> message is sent to the all-alerts group as well.
/// </remarks>
/// <param name="ct">Cancellation token passed to every query and hub send.</param>
internal async Task PollOnceAsync(CancellationToken ct)
{
    using var scope = scopeFactory.CreateScope();
    var db = scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>();

    var clusterNodes = await db.ClusterNodes.AsNoTracking().ToListAsync(ct);
    await PollRolesAsync(clusterNodes, ct);
    UpdateClusterGauges(clusterNodes);
    await PollResilienceAsync(db, ct);

    // Pair each generation-state row with the cluster id of the node it belongs to.
    var stateRows = await db.ClusterNodeGenerationStates.AsNoTracking()
        .Join(
            db.ClusterNodes.AsNoTracking(),
            state => state.NodeId,
            node => node.NodeId,
            (state, node) => new { State = state, node.ClusterId })
        .ToListAsync(ct);

    foreach (var row in stateRows)
    {
        var state = row.State;
        var current = new NodeStateSnapshot(
            state.NodeId,
            row.ClusterId,
            state.CurrentGenerationId,
            state.LastAppliedStatus?.ToString(),
            state.LastAppliedError,
            state.LastAppliedAt,
            state.LastSeenAt);

        var known = _last.TryGetValue(state.NodeId, out var previous);
        var changed = !known || previous != current;
        if (!changed)
        {
            continue;
        }

        // Remember the new snapshot before publishing it.
        _last[state.NodeId] = current;

        var change = new NodeStateChangedMessage(
            current.NodeId,
            current.ClusterId,
            current.GenerationId,
            current.Status,
            current.Error,
            current.AppliedAt,
            current.SeenAt);

        var clusterGroup = fleetHub.Clients.Group(FleetStatusHub.GroupName(current.ClusterId));
        await clusterGroup.SendAsync("NodeStateChanged", change, ct);

        var fleetGroup = fleetHub.Clients.Group(FleetStatusHub.FleetGroup);
        await fleetGroup.SendAsync("NodeStateChanged", change, ct);

        // Alert only on the transition into Failed, not on every changed row that is still Failed.
        var enteredFailed = current.Status == "Failed" && (!known || previous.Status != "Failed");
        if (!enteredFailed)
        {
            continue;
        }

        var alert = new AlertMessage(
            AlertId: $"{current.NodeId}:apply-failed",
            Severity: "error",
            Title: $"Apply failed on {current.NodeId}",
            Detail: current.Error ?? "(no detail)",
            RaisedAtUtc: DateTime.UtcNow,
            ClusterId: current.ClusterId,
            NodeId: current.NodeId);

        var alertGroup = alertHub.Clients.Group(AlertHub.AllAlertsGroup);
        await alertGroup.SendAsync("AlertRaised", alert, ct);
    }
}
|
|
|
|
/// <summary>
/// Test hook: empties the node-state, role and resilience caches.
/// </summary>
/// <remarks>
/// With the node-state cache empty, the next <see cref="PollOnceAsync"/> finds no prior
/// snapshot for any node and therefore publishes every row again.
/// </remarks>
internal void ResetCache()
{
    // Each cache is cleared on its own; the three are not reset as one atomic step.
    _last.Clear();
    _lastRole.Clear();
    _lastResilience.Clear();
}
|
|
|
|
/// <summary>
|
|
/// Phase 6.1 Stream E.2 — detect deltas on the <c>DriverInstanceResilienceStatus</c>
|
|
/// table and push <see cref="ResilienceStatusChangedMessage"/> on the fleet group so
|
|
/// the Admin <c>/hosts</c> page can upsert the matching row without waiting for the
|
|
/// 10-second poll cycle. Keyed on <c>(DriverInstanceId, HostName)</c>; same key the
|
|
/// UI uses to join status rows.
|
|
/// </summary>
|
|
private async Task PollResilienceAsync(OtOpcUaConfigDbContext db, CancellationToken ct)
|
|
{
|
|
var rows = await db.DriverInstanceResilienceStatuses.AsNoTracking().ToListAsync(ct);
|
|
foreach (var r in rows)
|
|
{
|
|
var key = $"{r.DriverInstanceId} |