Files
lmxopcua/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor
Joseph Doherty 1f3343e61f OpenTelemetry redundancy metrics + RoleChanged SignalR push. Closes instrumentation + live-push slices of task #198; the exporter wiring (OTLP vs Prometheus package decision) is split to new task #201 because the collector/scrape-endpoint choice is a fleet-ops decision that deserves its own PR rather than hardcoded here. New RedundancyMetrics class (Singleton-registered in DI) owning a System.Diagnostics.Metrics.Meter("ZB.MOM.WW.OtOpcUa.Redundancy", "1.0.0"). Three ObservableGauge instruments — otopcua.redundancy.primary_count / secondary_count / stale_count — all tagged by cluster.id, populated by SetClusterCounts(clusterId, primary, secondary, stale) which the poller calls at the tail of every tick; ObservableGauge callbacks snapshot the last value set under a lock so the reader (OTel collector, dotnet-counters) sees consistent tuples. One Counter — otopcua.redundancy.role_transition — tagged cluster.id, node.id, from_role, to_role; ideal for tracking "how often does Cluster-X failover" + "which node transitions most" aggregate queries. In-box Metrics API means zero NuGet dep here — the exporter PR adds OpenTelemetry.Extensions.Hosting + OpenTelemetry.Exporter.OpenTelemetryProtocol or OpenTelemetry.Exporter.Prometheus.AspNetCore to actually ship the data somewhere. FleetStatusPoller extended with role-change detection. Its PollOnceAsync now pulls ClusterNode rows alongside the existing ClusterNodeGenerationState scan, and a new PollRolesAsync walks every node comparing RedundancyRole to the _lastRole cache. On change: records the transition to RedundancyMetrics + emits a RoleChanged SignalR message to both FleetStatusHub.GroupName(cluster) + FleetStatusHub.FleetGroup so cluster-scoped + fleet-wide subscribers both see it. First observation per node is a bootstrap (cache fill) + NOT a transition — avoids spurious churn on service startup or pod restart. UpdateClusterGauges groups nodes by cluster + sets the three gauge values, using ClusterNodeService.StaleThreshold (shared 30s convention) for staleness so the /hosts page + the gauge agree. RoleChangedMessage record lives alongside NodeStateChangedMessage in FleetStatusPoller.cs. RedundancyTab.razor subscribes to the fleet-status hub on first parameters-set, filters RoleChanged events to the current cluster, reloads the node list + paints a blue info banner ("Role changed on node-a: Primary → Secondary at HH:mm:ss UTC") so operators see the transition without needing to poll-refresh the page. IAsyncDisposable closes the connection on tab swap-away. Two new RedundancyMetricsTests covering RecordRoleTransition tag emission (cluster.id + node.id + from_role + to_role all flow through the MeterListener callback) + ObservableGauge snapshot for two clusters (assert primary_count=1 for c1, stale_count=1 for c2). Existing FleetStatusPollerTests ctor-line updated to pass a RedundancyMetrics instance; all tests still pass. Full Admin.Tests suite 87/87 passing (was 85, +2). Admin project builds 0 errors. Task #201 captures the exporter-wiring follow-up — OpenTelemetry.Extensions.Hosting + OTLP vs Prometheus + /metrics endpoint decision, driven by fleet-ops infra direction.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 23:16:09 -04:00

176 lines
6.8 KiB
Plaintext

@using Microsoft.AspNetCore.SignalR.Client
@using ZB.MOM.WW.OtOpcUa.Admin.Hubs
@using ZB.MOM.WW.OtOpcUa.Admin.Services
@using ZB.MOM.WW.OtOpcUa.Configuration.Entities
@using ZB.MOM.WW.OtOpcUa.Configuration.Enums
@inject ClusterNodeService NodeSvc
@inject NavigationManager Nav
@implements IAsyncDisposable
<h4>Redundancy topology</h4>
@if (_roleChangedBanner is not null)
{
<div class="alert alert-info small mb-2">@_roleChangedBanner</div>
}
<p class="text-muted small">
One row per <code>ClusterNode</code> in this cluster. Role, <code>ApplicationUri</code>,
and <code>ServiceLevelBase</code> are authored separately; the Admin UI shows them read-only
here so operators can confirm the published topology without touching it. LastSeen older than
@((int)ClusterNodeService.StaleThreshold.TotalSeconds)s is flagged Stale — the node has
stopped heart-beating and is likely down. Role swap goes through the server-side
<code>RedundancyCoordinator</code> apply-lease flow, not direct DB edits.
</p>
@if (_nodes is null)
{
<p>Loading…</p>
}
else if (_nodes.Count == 0)
{
<div class="alert alert-warning">
No ClusterNode rows for this cluster. The server process needs at least one entry
(with a non-blank <code>ApplicationUri</code>) before it can start up per OPC UA spec.
</div>
}
else
{
var primaries = _nodes.Count(n => n.RedundancyRole == RedundancyRole.Primary);
var secondaries = _nodes.Count(n => n.RedundancyRole == RedundancyRole.Secondary);
var standalone = _nodes.Count(n => n.RedundancyRole == RedundancyRole.Standalone);
var staleCount = _nodes.Count(ClusterNodeService.IsStale);
<div class="row g-3 mb-4">
<div class="col-md-3"><div class="card"><div class="card-body">
<h6 class="text-muted mb-1">Nodes</h6>
<div class="fs-3">@_nodes.Count</div>
</div></div></div>
<div class="col-md-3"><div class="card border-success"><div class="card-body">
<h6 class="text-muted mb-1">Primary</h6>
<div class="fs-3 text-success">@primaries</div>
</div></div></div>
<div class="col-md-3"><div class="card border-info"><div class="card-body">
<h6 class="text-muted mb-1">Secondary</h6>
<div class="fs-3 text-info">@secondaries</div>
</div></div></div>
<div class="col-md-3"><div class="card @(staleCount > 0 ? "border-warning" : "")"><div class="card-body">
<h6 class="text-muted mb-1">Stale</h6>
<div class="fs-3 @(staleCount > 0 ? "text-warning" : "")">@staleCount</div>
</div></div></div>
</div>
@if (primaries == 0 && standalone == 0)
{
<div class="alert alert-danger small mb-3">
No Primary or Standalone node — the cluster has no authoritative write target. Secondaries
stay read-only until one of them gets promoted via <code>RedundancyCoordinator</code>.
</div>
}
else if (primaries > 1)
{
<div class="alert alert-danger small mb-3">
<strong>Split-brain:</strong> @primaries nodes claim the Primary role. Apply-lease
enforcement should have made this impossible at the coordinator level. Investigate
immediately — one of the rows was likely hand-edited.
</div>
}
<table class="table table-sm table-hover align-middle">
<thead>
<tr>
<th>Node</th>
<th>Role</th>
<th>Host</th>
<th class="text-end">OPC UA port</th>
<th class="text-end">ServiceLevel base</th>
<th>ApplicationUri</th>
<th>Enabled</th>
<th>Last seen</th>
</tr>
</thead>
<tbody>
@foreach (var n in _nodes)
{
<tr class="@RowClass(n)">
<td><code>@n.NodeId</code></td>
<td><span class="badge @RoleBadge(n.RedundancyRole)">@n.RedundancyRole</span></td>
<td>@n.Host</td>
<td class="text-end"><code>@n.OpcUaPort</code></td>
<td class="text-end">@n.ServiceLevelBase</td>
<td class="small text-break"><code>@n.ApplicationUri</code></td>
<td>
@if (n.Enabled) { <span class="badge bg-success">Enabled</span> }
else { <span class="badge bg-secondary">Disabled</span> }
</td>
<td class="small @(ClusterNodeService.IsStale(n) ? "text-warning fw-bold" : "")">
@(n.LastSeenAt is null ? "never" : FormatAge(n.LastSeenAt.Value))
@if (ClusterNodeService.IsStale(n)) { <span class="badge bg-warning text-dark ms-1">Stale</span> }
</td>
</tr>
}
</tbody>
</table>
}
@code {
[Parameter] public string ClusterId { get; set; } = string.Empty;
private List<ClusterNode>? _nodes;
private HubConnection? _hub;
private string? _roleChangedBanner;
protected override async Task OnParametersSetAsync()
{
_nodes = await NodeSvc.ListByClusterAsync(ClusterId, CancellationToken.None);
if (_hub is null) await ConnectHubAsync();
}
private async Task ConnectHubAsync()
{
_hub = new HubConnectionBuilder()
.WithUrl(Nav.ToAbsoluteUri("/hubs/fleet-status"))
.WithAutomaticReconnect()
.Build();
_hub.On<RoleChangedMessage>("RoleChanged", async msg =>
{
if (msg.ClusterId != ClusterId) return;
_roleChangedBanner = $"Role changed on {msg.NodeId}: {msg.FromRole} → {msg.ToRole} at {msg.ObservedAtUtc:HH:mm:ss 'UTC'}";
_nodes = await NodeSvc.ListByClusterAsync(ClusterId, CancellationToken.None);
await InvokeAsync(StateHasChanged);
});
await _hub.StartAsync();
await _hub.SendAsync("SubscribeCluster", ClusterId);
}
public async ValueTask DisposeAsync()
{
if (_hub is not null)
{
await _hub.DisposeAsync();
_hub = null;
}
}
private static string RowClass(ClusterNode n) =>
ClusterNodeService.IsStale(n) ? "table-warning" :
!n.Enabled ? "table-secondary" : "";
private static string RoleBadge(RedundancyRole r) => r switch
{
RedundancyRole.Primary => "bg-success",
RedundancyRole.Secondary => "bg-info",
RedundancyRole.Standalone => "bg-primary",
_ => "bg-secondary",
};
private static string FormatAge(DateTime t)
{
var age = DateTime.UtcNow - t;
if (age.TotalSeconds < 60) return $"{(int)age.TotalSeconds}s ago";
if (age.TotalMinutes < 60) return $"{(int)age.TotalMinutes}m ago";
if (age.TotalHours < 24) return $"{(int)age.TotalHours}h ago";
return t.ToString("yyyy-MM-dd HH:mm 'UTC'");
}
}