From 13d5a7968b9f469f1b8390e8a7f66945380adda1 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 19 Apr 2026 22:14:25 -0400 Subject: [PATCH] =?UTF-8?q?Admin=20RedundancyTab=20=E2=80=94=20per-cluster?= =?UTF-8?q?=20read-only=20topology=20view.=20Closes=20the=20UI=20slice=20o?= =?UTF-8?q?f=20task=20#149=20(Phase=206.3=20Stream=20E=20=E2=80=94=20Admin?= =?UTF-8?q?=20UI=20RedundancyTab=20+=20OpenTelemetry=20metrics=20+=20Signa?= =?UTF-8?q?lR);=20the=20OpenTelemetry=20metrics=20+=20RoleChanged=20Signal?= =?UTF-8?q?R=20push=20are=20split=20into=20new=20follow-up=20task=20#198?= =?UTF-8?q?=20because=20each=20is=20a=20structural=20add=20that=20deserves?= =?UTF-8?q?=20its=20own=20test=20matrix=20+=20NuGet-dep=20decision=20rathe?= =?UTF-8?q?r=20than=20riding=20this=20UI=20PR.=20New=20/clusters/{ClusterI?= =?UTF-8?q?d}=20Redundancy=20tab=20slotted=20between=20ACLs=20and=20Audit?= =?UTF-8?q?=20in=20the=20existing=20ClusterDetail=20tab=20bar.=20Shows=20e?= =?UTF-8?q?ach=20ClusterNode=20row=20in=20the=20cluster=20with=20columns?= =?UTF-8?q?=20Node=20/=20Role=20(Primary=20green,=20Secondary=20blue,=20St?= =?UTF-8?q?andalone=20primary-blue=20badge)=20/=20Host=20/=20OPC=20UA=20po?= =?UTF-8?q?rt=20/=20ServiceLevel=20base=20/=20ApplicationUri=20(text-break?= =?UTF-8?q?=20so=20the=20long=20urn:=20doesn't=20blow=20out=20the=20table)?= =?UTF-8?q?=20/=20Enabled=20badge=20/=20Last=20seen=20(relative=20age=20vi?= =?UTF-8?q?a=20the=20same=20FormatAge=20helper=20as=20Hosts.razor,=20with?= =?UTF-8?q?=20a=20yellow=20"Stale"=20chip=20once=20LastSeenAt=20crosses=20?= =?UTF-8?q?the=2030s=20threshold=20shared=20with=20HostStatusService.Stale?= =?UTF-8?q?Threshold=20=E2=80=94=20a=20missed=20heartbeat=20plus=20clock-s?= =?UTF-8?q?kew=20buffer).=20Four=20summary=20cards=20above=20the=20table?= =?UTF-8?q?=20=E2=80=94=20total=20Nodes,=20Primary=20count,=20Secondary=20?= =?UTF-8?q?count,=20Stale=20count.=20Two=20guard-rail=20alerts:=20(a)=20re?= 
=?UTF-8?q?d=20"No=20Primary=20or=20Standalone"=20when=20the=20cluster=20h?= =?UTF-8?q?as=20no=20authoritative=20write=20target=20(all=20rows=20are=20?= =?UTF-8?q?Secondaries=20=E2=80=94=20read-only=20until=20one=20is=20promot?= =?UTF-8?q?ed=20by=20the=20server-side=20RedundancyCoordinator=20apply-lea?= =?UTF-8?q?se=20flow);=20(b)=20red=20"Split-brain"=20when=20>1=20Primary?= =?UTF-8?q?=20exists=20=E2=80=94=20apply-lease=20enforcement=20at=20the=20?= =?UTF-8?q?coordinator=20level=20should=20have=20made=20this=20impossible,?= =?UTF-8?q?=20so=20the=20alert=20implies=20a=20hand-edited=20DB=20row=20+?= =?UTF-8?q?=20an=20investigation.=20New=20ClusterNodeService=20with=20List?= =?UTF-8?q?ByClusterAsync=20(ordered=20by=20ServiceLevelBase=20descending?= =?UTF-8?q?=20so=20Primary=20rows=20with=20higher=20base=20float=20to=20th?= =?UTF-8?q?e=20top)=20+=20a=20static=20IsStale=20predicate=20matching=20Ho?= =?UTF-8?q?stStatusService's=2030s=20convention.=20DI-registered=20alongsi?= =?UTF-8?q?de=20the=20existing=20scoped=20services=20in=20Program.cs.=20Wr?= =?UTF-8?q?ites=20(role=20swap,=20enable/disable)=20are=20deliberately=20a?= =?UTF-8?q?bsent=20from=20the=20service=20=E2=80=94=20they=20go=20through?= =?UTF-8?q?=20the=20RedundancyCoordinator=20apply-lease=20flow=20on=20the?= =?UTF-8?q?=20server=20side=20+=20direct=20DB=20mutation=20from=20Admin=20?= =?UTF-8?q?would=20race=20with=20it.=20New=20ClusterNodeServiceTests=20cov?= =?UTF-8?q?ering=20IsStale=20across=20null/recent/old=20LastSeenAt=20+=20L?= =?UTF-8?q?istByClusterAsync=20ordering=20+=20cluster=20filter.=204/4=20ne?= =?UTF-8?q?w=20tests=20passing;=20full=20Admin.Tests=20suite=2076/76=20(wa?= =?UTF-8?q?s=2072=20before=20this=20PR,=20+4).=20Admin=20project=20builds?= =?UTF-8?q?=200=20errors.=20Task=20#198=20captures=20the=20deferred=20work?= =?UTF-8?q?:=20(1)=20OpenTelemetry=20Meter=20for=20primary/secondary/stale?= =?UTF-8?q?=20counts=20+=20role=5Ftransition=20counter=20with=20from/to/no?= 
=?UTF-8?q?de=20tags=20+=20OTLP=20exporter=20config;=20(2)=20RoleChanged?= =?UTF-8?q?=20SignalR=20push=20=E2=80=94=20extend=20FleetStatusPoller=20to?= =?UTF-8?q?=20detect=20RedundancyRole=20changes=20on=20ClusterNode=20rows?= =?UTF-8?q?=20+=20emit=20a=20RoleChanged=20hub=20message=20so=20the=20Redu?= =?UTF-8?q?ndancyTab=20refreshes=20instantly=20instead=20of=20on-page-load?= =?UTF-8?q?=20polling.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Pages/Clusters/ClusterDetail.razor | 5 + .../Pages/Clusters/RedundancyTab.razor | 136 ++++++++++++++++++ src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs | 1 + .../Services/ClusterNodeService.cs | 28 ++++ .../ClusterNodeServiceTests.cs | 78 ++++++++++ 5 files changed, 248 insertions(+) create mode 100644 src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor create mode 100644 src/ZB.MOM.WW.OtOpcUa.Admin/Services/ClusterNodeService.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/ClusterNodeServiceTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/ClusterDetail.razor b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/ClusterDetail.razor index 781af6d..72cd9ba 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/ClusterDetail.razor +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/ClusterDetail.razor @@ -52,6 +52,7 @@ else + @@ -92,6 +93,10 @@ else { } + else if (_tab == "redundancy") + { + + } else if (_tab == "audit") { diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor new file mode 100644 index 0000000..068f059 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor @@ -0,0 +1,136 @@ +@using ZB.MOM.WW.OtOpcUa.Admin.Services +@using ZB.MOM.WW.OtOpcUa.Configuration.Entities +@using 
ZB.MOM.WW.OtOpcUa.Configuration.Enums +@inject ClusterNodeService NodeSvc + +

Redundancy topology

+

+ One row per ClusterNode in this cluster. Role, ApplicationUri, + and ServiceLevelBase are authored separately; the Admin UI shows them read-only + here so operators can confirm the published topology without touching it. LastSeen older than + @((int)ClusterNodeService.StaleThreshold.TotalSeconds)s is flagged Stale — the node has + stopped heart-beating and is likely down. Role swap goes through the server-side + RedundancyCoordinator apply-lease flow, not direct DB edits. +

+ +@if (_nodes is null) +{ +

Loading…

+} +else if (_nodes.Count == 0) +{ +
+ No ClusterNode rows for this cluster. The server process needs at least one entry + (with a non-blank ApplicationUri) before it can start up per OPC UA spec. +
+} +else +{ + var primaries = _nodes.Count(n => n.RedundancyRole == RedundancyRole.Primary); + var secondaries = _nodes.Count(n => n.RedundancyRole == RedundancyRole.Secondary); + var standalone = _nodes.Count(n => n.RedundancyRole == RedundancyRole.Standalone); + var staleCount = _nodes.Count(ClusterNodeService.IsStale); + +
+
+
Nodes
+
@_nodes.Count
+
+
+
Primary
+
@primaries
+
+
+
Secondary
+
@secondaries
+
+
+
Stale
+
@staleCount
+
+
+ + @if (primaries == 0 && standalone == 0) + { +
+ No Primary or Standalone node — the cluster has no authoritative write target. Secondaries + stay read-only until one of them gets promoted via RedundancyCoordinator. +
+ } + else if (primaries > 1) + { +
+ Split-brain: @primaries nodes claim the Primary role. Apply-lease + enforcement should have made this impossible at the coordinator level. Investigate + immediately — one of the rows was likely hand-edited. +
+ } + + + + + + + + + + + + + + + + @foreach (var n in _nodes) + { + + + + + + + + + + + } + +
NodeRoleHostOPC UA portServiceLevel baseApplicationUriEnabledLast seen
@n.NodeId@n.RedundancyRole@n.Host@n.OpcUaPort@n.ServiceLevelBase@n.ApplicationUri + @if (n.Enabled) { Enabled } + else { Disabled } + + @(n.LastSeenAt is null ? "never" : FormatAge(n.LastSeenAt.Value)) + @if (ClusterNodeService.IsStale(n)) { Stale } +
+}
+
+@code {
+    /// <summary>Identifier of the cluster whose ClusterNode rows this tab lists.</summary>
+    [Parameter] public string ClusterId { get; set; } = string.Empty;
+
+    // Null until the first load completes; the markup renders the loading placeholder while it is null.
+    private List<ClusterNode>? _nodes;
+
+    /// <summary>Reloads the node list every time the component's parameters are set.</summary>
+    protected override async Task OnParametersSetAsync()
+    {
+        _nodes = await NodeSvc.ListByClusterAsync(ClusterId, CancellationToken.None);
+    }
+
+    /// <summary>CSS class for a node's table row; the stale highlight takes precedence over the disabled one.</summary>
+    private static string RowClass(ClusterNode n) =>
+        ClusterNodeService.IsStale(n) ? "table-warning" :
+        !n.Enabled ? "table-secondary" : "";
+
+    /// <summary>Badge CSS class for a redundancy role; unrecognised values fall back to "bg-secondary".</summary>
+    private static string RoleBadge(RedundancyRole r) => r switch
+    {
+        RedundancyRole.Primary => "bg-success",
+        RedundancyRole.Secondary => "bg-info",
+        RedundancyRole.Standalone => "bg-primary",
+        _ => "bg-secondary",
+    };
+
+    /// <summary>
+    /// Formats <paramref name="t"/> as an age relative to <c>DateTime.UtcNow</c> ("12s ago", "5m ago",
+    /// "3h ago"), switching to an absolute "yyyy-MM-dd HH:mm UTC" timestamp once it is 24 hours or older.
+    /// </summary>
+    private static string FormatAge(DateTime t)
+    {
+        var age = DateTime.UtcNow - t;
+        // A timestamp slightly ahead of this machine's clock would otherwise render as e.g. "-3s ago".
+        if (age < TimeSpan.Zero) age = TimeSpan.Zero;
+        if (age.TotalSeconds < 60) return $"{(int)age.TotalSeconds}s ago";
+        if (age.TotalMinutes < 60) return $"{(int)age.TotalMinutes}m ago";
+        if (age.TotalHours < 24) return $"{(int)age.TotalHours}h ago";
+        // Fixed ISO-style layout: format with the invariant culture so the calendar and separators
+        // do not vary with the server's current culture.
+        return t.ToString("yyyy-MM-dd HH:mm 'UTC'", System.Globalization.CultureInfo.InvariantCulture);
+    }
+}
diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs
index ef3d3e7..a0fe448 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs
@@ -48,6 +48,7 @@ builder.Services.AddScoped();
 builder.Services.AddScoped();
 builder.Services.AddScoped();
 builder.Services.AddScoped();
+builder.Services.AddScoped<ClusterNodeService>();
 builder.Services.AddScoped();
 builder.Services.AddScoped();
diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/ClusterNodeService.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/ClusterNodeService.cs
new file mode 100644
index 0000000..1729a54
--- /dev/null
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/ClusterNodeService.cs
@@ -0,0 +1,36 @@
+using Microsoft.EntityFrameworkCore;
+using ZB.MOM.WW.OtOpcUa.Configuration;
+using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
+
+namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
+
+/// <summary>
+/// Read-side service for ClusterNode rows + their cluster-scoped redundancy view. Consumed
+/// by the RedundancyTab on the cluster detail page. Writes (role swap, node enable/disable)
+/// are not supported here — role swap happens through the RedundancyCoordinator apply-lease
+/// flow on the server side and would conflict with any direct DB mutation from Admin.
+/// </summary>
+public sealed class ClusterNodeService(OtOpcUaConfigDbContext db)
+{
+    /// <summary>Stale-threshold matching HostStatusService.StaleThreshold — 30s of clock
+    /// tolerance covers a missed heartbeat plus publisher GC pauses.</summary>
+    public static readonly TimeSpan StaleThreshold = TimeSpan.FromSeconds(30);
+
+    /// <summary>
+    /// Returns the ClusterNode rows whose ClusterId equals <paramref name="clusterId"/> as untracked
+    /// entities, ordered by ServiceLevelBase descending and then by NodeId.
+    /// </summary>
+    public Task<List<ClusterNode>> ListByClusterAsync(string clusterId, CancellationToken ct) =>
+        db.ClusterNodes.AsNoTracking()
+            .Where(n => n.ClusterId == clusterId)
+            .OrderByDescending(n => n.ServiceLevelBase)
+            .ThenBy(n => n.NodeId)
+            .ToListAsync(ct);
+
+    /// <summary>
+    /// True when the node has no <c>LastSeenAt</c> value at all, or when that value is more than
+    /// <see cref="StaleThreshold"/> behind <c>DateTime.UtcNow</c>.
+    /// </summary>
+    public static bool IsStale(ClusterNode node) =>
+        node.LastSeenAt is null || DateTime.UtcNow - node.LastSeenAt.Value > StaleThreshold;
+}
diff --git a/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/ClusterNodeServiceTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/ClusterNodeServiceTests.cs
new file mode 100644
index 0000000..ad3f72b
--- /dev/null
+++ b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/ClusterNodeServiceTests.cs
@@ -0,0 +1,88 @@
+using Microsoft.EntityFrameworkCore;
+using Shouldly;
+using Xunit;
+using ZB.MOM.WW.OtOpcUa.Admin.Services;
+using ZB.MOM.WW.OtOpcUa.Configuration;
+using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
+using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
+
+namespace ZB.MOM.WW.OtOpcUa.Admin.Tests;
+
+/// <summary>
+/// Unit tests for <see cref="ClusterNodeService"/>: the static IsStale predicate and the
+/// ListByClusterAsync cluster filter + ordering, run against an in-memory database.
+/// </summary>
+[Trait("Category", "Unit")]
+public sealed class ClusterNodeServiceTests
+{
+    [Fact]
+    public void IsStale_NullLastSeen_Returns_True()
+    {
+        var node = NewNode("A", RedundancyRole.Primary, lastSeenAt: null);
+        ClusterNodeService.IsStale(node).ShouldBeTrue();
+    }
+
+    [Fact]
+    public void IsStale_RecentLastSeen_Returns_False()
+    {
+        var node = NewNode("A", RedundancyRole.Primary, lastSeenAt: DateTime.UtcNow.AddSeconds(-5));
+        ClusterNodeService.IsStale(node).ShouldBeFalse();
+    }
+
+    [Fact]
+    public void IsStale_Old_LastSeen_Returns_True()
+    {
+        var node = NewNode("A", RedundancyRole.Primary,
+            lastSeenAt: DateTime.UtcNow - ClusterNodeService.StaleThreshold - TimeSpan.FromSeconds(1));
+        ClusterNodeService.IsStale(node).ShouldBeTrue();
+    }
+
+    [Fact]
+    public async Task ListByClusterAsync_OrdersByServiceLevelBase_Descending_Then_NodeId()
+    {
+        using var ctx = NewContext();
+        // Inserted out of order; "C-high" ties with "A-high" on ServiceLevelBase so the NodeId
+        // tie-breaker promised by the test name is actually exercised.
+        ctx.ClusterNodes.AddRange(
+            NewNode("B-low", RedundancyRole.Secondary, serviceLevelBase: 150, clusterId: "c1"),
+            NewNode("C-high", RedundancyRole.Secondary, serviceLevelBase: 200, clusterId: "c1"),
+            NewNode("A-high", RedundancyRole.Primary, serviceLevelBase: 200, clusterId: "c1"),
+            NewNode("other-cluster", RedundancyRole.Primary, serviceLevelBase: 200, clusterId: "c2"));
+        await ctx.SaveChangesAsync();
+
+        var svc = new ClusterNodeService(ctx);
+        var rows = await svc.ListByClusterAsync("c1", CancellationToken.None);
+
+        rows.Count.ShouldBe(3); // the "c2" row is filtered out
+        rows[0].NodeId.ShouldBe("A-high"); // higher ServiceLevelBase first
+        rows[1].NodeId.ShouldBe("C-high"); // same ServiceLevelBase as A-high, ordered by NodeId
+        rows[2].NodeId.ShouldBe("B-low");
+    }
+
+    /// <summary>Builds a ClusterNode from the given values; Host and ApplicationUri are derived from the node id.</summary>
+    private static ClusterNode NewNode(
+        string nodeId,
+        RedundancyRole role,
+        DateTime? lastSeenAt = null,
+        int serviceLevelBase = 200,
+        string clusterId = "c1") => new()
+    {
+        NodeId = nodeId,
+        ClusterId = clusterId,
+        RedundancyRole = role,
+        Host = $"{nodeId}.example",
+        ApplicationUri = $"urn:{nodeId}",
+        ServiceLevelBase = (byte)serviceLevelBase,
+        LastSeenAt = lastSeenAt,
+        CreatedBy = "test",
+    };
+
+    /// <summary>Creates a context over a uniquely named in-memory database so tests do not share rows.</summary>
+    private static OtOpcUaConfigDbContext NewContext()
+    {
+        var opts = new DbContextOptionsBuilder<OtOpcUaConfigDbContext>()
+            .UseInMemoryDatabase(Guid.NewGuid().ToString())
+            .Options;
+        return new OtOpcUaConfigDbContext(opts);
+    }
+}