diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor
index 068f059..c562415 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor
@@ -1,9 +1,17 @@
+@using Microsoft.AspNetCore.SignalR.Client
+@using ZB.MOM.WW.OtOpcUa.Admin.Hubs
@using ZB.MOM.WW.OtOpcUa.Admin.Services
@using ZB.MOM.WW.OtOpcUa.Configuration.Entities
@using ZB.MOM.WW.OtOpcUa.Configuration.Enums
@inject ClusterNodeService NodeSvc
+@inject NavigationManager Nav
+@implements IAsyncDisposable
Redundancy topology
+@if (_roleChangedBanner is not null)
+{
+ @_roleChangedBanner
+}
One row per ClusterNode in this cluster. Role, ApplicationUri,
and ServiceLevelBase are authored separately; the Admin UI shows them read-only
@@ -107,10 +115,41 @@ else
[Parameter] public string ClusterId { get; set; } = string.Empty;
private List? _nodes;
+ private HubConnection? _hub;
+ private string? _roleChangedBanner;
protected override async Task OnParametersSetAsync()
{
_nodes = await NodeSvc.ListByClusterAsync(ClusterId, CancellationToken.None);
+ if (_hub is null) await ConnectHubAsync();
+ }
+
+ private async Task ConnectHubAsync()
+ {
+ _hub = new HubConnectionBuilder()
+ .WithUrl(Nav.ToAbsoluteUri("/hubs/fleet-status"))
+ .WithAutomaticReconnect()
+ .Build();
+
+ _hub.On("RoleChanged", async msg =>
+ {
+ if (msg.ClusterId != ClusterId) return;
+ _roleChangedBanner = $"Role changed on {msg.NodeId}: {msg.FromRole} → {msg.ToRole} at {msg.ObservedAtUtc:HH:mm:ss 'UTC'}";
+ _nodes = await NodeSvc.ListByClusterAsync(ClusterId, CancellationToken.None);
+ await InvokeAsync(StateHasChanged);
+ });
+
+ await _hub.StartAsync();
+ await _hub.SendAsync("SubscribeCluster", ClusterId);
+ }
+
+ public async ValueTask DisposeAsync()
+ {
+ if (_hub is not null)
+ {
+ await _hub.DisposeAsync();
+ _hub = null;
+ }
}
private static string RowClass(ClusterNode n) =>
diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs
index bead926..cd617d1 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs
@@ -1,7 +1,9 @@
using Microsoft.AspNetCore.SignalR;
using Microsoft.EntityFrameworkCore;
+using ZB.MOM.WW.OtOpcUa.Admin.Services;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
+using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
namespace ZB.MOM.WW.OtOpcUa.Admin.Hubs;
@@ -14,11 +16,13 @@ public sealed class FleetStatusPoller(
IServiceScopeFactory scopeFactory,
IHubContext fleetHub,
IHubContext alertHub,
- ILogger logger) : BackgroundService
+ ILogger logger,
+ RedundancyMetrics redundancyMetrics) : BackgroundService
{
public TimeSpan PollInterval { get; init; } = TimeSpan.FromSeconds(5);
private readonly Dictionary _last = new();
+ private readonly Dictionary _lastRole = new(StringComparer.Ordinal);
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
@@ -42,6 +46,10 @@ public sealed class FleetStatusPoller(
using var scope = scopeFactory.CreateScope();
var db = scope.ServiceProvider.GetRequiredService();
+ var nodes = await db.ClusterNodes.AsNoTracking().ToListAsync(ct);
+ await PollRolesAsync(nodes, ct);
+ UpdateClusterGauges(nodes);
+
var rows = await db.ClusterNodeGenerationStates.AsNoTracking()
.Join(db.ClusterNodes.AsNoTracking(), s => s.NodeId, n => n.NodeId, (s, n) => new { s, n.ClusterId })
.ToListAsync(ct);
@@ -85,9 +93,63 @@ public sealed class FleetStatusPoller(
}
/// Exposed for tests — forces a snapshot reset so stub data re-seeds.
- internal void ResetCache() => _last.Clear();
+ internal void ResetCache()
+ {
+ _last.Clear();
+ _lastRole.Clear();
+ }
+
+ private async Task PollRolesAsync(IReadOnlyList nodes, CancellationToken ct)
+ {
+ foreach (var n in nodes)
+ {
+ var hadPrior = _lastRole.TryGetValue(n.NodeId, out var priorRole);
+ if (hadPrior && priorRole == n.RedundancyRole) continue;
+
+ _lastRole[n.NodeId] = n.RedundancyRole;
+ if (!hadPrior) continue; // first-observation bootstrap — not a transition
+
+ redundancyMetrics.RecordRoleTransition(
+ clusterId: n.ClusterId, nodeId: n.NodeId,
+ fromRole: priorRole.ToString(), toRole: n.RedundancyRole.ToString());
+
+ var msg = new RoleChangedMessage(
+ ClusterId: n.ClusterId, NodeId: n.NodeId,
+ FromRole: priorRole.ToString(), ToRole: n.RedundancyRole.ToString(),
+ ObservedAtUtc: DateTime.UtcNow);
+
+ await fleetHub.Clients.Group(FleetStatusHub.GroupName(n.ClusterId))
+ .SendAsync("RoleChanged", msg, ct);
+ await fleetHub.Clients.Group(FleetStatusHub.FleetGroup)
+ .SendAsync("RoleChanged", msg, ct);
+ }
+ }
+
+ private void UpdateClusterGauges(IReadOnlyList nodes)
+ {
+ var staleCutoff = DateTime.UtcNow - Services.ClusterNodeService.StaleThreshold;
+ foreach (var group in nodes.GroupBy(n => n.ClusterId))
+ {
+ var primary = group.Count(n => n.RedundancyRole == RedundancyRole.Primary);
+ var secondary = group.Count(n => n.RedundancyRole == RedundancyRole.Secondary);
+ var stale = group.Count(n => n.LastSeenAt is null || n.LastSeenAt.Value < staleCutoff);
+ redundancyMetrics.SetClusterCounts(group.Key, primary, secondary, stale);
+ }
+ }
private readonly record struct NodeStateSnapshot(
string NodeId, string ClusterId, long? GenerationId,
string? Status, string? Error, DateTime? AppliedAt, DateTime? SeenAt);
}
+
+///
+/// Pushed by when it observes a change in
+/// . Consumed by the Admin RedundancyTab to trigger
+/// an instant reload instead of waiting for the next on-parameter-set poll.
+///
+public sealed record RoleChangedMessage(
+ string ClusterId,
+ string NodeId,
+ string FromRole,
+ string ToRole,
+ DateTime ObservedAtUtc);
diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs
index a0fe448..b6f3ea3 100644
--- a/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs
@@ -49,6 +49,7 @@ builder.Services.AddScoped();
builder.Services.AddScoped();
builder.Services.AddScoped();
builder.Services.AddScoped();
+builder.Services.AddSingleton();
builder.Services.AddScoped();
builder.Services.AddScoped();
diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/RedundancyMetrics.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/RedundancyMetrics.cs
new file mode 100644
index 0000000..9def449
--- /dev/null
+++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/RedundancyMetrics.cs
@@ -0,0 +1,102 @@
+using System.Diagnostics.Metrics;
+
+namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
+
+///
+/// OpenTelemetry-compatible instrumentation for the redundancy surface. Uses in-box
+/// so no NuGet dependency is required to emit —
+/// any MeterListener (dotnet-counters, OpenTelemetry.Extensions.Hosting OTLP exporter,
+/// Prometheus exporter, etc.) picks up the instruments by the .
+///
+///
+/// Exporter configuration (OTLP, Prometheus, etc.) is intentionally NOT wired here —
+/// that's a deployment-ops decision that belongs in Program.cs behind an
+/// appsettings toggle. This class owns only the Meter + instruments so the
+/// production data stream exists regardless of exporter availability.
+///
+/// Counter + gauge names follow the otel-semantic-conventions pattern:
+/// otopcua.redundancy.* with tags for ClusterId + (for transitions) FromRole/ToRole/NodeId.
+///
+public sealed class RedundancyMetrics : IDisposable
+{
+ public const string MeterName = "ZB.MOM.WW.OtOpcUa.Redundancy";
+
+ private readonly Meter _meter;
+ private readonly Counter _roleTransitions;
+ private readonly object _gaugeLock = new();
+ private readonly Dictionary _gaugeState = new();
+
+ public RedundancyMetrics()
+ {
+ _meter = new Meter(MeterName, version: "1.0.0");
+ _roleTransitions = _meter.CreateCounter(
+ "otopcua.redundancy.role_transition",
+ unit: "{transition}",
+ description: "Observed RedundancyRole changes per node — tagged FromRole, ToRole, NodeId, ClusterId.");
+
+ // Observable gauges — the callback reports whatever the last Observe*Count call stashed.
+ _meter.CreateObservableGauge(
+ "otopcua.redundancy.primary_count",
+ ObservePrimaryCounts,
+ unit: "{node}",
+ description: "Count of Primary-role nodes per cluster (should be 1 for N+1 redundant clusters, 0 during failover).");
+ _meter.CreateObservableGauge(
+ "otopcua.redundancy.secondary_count",
+ ObserveSecondaryCounts,
+ unit: "{node}",
+ description: "Count of Secondary-role nodes per cluster.");
+ _meter.CreateObservableGauge(
+ "otopcua.redundancy.stale_count",
+ ObserveStaleCounts,
+ unit: "{node}",
+ description: "Count of cluster nodes whose LastSeenAt is older than StaleThreshold.");
+ }
+
+ ///
+ /// Update the per-cluster snapshot consumed by the ObservableGauges. Poller calls this
+ /// at the end of every tick so the collectors see fresh numbers on the next observation
+ /// window (by default 1s for dotnet-counters, configurable per exporter).
+ ///
+ public void SetClusterCounts(string clusterId, int primary, int secondary, int stale)
+ {
+ lock (_gaugeLock)
+ {
+ _gaugeState[clusterId] = new ClusterGaugeState(primary, secondary, stale);
+ }
+ }
+
+ ///
+ /// Increment the role_transition counter when a node's RedundancyRole changes. Tags
+ /// allow breakdowns by from/to roles (e.g. Primary → Secondary for planned failover vs
+ /// Primary → Standalone for emergency recovery) + by cluster for multi-site fleets.
+ ///
+ public void RecordRoleTransition(string clusterId, string nodeId, string fromRole, string toRole)
+ {
+ _roleTransitions.Add(1,
+ new KeyValuePair("cluster.id", clusterId),
+ new KeyValuePair("node.id", nodeId),
+ new KeyValuePair("from_role", fromRole),
+ new KeyValuePair("to_role", toRole));
+ }
+
+ public void Dispose() => _meter.Dispose();
+
+ private IEnumerable> ObservePrimaryCounts() => SnapshotGauge(s => s.Primary);
+ private IEnumerable> ObserveSecondaryCounts() => SnapshotGauge(s => s.Secondary);
+ private IEnumerable> ObserveStaleCounts() => SnapshotGauge(s => s.Stale);
+
+ private IEnumerable> SnapshotGauge(Func selector)
+ {
+ List> results;
+ lock (_gaugeLock)
+ {
+ results = new List>(_gaugeState.Count);
+ foreach (var (cluster, state) in _gaugeState)
+ results.Add(new Measurement(selector(state),
+ new KeyValuePair("cluster.id", cluster)));
+ }
+ return results;
+ }
+
+ private readonly record struct ClusterGaugeState(int Primary, int Secondary, int Stale);
+}
diff --git a/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs
index 906388d..df07586 100644
--- a/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs
+++ b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs
@@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Admin.Hubs;
+using ZB.MOM.WW.OtOpcUa.Admin.Services;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
@@ -97,7 +98,7 @@ END";
var poller = new FleetStatusPoller(
_sp.GetRequiredService(),
- fleetHub, alertHub, NullLogger.Instance);
+ fleetHub, alertHub, NullLogger.Instance, new RedundancyMetrics());
await poller.PollOnceAsync(CancellationToken.None);
@@ -142,7 +143,7 @@ END";
var poller = new FleetStatusPoller(
_sp.GetRequiredService(),
- fleetHub, alertHub, NullLogger.Instance);
+ fleetHub, alertHub, NullLogger.Instance, new RedundancyMetrics());
await poller.PollOnceAsync(CancellationToken.None);
diff --git a/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/RedundancyMetricsTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/RedundancyMetricsTests.cs
new file mode 100644
index 0000000..4faee99
--- /dev/null
+++ b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/RedundancyMetricsTests.cs
@@ -0,0 +1,70 @@
+using System.Diagnostics.Metrics;
+using Shouldly;
+using Xunit;
+using ZB.MOM.WW.OtOpcUa.Admin.Services;
+
+namespace ZB.MOM.WW.OtOpcUa.Admin.Tests;
+
+[Trait("Category", "Unit")]
+public sealed class RedundancyMetricsTests
+{
+ [Fact]
+ public void RecordRoleTransition_Increments_Counter_WithExpectedTags()
+ {
+ using var metrics = new RedundancyMetrics();
+ using var listener = new MeterListener();
+ var observed = new List<(long Value, Dictionary Tags)>();
+ listener.InstrumentPublished = (instrument, l) =>
+ {
+ if (instrument.Meter.Name == RedundancyMetrics.MeterName &&
+ instrument.Name == "otopcua.redundancy.role_transition")
+ {
+ l.EnableMeasurementEvents(instrument);
+ }
+ };
+ listener.SetMeasurementEventCallback((_, value, tags, _) =>
+ {
+ var dict = new Dictionary();
+ foreach (var tag in tags) dict[tag.Key] = tag.Value;
+ observed.Add((value, dict));
+ });
+ listener.Start();
+
+ metrics.RecordRoleTransition("c1", "node-a", "Primary", "Secondary");
+
+ observed.Count.ShouldBe(1);
+ observed[0].Value.ShouldBe(1);
+ observed[0].Tags["cluster.id"].ShouldBe("c1");
+ observed[0].Tags["node.id"].ShouldBe("node-a");
+ observed[0].Tags["from_role"].ShouldBe("Primary");
+ observed[0].Tags["to_role"].ShouldBe("Secondary");
+ }
+
+ [Fact]
+ public void SetClusterCounts_Observed_Via_ObservableGauges()
+ {
+ using var metrics = new RedundancyMetrics();
+ metrics.SetClusterCounts("c1", primary: 1, secondary: 2, stale: 0);
+ metrics.SetClusterCounts("c2", primary: 0, secondary: 1, stale: 1);
+
+ var observations = new List<(string Name, long Value, string Cluster)>();
+ using var listener = new MeterListener();
+ listener.InstrumentPublished = (instrument, l) =>
+ {
+ if (instrument.Meter.Name == RedundancyMetrics.MeterName)
+ l.EnableMeasurementEvents(instrument);
+ };
+ listener.SetMeasurementEventCallback((instrument, value, tags, _) =>
+ {
+ string? cluster = null;
+ foreach (var t in tags) if (t.Key == "cluster.id") cluster = t.Value as string;
+ observations.Add((instrument.Name, value, cluster ?? "?"));
+ });
+ listener.Start();
+ listener.RecordObservableInstruments();
+
+ observations.ShouldContain(o => o.Name == "otopcua.redundancy.primary_count" && o.Cluster == "c1" && o.Value == 1);
+ observations.ShouldContain(o => o.Name == "otopcua.redundancy.secondary_count" && o.Cluster == "c1" && o.Value == 2);
+ observations.ShouldContain(o => o.Name == "otopcua.redundancy.stale_count" && o.Cluster == "c2" && o.Value == 1);
+ }
+}