diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor index 068f059..c562415 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Components/Pages/Clusters/RedundancyTab.razor @@ -1,9 +1,17 @@ +@using Microsoft.AspNetCore.SignalR.Client +@using ZB.MOM.WW.OtOpcUa.Admin.Hubs @using ZB.MOM.WW.OtOpcUa.Admin.Services @using ZB.MOM.WW.OtOpcUa.Configuration.Entities @using ZB.MOM.WW.OtOpcUa.Configuration.Enums @inject ClusterNodeService NodeSvc +@inject NavigationManager Nav +@implements IAsyncDisposable

Redundancy topology

+@if (_roleChangedBanner is not null) +{ +
@_roleChangedBanner
+}

One row per ClusterNode in this cluster. Role, ApplicationUri, and ServiceLevelBase are authored separately; the Admin UI shows them read-only @@ -107,10 +115,41 @@ else [Parameter] public string ClusterId { get; set; } = string.Empty; private List? _nodes; + private HubConnection? _hub; + private string? _roleChangedBanner; protected override async Task OnParametersSetAsync() { _nodes = await NodeSvc.ListByClusterAsync(ClusterId, CancellationToken.None); + if (_hub is null) await ConnectHubAsync(); + } + + private async Task ConnectHubAsync() + { + _hub = new HubConnectionBuilder() + .WithUrl(Nav.ToAbsoluteUri("/hubs/fleet-status")) + .WithAutomaticReconnect() + .Build(); + + _hub.On("RoleChanged", async msg => + { + if (msg.ClusterId != ClusterId) return; + _roleChangedBanner = $"Role changed on {msg.NodeId}: {msg.FromRole} → {msg.ToRole} at {msg.ObservedAtUtc:HH:mm:ss 'UTC'}"; + _nodes = await NodeSvc.ListByClusterAsync(ClusterId, CancellationToken.None); + await InvokeAsync(StateHasChanged); + }); + + await _hub.StartAsync(); + await _hub.SendAsync("SubscribeCluster", ClusterId); + } + + public async ValueTask DisposeAsync() + { + if (_hub is not null) + { + await _hub.DisposeAsync(); + _hub = null; + } } private static string RowClass(ClusterNode n) => diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs index bead926..cd617d1 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Hubs/FleetStatusPoller.cs @@ -1,7 +1,9 @@ using Microsoft.AspNetCore.SignalR; using Microsoft.EntityFrameworkCore; +using ZB.MOM.WW.OtOpcUa.Admin.Services; using ZB.MOM.WW.OtOpcUa.Configuration; using ZB.MOM.WW.OtOpcUa.Configuration.Entities; +using ZB.MOM.WW.OtOpcUa.Configuration.Enums; namespace ZB.MOM.WW.OtOpcUa.Admin.Hubs; @@ -14,11 +16,13 @@ public sealed class FleetStatusPoller( IServiceScopeFactory scopeFactory, IHubContext fleetHub, IHubContext alertHub, - ILogger logger) : BackgroundService + ILogger logger, + RedundancyMetrics redundancyMetrics) : BackgroundService { public TimeSpan PollInterval { get; init; } = TimeSpan.FromSeconds(5); private readonly Dictionary _last = new(); + private readonly Dictionary _lastRole = new(StringComparer.Ordinal); protected override async Task ExecuteAsync(CancellationToken stoppingToken) { @@ -42,6 +46,10 @@ public sealed class FleetStatusPoller( using var scope = scopeFactory.CreateScope(); var db = scope.ServiceProvider.GetRequiredService(); + var nodes = await db.ClusterNodes.AsNoTracking().ToListAsync(ct); + await PollRolesAsync(nodes, ct); + UpdateClusterGauges(nodes); + var rows = await db.ClusterNodeGenerationStates.AsNoTracking() .Join(db.ClusterNodes.AsNoTracking(), s => s.NodeId, n => n.NodeId, (s, n) => new { s, n.ClusterId }) .ToListAsync(ct); @@ -85,9 +93,63 @@ public sealed class FleetStatusPoller( } ///

Exposed for tests — forces a snapshot reset so stub data re-seeds. - internal void ResetCache() => _last.Clear(); + internal void ResetCache() + { + _last.Clear(); + _lastRole.Clear(); + } + + private async Task PollRolesAsync(IReadOnlyList nodes, CancellationToken ct) + { + foreach (var n in nodes) + { + var hadPrior = _lastRole.TryGetValue(n.NodeId, out var priorRole); + if (hadPrior && priorRole == n.RedundancyRole) continue; + + _lastRole[n.NodeId] = n.RedundancyRole; + if (!hadPrior) continue; // first-observation bootstrap — not a transition + + redundancyMetrics.RecordRoleTransition( + clusterId: n.ClusterId, nodeId: n.NodeId, + fromRole: priorRole.ToString(), toRole: n.RedundancyRole.ToString()); + + var msg = new RoleChangedMessage( + ClusterId: n.ClusterId, NodeId: n.NodeId, + FromRole: priorRole.ToString(), ToRole: n.RedundancyRole.ToString(), + ObservedAtUtc: DateTime.UtcNow); + + await fleetHub.Clients.Group(FleetStatusHub.GroupName(n.ClusterId)) + .SendAsync("RoleChanged", msg, ct); + await fleetHub.Clients.Group(FleetStatusHub.FleetGroup) + .SendAsync("RoleChanged", msg, ct); + } + } + + private void UpdateClusterGauges(IReadOnlyList nodes) + { + var staleCutoff = DateTime.UtcNow - Services.ClusterNodeService.StaleThreshold; + foreach (var group in nodes.GroupBy(n => n.ClusterId)) + { + var primary = group.Count(n => n.RedundancyRole == RedundancyRole.Primary); + var secondary = group.Count(n => n.RedundancyRole == RedundancyRole.Secondary); + var stale = group.Count(n => n.LastSeenAt is null || n.LastSeenAt.Value < staleCutoff); + redundancyMetrics.SetClusterCounts(group.Key, primary, secondary, stale); + } + } private readonly record struct NodeStateSnapshot( string NodeId, string ClusterId, long? GenerationId, string? Status, string? Error, DateTime? AppliedAt, DateTime? SeenAt); } + +/// +/// Pushed by when it observes a change in +/// . Consumed by the Admin RedundancyTab to trigger +/// an instant reload instead of waiting for the next on-parameter-set poll. +/// +public sealed record RoleChangedMessage( + string ClusterId, + string NodeId, + string FromRole, + string ToRole, + DateTime ObservedAtUtc); diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs index a0fe448..b6f3ea3 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Program.cs @@ -49,6 +49,7 @@ builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddScoped(); +builder.Services.AddSingleton(); builder.Services.AddScoped(); builder.Services.AddScoped(); diff --git a/src/ZB.MOM.WW.OtOpcUa.Admin/Services/RedundancyMetrics.cs b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/RedundancyMetrics.cs new file mode 100644 index 0000000..9def449 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Admin/Services/RedundancyMetrics.cs @@ -0,0 +1,102 @@ +using System.Diagnostics.Metrics; + +namespace ZB.MOM.WW.OtOpcUa.Admin.Services; + +/// +/// OpenTelemetry-compatible instrumentation for the redundancy surface. Uses in-box +/// so no NuGet dependency is required to emit — +/// any MeterListener (dotnet-counters, OpenTelemetry.Extensions.Hosting OTLP exporter, +/// Prometheus exporter, etc.) picks up the instruments by the . +/// +/// +/// Exporter configuration (OTLP, Prometheus, etc.) is intentionally NOT wired here — +/// that's a deployment-ops decision that belongs in Program.cs behind an +/// appsettings toggle. This class owns only the Meter + instruments so the +/// production data stream exists regardless of exporter availability. +/// +/// Counter + gauge names follow the otel-semantic-conventions pattern: +/// otopcua.redundancy.* with tags for ClusterId + (for transitions) FromRole/ToRole/NodeId. +/// +public sealed class RedundancyMetrics : IDisposable +{ + public const string MeterName = "ZB.MOM.WW.OtOpcUa.Redundancy"; + + private readonly Meter _meter; + private readonly Counter _roleTransitions; + private readonly object _gaugeLock = new(); + private readonly Dictionary _gaugeState = new(); + + public RedundancyMetrics() + { + _meter = new Meter(MeterName, version: "1.0.0"); + _roleTransitions = _meter.CreateCounter( + "otopcua.redundancy.role_transition", + unit: "{transition}", + description: "Observed RedundancyRole changes per node — tagged FromRole, ToRole, NodeId, ClusterId."); + + // Observable gauges — the callback reports whatever the last Observe*Count call stashed. + _meter.CreateObservableGauge( + "otopcua.redundancy.primary_count", + ObservePrimaryCounts, + unit: "{node}", + description: "Count of Primary-role nodes per cluster (should be 1 for N+1 redundant clusters, 0 during failover)."); + _meter.CreateObservableGauge( + "otopcua.redundancy.secondary_count", + ObserveSecondaryCounts, + unit: "{node}", + description: "Count of Secondary-role nodes per cluster."); + _meter.CreateObservableGauge( + "otopcua.redundancy.stale_count", + ObserveStaleCounts, + unit: "{node}", + description: "Count of cluster nodes whose LastSeenAt is older than StaleThreshold."); + } + + /// + /// Update the per-cluster snapshot consumed by the ObservableGauges. Poller calls this + /// at the end of every tick so the collectors see fresh numbers on the next observation + /// window (by default 1s for dotnet-counters, configurable per exporter). + /// + public void SetClusterCounts(string clusterId, int primary, int secondary, int stale) + { + lock (_gaugeLock) + { + _gaugeState[clusterId] = new ClusterGaugeState(primary, secondary, stale); + } + } + + /// + /// Increment the role_transition counter when a node's RedundancyRole changes. Tags + /// allow breakdowns by from/to roles (e.g. Primary → Secondary for planned failover vs + /// Primary → Standalone for emergency recovery) + by cluster for multi-site fleets. + /// + public void RecordRoleTransition(string clusterId, string nodeId, string fromRole, string toRole) + { + _roleTransitions.Add(1, + new KeyValuePair("cluster.id", clusterId), + new KeyValuePair("node.id", nodeId), + new KeyValuePair("from_role", fromRole), + new KeyValuePair("to_role", toRole)); + } + + public void Dispose() => _meter.Dispose(); + + private IEnumerable> ObservePrimaryCounts() => SnapshotGauge(s => s.Primary); + private IEnumerable> ObserveSecondaryCounts() => SnapshotGauge(s => s.Secondary); + private IEnumerable> ObserveStaleCounts() => SnapshotGauge(s => s.Stale); + + private IEnumerable> SnapshotGauge(Func selector) + { + List> results; + lock (_gaugeLock) + { + results = new List>(_gaugeState.Count); + foreach (var (cluster, state) in _gaugeState) + results.Add(new Measurement(selector(state), + new KeyValuePair("cluster.id", cluster))); + } + return results; + } + + private readonly record struct ClusterGaugeState(int Primary, int Secondary, int Stale); +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs index 906388d..df07586 100644 --- a/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs +++ b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs @@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging.Abstractions; using Shouldly; using Xunit; using ZB.MOM.WW.OtOpcUa.Admin.Hubs; +using ZB.MOM.WW.OtOpcUa.Admin.Services; using ZB.MOM.WW.OtOpcUa.Configuration; using ZB.MOM.WW.OtOpcUa.Configuration.Entities; using ZB.MOM.WW.OtOpcUa.Configuration.Enums; @@ -97,7 +98,7 @@ END"; var poller = new FleetStatusPoller( _sp.GetRequiredService(), - fleetHub, alertHub, NullLogger.Instance); + fleetHub, alertHub, NullLogger.Instance, new RedundancyMetrics()); await poller.PollOnceAsync(CancellationToken.None); @@ -142,7 +143,7 @@ END"; var poller = new FleetStatusPoller( _sp.GetRequiredService(), - fleetHub, alertHub, NullLogger.Instance); + fleetHub, alertHub, NullLogger.Instance, new RedundancyMetrics()); await poller.PollOnceAsync(CancellationToken.None); diff --git a/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/RedundancyMetricsTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/RedundancyMetricsTests.cs new file mode 100644 index 0000000..4faee99 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/RedundancyMetricsTests.cs @@ -0,0 +1,70 @@ +using System.Diagnostics.Metrics; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Admin.Services; + +namespace ZB.MOM.WW.OtOpcUa.Admin.Tests; + +[Trait("Category", "Unit")] +public sealed class RedundancyMetricsTests +{ + [Fact] + public void RecordRoleTransition_Increments_Counter_WithExpectedTags() + { + using var metrics = new RedundancyMetrics(); + using var listener = new MeterListener(); + var observed = new List<(long Value, Dictionary Tags)>(); + listener.InstrumentPublished = (instrument, l) => + { + if (instrument.Meter.Name == RedundancyMetrics.MeterName && + instrument.Name == "otopcua.redundancy.role_transition") + { + l.EnableMeasurementEvents(instrument); + } + }; + listener.SetMeasurementEventCallback((_, value, tags, _) => + { + var dict = new Dictionary(); + foreach (var tag in tags) dict[tag.Key] = tag.Value; + observed.Add((value, dict)); + }); + listener.Start(); + + metrics.RecordRoleTransition("c1", "node-a", "Primary", "Secondary"); + + observed.Count.ShouldBe(1); + observed[0].Value.ShouldBe(1); + observed[0].Tags["cluster.id"].ShouldBe("c1"); + observed[0].Tags["node.id"].ShouldBe("node-a"); + observed[0].Tags["from_role"].ShouldBe("Primary"); + observed[0].Tags["to_role"].ShouldBe("Secondary"); + } + + [Fact] + public void SetClusterCounts_Observed_Via_ObservableGauges() + { + using var metrics = new RedundancyMetrics(); + metrics.SetClusterCounts("c1", primary: 1, secondary: 2, stale: 0); + metrics.SetClusterCounts("c2", primary: 0, secondary: 1, stale: 1); + + var observations = new List<(string Name, long Value, string Cluster)>(); + using var listener = new MeterListener(); + listener.InstrumentPublished = (instrument, l) => + { + if (instrument.Meter.Name == RedundancyMetrics.MeterName) + l.EnableMeasurementEvents(instrument); + }; + listener.SetMeasurementEventCallback((instrument, value, tags, _) => + { + string? cluster = null; + foreach (var t in tags) if (t.Key == "cluster.id") cluster = t.Value as string; + observations.Add((instrument.Name, value, cluster ?? "?")); + }); + listener.Start(); + listener.RecordObservableInstruments(); + + observations.ShouldContain(o => o.Name == "otopcua.redundancy.primary_count" && o.Cluster == "c1" && o.Value == 1); + observations.ShouldContain(o => o.Name == "otopcua.redundancy.secondary_count" && o.Cluster == "c1" && o.Value == 2); + observations.ShouldContain(o => o.Name == "otopcua.redundancy.stale_count" && o.Cluster == "c2" && o.Value == 1); + } +}