OpenTelemetry redundancy metrics + RoleChanged SignalR push.

Closes the instrumentation + live-push slices of task #198. The exporter wiring (OTLP vs Prometheus package decision) is split out to new task #201, because the collector/scrape-endpoint choice is a fleet-ops decision that deserves its own PR rather than being hardcoded here.

New RedundancyMetrics class (Singleton-registered in DI) owning a System.Diagnostics.Metrics.Meter("ZB.MOM.WW.OtOpcUa.Redundancy", "1.0.0"):
- Three ObservableGauge instruments, otopcua.redundancy.primary_count / secondary_count / stale_count, all tagged by cluster.id and populated by SetClusterCounts(clusterId, primary, secondary, stale), which the poller calls at the tail of every tick. The ObservableGauge callbacks snapshot the last value set under a lock, so the reader (OTel collector, dotnet-counters) sees consistent tuples.
- One Counter, otopcua.redundancy.role_transition, tagged cluster.id, node.id, from_role, to_role; well suited to aggregate queries like "how often does Cluster-X fail over" and "which node transitions most".
- The in-box Metrics API means zero NuGet dependency here; the exporter PR adds OpenTelemetry.Extensions.Hosting plus OpenTelemetry.Exporter.OpenTelemetryProtocol or OpenTelemetry.Exporter.Prometheus.AspNetCore to actually ship the data somewhere.

FleetStatusPoller extended with role-change detection. PollOnceAsync now pulls ClusterNode rows alongside the existing ClusterNodeGenerationState scan, and a new PollRolesAsync walks every node comparing RedundancyRole to the _lastRole cache. On change it records the transition to RedundancyMetrics and emits a RoleChanged SignalR message to both FleetStatusHub.GroupName(cluster) and FleetStatusHub.FleetGroup, so cluster-scoped and fleet-wide subscribers both see it. The first observation per node is a bootstrap (cache fill), NOT a transition, which avoids spurious churn on service startup or pod restart.
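The bootstrap-vs-transition rule can be sketched as follows; "lastRole" and "TryDetectTransition" are illustrative stand-ins for the poller's _lastRole cache and PollRolesAsync internals, which are not part of this diff.

```csharp
using System;
using System.Collections.Generic;

// Illustrative sketch of the role-change detection described above. The real
// code records the transition to RedundancyMetrics and pushes a RoleChanged
// SignalR message whenever this returns true.
var lastRole = new Dictionary<string, string>(); // node id -> last observed role

bool TryDetectTransition(string nodeId, string currentRole, out string? fromRole)
{
    if (!lastRole.TryGetValue(nodeId, out fromRole))
    {
        // First observation is a bootstrap: fill the cache, emit nothing, so
        // service startup / pod restart causes no spurious transitions.
        lastRole[nodeId] = currentRole;
        return false;
    }
    if (fromRole == currentRole)
        return false; // steady state, nothing to record
    lastRole[nodeId] = currentRole;
    return true; // genuine RedundancyRole change
}

// First poll only fills the cache.
Console.WriteLine(TryDetectTransition("node-a", "Primary", out _));       // False
// A later poll seeing Primary -> Secondary is a real transition.
Console.WriteLine(TryDetectTransition("node-a", "Secondary", out var f)); // True
Console.WriteLine(f);                                                     // Primary
```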
UpdateClusterGauges groups nodes by cluster and sets the three gauge values, using ClusterNodeService.StaleThreshold (the shared 30s convention) for staleness, so the /hosts page and the gauge agree. The RoleChangedMessage record lives alongside NodeStateChangedMessage in FleetStatusPoller.cs.

RedundancyTab.razor subscribes to the fleet-status hub on first parameters-set, filters RoleChanged events to the current cluster, reloads the node list, and paints a blue info banner ("Role changed on node-a: Primary → Secondary at HH:mm:ss UTC") so operators see the transition without needing to poll-refresh the page. IAsyncDisposable closes the connection on tab swap-away.

Two new RedundancyMetricsTests cover RecordRoleTransition tag emission (cluster.id, node.id, from_role, and to_role all flow through the MeterListener callback) and the ObservableGauge snapshot for two clusters (assert primary_count=1 for c1, stale_count=1 for c2). The existing FleetStatusPollerTests ctor line is updated to pass a RedundancyMetrics instance; all tests still pass. Full Admin.Tests suite 87/87 passing (was 85, +2). Admin project builds with 0 errors.

Task #201 captures the exporter-wiring follow-up: OpenTelemetry.Extensions.Hosting, OTLP vs Prometheus, and the /metrics endpoint decision, driven by fleet-ops infra direction.
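A minimal sketch of the grouping UpdateClusterGauges performs; the simplified Node record and the inline 30s threshold are hypothetical stand-ins for the real ClusterNode rows and ClusterNodeService.StaleThreshold, which do not appear in this diff.

```csharp
using System;
using System.Linq;

// Stand-in for ClusterNodeService.StaleThreshold (the shared 30s convention).
var staleThreshold = TimeSpan.FromSeconds(30);
var now = DateTimeOffset.UtcNow;

var nodes = new[]
{
    new Node("c1", "Primary",   now),
    new Node("c1", "Secondary", now),
    new Node("c2", "Secondary", now - TimeSpan.FromMinutes(5)), // stale
};

foreach (var cluster in nodes.GroupBy(n => n.ClusterId))
{
    var primary   = cluster.Count(n => n.Role == "Primary");
    var secondary = cluster.Count(n => n.Role == "Secondary");
    var stale     = cluster.Count(n => now - n.LastSeenAt > staleThreshold);
    // The real code feeds these into metrics.SetClusterCounts(cluster.Key, ...).
    Console.WriteLine($"{cluster.Key}: primary={primary} secondary={secondary} stale={stale}");
}

// Hypothetical simplified node shape, only the fields the grouping needs.
record Node(string ClusterId, string Role, DateTimeOffset LastSeenAt);
```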

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Joseph Doherty
2026-04-19 23:16:09 -04:00
parent 251f567b98
commit 1f3343e61f
6 changed files with 279 additions and 4 deletions


@@ -0,0 +1,102 @@
using System.Diagnostics.Metrics;
namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
/// <summary>
/// OpenTelemetry-compatible instrumentation for the redundancy surface. Uses in-box
/// <see cref="System.Diagnostics.Metrics"/> so no NuGet dependency is required to emit —
/// any MeterListener (dotnet-counters, OpenTelemetry.Extensions.Hosting OTLP exporter,
/// Prometheus exporter, etc.) picks up the instruments by the <see cref="MeterName"/>.
/// </summary>
/// <remarks>
/// Exporter configuration (OTLP, Prometheus, etc.) is intentionally NOT wired here —
/// that's a deployment-ops decision that belongs in <c>Program.cs</c> behind an
/// <c>appsettings</c> toggle. This class owns only the Meter + instruments so the
/// production data stream exists regardless of exporter availability.
///
/// Counter + gauge names follow the otel-semantic-conventions pattern:
/// <c>otopcua.redundancy.*</c> with tags for ClusterId + (for transitions) FromRole/ToRole/NodeId.
/// </remarks>
public sealed class RedundancyMetrics : IDisposable
{
public const string MeterName = "ZB.MOM.WW.OtOpcUa.Redundancy";
private readonly Meter _meter;
private readonly Counter<long> _roleTransitions;
private readonly object _gaugeLock = new();
private readonly Dictionary<string, ClusterGaugeState> _gaugeState = new();
public RedundancyMetrics()
{
_meter = new Meter(MeterName, version: "1.0.0");
_roleTransitions = _meter.CreateCounter<long>(
"otopcua.redundancy.role_transition",
unit: "{transition}",
description: "Observed RedundancyRole changes per node — tagged FromRole, ToRole, NodeId, ClusterId.");
// Observable gauges — the callback reports whatever the last Observe*Count call stashed.
_meter.CreateObservableGauge(
"otopcua.redundancy.primary_count",
ObservePrimaryCounts,
unit: "{node}",
description: "Count of Primary-role nodes per cluster (should be 1 for N+1 redundant clusters, 0 during failover).");
_meter.CreateObservableGauge(
"otopcua.redundancy.secondary_count",
ObserveSecondaryCounts,
unit: "{node}",
description: "Count of Secondary-role nodes per cluster.");
_meter.CreateObservableGauge(
"otopcua.redundancy.stale_count",
ObserveStaleCounts,
unit: "{node}",
description: "Count of cluster nodes whose LastSeenAt is older than StaleThreshold.");
}
/// <summary>
/// Update the per-cluster snapshot consumed by the ObservableGauges. Poller calls this
/// at the end of every tick so the collectors see fresh numbers on the next observation
/// window (by default 1s for dotnet-counters, configurable per exporter).
/// </summary>
public void SetClusterCounts(string clusterId, int primary, int secondary, int stale)
{
lock (_gaugeLock)
{
_gaugeState[clusterId] = new ClusterGaugeState(primary, secondary, stale);
}
}
/// <summary>
/// Increment the role_transition counter when a node's RedundancyRole changes. Tags
/// allow breakdowns by from/to roles (e.g. Primary → Secondary for planned failover vs
/// Primary → Standalone for emergency recovery) + by cluster for multi-site fleets.
/// </summary>
public void RecordRoleTransition(string clusterId, string nodeId, string fromRole, string toRole)
{
_roleTransitions.Add(1,
new KeyValuePair<string, object?>("cluster.id", clusterId),
new KeyValuePair<string, object?>("node.id", nodeId),
new KeyValuePair<string, object?>("from_role", fromRole),
new KeyValuePair<string, object?>("to_role", toRole));
}
public void Dispose() => _meter.Dispose();
private IEnumerable<Measurement<long>> ObservePrimaryCounts() => SnapshotGauge(s => s.Primary);
private IEnumerable<Measurement<long>> ObserveSecondaryCounts() => SnapshotGauge(s => s.Secondary);
private IEnumerable<Measurement<long>> ObserveStaleCounts() => SnapshotGauge(s => s.Stale);
private IEnumerable<Measurement<long>> SnapshotGauge(Func<ClusterGaugeState, int> selector)
{
List<Measurement<long>> results;
lock (_gaugeLock)
{
results = new List<Measurement<long>>(_gaugeState.Count);
foreach (var (cluster, state) in _gaugeState)
results.Add(new Measurement<long>(selector(state),
new KeyValuePair<string, object?>("cluster.id", cluster)));
}
return results;
}
private readonly record struct ClusterGaugeState(int Primary, int Secondary, int Stale);
}
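For reference, the tag-emission check described in the commit message can be sketched with the in-box MeterListener. This assumes the RedundancyMetrics class above is in scope; it mirrors the shape of the RedundancyMetricsTests, whose actual code is not shown in this diff.

```csharp
using System;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using ZB.MOM.WW.OtOpcUa.Admin.Services;

using var metrics = new RedundancyMetrics();
using var listener = new MeterListener();

// Subscribe to every instrument the redundancy meter publishes, by name.
listener.InstrumentPublished = (instrument, l) =>
{
    if (instrument.Meter.Name == RedundancyMetrics.MeterName)
        l.EnableMeasurementEvents(instrument);
};

var seen = new List<(string Instrument, long Value, string? Cluster)>();
listener.SetMeasurementEventCallback<long>((instrument, value, tags, state) =>
{
    string? cluster = null;
    foreach (var tag in tags)
        if (tag.Key == "cluster.id")
            cluster = tag.Value?.ToString();
    seen.Add((instrument.Name, value, cluster));
});
listener.Start();

// Counter measurements are delivered synchronously on Add(...).
metrics.RecordRoleTransition("c1", "node-a", "Primary", "Secondary");

// ObservableGauges only report when the listener asks for a snapshot.
metrics.SetClusterCounts("c1", primary: 1, secondary: 1, stale: 0);
listener.RecordObservableInstruments();

foreach (var m in seen)
    Console.WriteLine($"{m.Instrument} = {m.Value} (cluster.id={m.Cluster})");
```

Note that RecordObservableInstruments is what pulls the gauge callbacks; without it only the counter measurement would appear, which is why the gauge test has to drive the listener explicitly rather than just setting counts.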