using System.Diagnostics.Metrics;

namespace ZB.MOM.WW.OtOpcUa.Admin.Services;

/// <summary>
/// OpenTelemetry-compatible instrumentation for the redundancy surface. Uses the in-box
/// <see cref="System.Diagnostics.Metrics"/> API so no NuGet dependency is required to emit —
/// any MeterListener (dotnet-counters, OpenTelemetry.Extensions.Hosting OTLP exporter,
/// Prometheus exporter, etc.) picks up the instruments by the <see cref="MeterName"/>.
/// </summary>
/// <remarks>
/// <para>
/// Exporter configuration (OTLP, Prometheus, etc.) is intentionally NOT wired here —
/// that's a deployment-ops decision that belongs in <c>Program.cs</c> behind an
/// <c>appsettings</c> toggle. This class owns only the Meter + instruments so the
/// production data stream exists regardless of exporter availability.
/// </para>
/// <para>
/// Counter + gauge names follow the otel-semantic-conventions pattern:
/// <c>otopcua.redundancy.*</c>. Every measurement carries a <c>cluster.id</c> tag; role
/// transitions additionally carry <c>node.id</c>, <c>from_role</c> and <c>to_role</c>.
/// </para>
/// <para>
/// Per-cluster gauge state is only ever added or overwritten by this class, never removed:
/// a cluster keeps reporting its last recorded counts until the instance is disposed.
/// </para>
/// </remarks>
public sealed class RedundancyMetrics : IDisposable
{
    /// <summary>Name of the <see cref="Meter"/> that listeners/exporters subscribe to.</summary>
    public const string MeterName = "ZB.MOM.WW.OtOpcUa.Redundancy";

    // NOTE(review): all instruments are typed as long measurements — confirm this matches
    // any MeterListener callbacks registered elsewhere in the solution.
    private readonly Meter _meter;
    private readonly Counter<long> _roleTransitions;

    // Guards _gaugeState: it is written by SetClusterCounts and read by the gauge
    // callbacks, which run on whatever thread the listener uses to collect.
    private readonly object _gaugeLock = new();
    private readonly Dictionary<string, ClusterGaugeState> _gaugeState = new();

    /// <summary>
    /// Creates the meter, the role-transition counter and the three per-cluster observable gauges.
    /// </summary>
    public RedundancyMetrics()
    {
        _meter = new Meter(MeterName, version: "1.0.0");

        _roleTransitions = _meter.CreateCounter<long>(
            "otopcua.redundancy.role_transition",
            unit: "{transition}",
            description: "Observed RedundancyRole changes per node — tagged FromRole, ToRole, NodeId, ClusterId.");

        // Observable gauges — each callback reports whatever the last SetClusterCounts call
        // stashed for every known cluster. The returned instrument handles are not kept:
        // the meter owns the instruments and releases them in Dispose.
        _meter.CreateObservableGauge<long>(
            "otopcua.redundancy.primary_count",
            ObservePrimaryCounts,
            unit: "{node}",
            description: "Count of Primary-role nodes per cluster (should be 1 for N+1 redundant clusters, 0 during failover).");

        _meter.CreateObservableGauge<long>(
            "otopcua.redundancy.secondary_count",
            ObserveSecondaryCounts,
            unit: "{node}",
            description: "Count of Secondary-role nodes per cluster.");

        _meter.CreateObservableGauge<long>(
            "otopcua.redundancy.stale_count",
            ObserveStaleCounts,
            unit: "{node}",
            description: "Count of cluster nodes whose LastSeenAt is older than StaleThreshold.");
    }

    /// <summary>
    /// Update the per-cluster snapshot consumed by the observable gauges. The poller calls this
    /// at the end of every tick so the collectors see fresh numbers on the next observation
    /// window (by default 1s for dotnet-counters, configurable per exporter).
    /// </summary>
    /// <param name="clusterId">Cluster the counts belong to; emitted as the <c>cluster.id</c> tag.</param>
    /// <param name="primary">Value reported by <c>otopcua.redundancy.primary_count</c>.</param>
    /// <param name="secondary">Value reported by <c>otopcua.redundancy.secondary_count</c>.</param>
    /// <param name="stale">Value reported by <c>otopcua.redundancy.stale_count</c>.</param>
    public void SetClusterCounts(string clusterId, int primary, int secondary, int stale)
    {
        lock (_gaugeLock)
        {
            // Overwrites any previous snapshot for the same cluster.
            _gaugeState[clusterId] = new ClusterGaugeState(primary, secondary, stale);
        }
    }

    /// <summary>
    /// Increment the role_transition counter when a node's RedundancyRole changes. Tags
    /// allow breakdowns by from/to roles (e.g. Primary → Secondary for planned failover vs
    /// Primary → Standalone for emergency recovery) + by cluster for multi-site fleets.
    /// </summary>
    /// <param name="clusterId">Emitted as the <c>cluster.id</c> tag.</param>
    /// <param name="nodeId">Emitted as the <c>node.id</c> tag.</param>
    /// <param name="fromRole">Emitted as the <c>from_role</c> tag.</param>
    /// <param name="toRole">Emitted as the <c>to_role</c> tag.</param>
    public void RecordRoleTransition(string clusterId, string nodeId, string fromRole, string toRole)
    {
        _roleTransitions.Add(
            1,
            new KeyValuePair<string, object?>("cluster.id", clusterId),
            new KeyValuePair<string, object?>("node.id", nodeId),
            new KeyValuePair<string, object?>("from_role", fromRole),
            new KeyValuePair<string, object?>("to_role", toRole));
    }

    /// <summary>Disposes the underlying <see cref="Meter"/> and, with it, every instrument created here.</summary>
    public void Dispose() => _meter.Dispose();

    private IEnumerable<Measurement<long>> ObservePrimaryCounts() => SnapshotGauge(s => s.Primary);

    private IEnumerable<Measurement<long>> ObserveSecondaryCounts() => SnapshotGauge(s => s.Secondary);

    private IEnumerable<Measurement<long>> ObserveStaleCounts() => SnapshotGauge(s => s.Stale);

    /// <summary>
    /// Builds one measurement per known cluster, tagged with <c>cluster.id</c>, using
    /// <paramref name="selector"/> to pick which count to report.
    /// </summary>
    /// <param name="selector">Projects a cluster's stored state onto the value to emit.</param>
    /// <returns>A materialized list, so enumeration by the caller happens outside the lock.</returns>
    private IEnumerable<Measurement<long>> SnapshotGauge(Func<ClusterGaugeState, int> selector)
    {
        List<Measurement<long>> results;

        lock (_gaugeLock)
        {
            results = new List<Measurement<long>>(_gaugeState.Count);
            foreach (var (cluster, state) in _gaugeState)
            {
                results.Add(new Measurement<long>(
                    selector(state),
                    new KeyValuePair<string, object?>("cluster.id", cluster)));
            }
        }

        return results;
    }

    /// <summary>Last counts recorded for one cluster via <see cref="SetClusterCounts"/>.</summary>
    private readonly record struct ClusterGaugeState(int Primary, int Secondary, int Stale);
}