using System.Diagnostics.Metrics;
namespace ZB.MOM.WW.OtOpcUa.Admin.Services;
/// <summary>
/// OpenTelemetry-compatible instrumentation for the redundancy surface. Uses the in-box
/// <see cref="System.Diagnostics.Metrics"/> API so no NuGet dependency is required to emit —
/// any MeterListener (dotnet-counters, OpenTelemetry.Extensions.Hosting OTLP exporter,
/// Prometheus exporter, etc.) picks up the instruments by the <see cref="MeterName"/>.
/// </summary>
/// <remarks>
/// Exporter configuration (OTLP, Prometheus, etc.) is intentionally NOT wired here —
/// that's a deployment-ops decision that belongs in Program.cs behind an
/// appsettings toggle. This class owns only the Meter + instruments so the
/// production data stream exists regardless of exporter availability.
///
/// Counter + gauge names follow the otel-semantic-conventions pattern:
/// otopcua.redundancy.* with tags for ClusterId + (for transitions) FromRole/ToRole/NodeId.
/// </remarks>
public sealed class RedundancyMetrics : IDisposable
{
    /// <summary>Name of the <see cref="Meter"/> that listeners/exporters subscribe to.</summary>
    public const string MeterName = "ZB.MOM.WW.OtOpcUa.Redundancy";

    private readonly Meter _meter;
    private readonly Counter<long> _roleTransitions;

    // Guards _gaugeState: SetClusterCounts writes it while the gauge callbacks read it.
    private readonly object _gaugeLock = new();
    private readonly Dictionary<string, ClusterGaugeState> _gaugeState = new();

    /// <summary>
    /// Creates the meter, the role-transition counter and the three per-cluster observable gauges.
    /// </summary>
    public RedundancyMetrics()
    {
        _meter = new Meter(MeterName, version: "1.0.0");

        _roleTransitions = _meter.CreateCounter<long>(
            "otopcua.redundancy.role_transition",
            unit: "{transition}",
            description: "Observed RedundancyRole changes per node — tagged FromRole, ToRole, NodeId, ClusterId.");

        // Observable gauges — each callback reports whatever the last SetClusterCounts call stashed.
        _meter.CreateObservableGauge<long>(
            "otopcua.redundancy.primary_count",
            ObservePrimaryCounts,
            unit: "{node}",
            description: "Count of Primary-role nodes per cluster (should be 1 for N+1 redundant clusters, 0 during failover).");

        _meter.CreateObservableGauge<long>(
            "otopcua.redundancy.secondary_count",
            ObserveSecondaryCounts,
            unit: "{node}",
            description: "Count of Secondary-role nodes per cluster.");

        _meter.CreateObservableGauge<long>(
            "otopcua.redundancy.stale_count",
            ObserveStaleCounts,
            unit: "{node}",
            description: "Count of cluster nodes whose LastSeenAt is older than StaleThreshold.");
    }

    /// <summary>
    /// Update the per-cluster snapshot consumed by the ObservableGauges. Poller calls this
    /// at the end of every tick so the collectors see fresh numbers on the next observation
    /// window (by default 1s for dotnet-counters, configurable per exporter).
    /// </summary>
    /// <param name="clusterId">Cluster the counts belong to; reported as the <c>cluster.id</c> tag.</param>
    /// <param name="primary">Value reported by the <c>primary_count</c> gauge.</param>
    /// <param name="secondary">Value reported by the <c>secondary_count</c> gauge.</param>
    /// <param name="stale">Value reported by the <c>stale_count</c> gauge.</param>
    /// <exception cref="ArgumentNullException"><paramref name="clusterId"/> is null.</exception>
    public void SetClusterCounts(string clusterId, int primary, int secondary, int stale)
    {
        // The dictionary would reject a null key anyway; guarding here names the right parameter.
        ArgumentNullException.ThrowIfNull(clusterId);

        lock (_gaugeLock)
        {
            // Replaces any previous snapshot stored for this cluster.
            _gaugeState[clusterId] = new ClusterGaugeState(primary, secondary, stale);
        }
    }

    /// <summary>
    /// Increment the role_transition counter when a node's RedundancyRole changes. Tags
    /// allow breakdowns by from/to roles (e.g. Primary → Secondary for planned failover vs
    /// Primary → Standalone for emergency recovery) + by cluster for multi-site fleets.
    /// </summary>
    /// <param name="clusterId">Reported as the <c>cluster.id</c> tag.</param>
    /// <param name="nodeId">Reported as the <c>node.id</c> tag.</param>
    /// <param name="fromRole">Reported as the <c>from_role</c> tag.</param>
    /// <param name="toRole">Reported as the <c>to_role</c> tag.</param>
    public void RecordRoleTransition(string clusterId, string nodeId, string fromRole, string toRole)
    {
        _roleTransitions.Add(1,
            new KeyValuePair<string, object?>("cluster.id", clusterId),
            new KeyValuePair<string, object?>("node.id", nodeId),
            new KeyValuePair<string, object?>("from_role", fromRole),
            new KeyValuePair<string, object?>("to_role", toRole));
    }

    /// <summary>Disposes the underlying <see cref="Meter"/>.</summary>
    public void Dispose() => _meter.Dispose();

    private IEnumerable<Measurement<long>> ObservePrimaryCounts() => SnapshotGauge(s => s.Primary);

    private IEnumerable<Measurement<long>> ObserveSecondaryCounts() => SnapshotGauge(s => s.Secondary);

    private IEnumerable<Measurement<long>> ObserveStaleCounts() => SnapshotGauge(s => s.Stale);

    /// <summary>
    /// Builds one measurement per known cluster, tagged with <c>cluster.id</c>, using
    /// <paramref name="selector"/> to pick which count to report.
    /// </summary>
    private IEnumerable<Measurement<long>> SnapshotGauge(Func<ClusterGaugeState, long> selector)
    {
        List<Measurement<long>> results;
        lock (_gaugeLock)
        {
            // Materialize into a list while holding the lock so the caller enumerates a
            // stable copy rather than the live dictionary.
            results = new List<Measurement<long>>(_gaugeState.Count);
            foreach (var (cluster, state) in _gaugeState)
            {
                results.Add(new Measurement<long>(
                    selector(state),
                    new KeyValuePair<string, object?>("cluster.id", cluster)));
            }
        }

        return results;
    }

    /// <summary>Last counts stored for a single cluster.</summary>
    private readonly record struct ClusterGaugeState(int Primary, int Secondary, int Stale);
}