using System.Diagnostics.Metrics;

namespace ZB.MOM.WW.OtOpcUa.Admin.Services;

/// <summary>
/// Instrumentation for the redundancy surface, built on the in-box
/// <see cref="System.Diagnostics.Metrics"/> API. Every instrument is published on a
/// single <see cref="Meter"/> named <see cref="MeterName"/>, so any listener that
/// subscribes to that meter name receives the measurements.
/// </summary>
/// <remarks>
/// This type configures no exporter; it only creates the meter and its instruments.
///
/// Instruments (all under the <c>otopcua.redundancy.*</c> prefix):
/// <list type="bullet">
/// <item><c>role_transition</c> — counter, tagged <c>cluster.id</c>, <c>node.id</c>,
/// <c>from_role</c>, <c>to_role</c>.</item>
/// <item><c>primary_count</c>, <c>secondary_count</c>, <c>stale_count</c> — observable
/// gauges, one measurement per known cluster, tagged <c>cluster.id</c>.</item>
/// </list>
/// Access to the per-cluster gauge state is serialized with a private lock.
/// </remarks>
public sealed class RedundancyMetrics : IDisposable
{
    /// <summary>Name of the <see cref="Meter"/> that owns every instrument created here.</summary>
    public const string MeterName = "ZB.MOM.WW.OtOpcUa.Redundancy";

    // Unit shared by the three per-cluster node-count gauges.
    private const string NodeUnit = "{node}";

    private readonly Meter _meter;
    private readonly Counter<long> _transitionCounter;
    private readonly object _sync = new();
    private readonly Dictionary<string, ClusterGaugeState> _countsByCluster = new();

    /// <summary>
    /// Creates the meter, the role-transition counter and the three observable gauges.
    /// </summary>
    public RedundancyMetrics()
    {
        _meter = new Meter(MeterName, version: "1.0.0");

        _transitionCounter = _meter.CreateCounter<long>(
            "otopcua.redundancy.role_transition",
            unit: "{transition}",
            description: "Observed RedundancyRole changes per node — tagged FromRole, ToRole, NodeId, ClusterId.");

        // Each gauge reports whatever the most recent SetClusterCounts call stored.
        RegisterNodeCountGauge(
            "otopcua.redundancy.primary_count",
            counts => counts.Primary,
            "Count of Primary-role nodes per cluster (should be 1 for N+1 redundant clusters, 0 during failover).");
        RegisterNodeCountGauge(
            "otopcua.redundancy.secondary_count",
            counts => counts.Secondary,
            "Count of Secondary-role nodes per cluster.");
        RegisterNodeCountGauge(
            "otopcua.redundancy.stale_count",
            counts => counts.Stale,
            "Count of cluster nodes whose LastSeenAt is older than StaleThreshold.");
    }

    /// <summary>
    /// Stores the latest node counts for one cluster, replacing any counts previously
    /// stored under the same <paramref name="clusterId"/>. The observable gauges report
    /// these values the next time a listener collects them.
    /// </summary>
    /// <param name="clusterId">Cluster the counts belong to; emitted as the <c>cluster.id</c> tag.</param>
    /// <param name="primary">Value reported by the <c>primary_count</c> gauge.</param>
    /// <param name="secondary">Value reported by the <c>secondary_count</c> gauge.</param>
    /// <param name="stale">Value reported by the <c>stale_count</c> gauge.</param>
    public void SetClusterCounts(string clusterId, int primary, int secondary, int stale)
    {
        var latest = new ClusterGaugeState(primary, secondary, stale);

        lock (_sync)
        {
            _countsByCluster[clusterId] = latest;
        }
    }

    /// <summary>
    /// Adds one to the <c>role_transition</c> counter, tagged so the change can be broken
    /// down by cluster, node, and the role pair involved.
    /// </summary>
    /// <param name="clusterId">Emitted as the <c>cluster.id</c> tag.</param>
    /// <param name="nodeId">Emitted as the <c>node.id</c> tag.</param>
    /// <param name="fromRole">Emitted as the <c>from_role</c> tag.</param>
    /// <param name="toRole">Emitted as the <c>to_role</c> tag.</param>
    public void RecordRoleTransition(string clusterId, string nodeId, string fromRole, string toRole)
    {
        KeyValuePair<string, object?>[] tags =
        {
            new KeyValuePair<string, object?>("cluster.id", clusterId),
            new KeyValuePair<string, object?>("node.id", nodeId),
            new KeyValuePair<string, object?>("from_role", fromRole),
            new KeyValuePair<string, object?>("to_role", toRole),
        };

        _transitionCounter.Add(1, tags);
    }

    /// <summary>Disposes the underlying <see cref="Meter"/>.</summary>
    public void Dispose()
    {
        _meter.Dispose();
    }

    // Creates one observable gauge (unit "{node}") whose callback reports, for every
    // known cluster, the component of the stored counts chosen by `pick`.
    private void RegisterNodeCountGauge(string name, Func<ClusterGaugeState, int> pick, string description)
    {
        _meter.CreateObservableGauge<long>(
            name,
            () => Snapshot(pick),
            unit: NodeUnit,
            description: description);
    }

    // Copies the current per-cluster values into a fresh list while holding the lock, so
    // the caller enumerates a stable snapshot rather than the live dictionary.
    private IEnumerable<Measurement<long>> Snapshot(Func<ClusterGaugeState, int> pick)
    {
        lock (_sync)
        {
            var measurements = new List<Measurement<long>>(_countsByCluster.Count);

            foreach (var entry in _countsByCluster)
            {
                measurements.Add(new Measurement<long>(
                    pick(entry.Value),
                    new KeyValuePair<string, object?>("cluster.id", entry.Key)));
            }

            return measurements;
        }
    }

    // Most recently stored node counts for a single cluster.
    private readonly record struct ClusterGaugeState(int Primary, int Secondary, int Stale);
}
|