metrics(alarms): expose provider-switch count in snapshot, bound the reason tag

B1: add AlarmProviderSwitchCount to GatewayMetricsSnapshot so the switch total is
readable without scraping the OTEL counter.
B2: replace the free-text reason tag on mxgateway.alarms.provider_switches with a
bounded AlarmProviderSwitchReason enum (failover/failback/unknown); the human-readable
reason stays in the structured log.
This commit is contained in:
Joseph Doherty
2026-06-14 02:33:02 -04:00
parent 5b31e99ab6
commit 56abd64c6c
5 changed files with 47 additions and 8 deletions
@@ -399,7 +399,13 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status }); BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status });
} }
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), reason); AlarmProviderSwitchReason switchReason = toMode switch
{
AlarmProviderMode.Subtag => AlarmProviderSwitchReason.Failover,
AlarmProviderMode.Alarmmgr => AlarmProviderSwitchReason.Failback,
_ => AlarmProviderSwitchReason.Unknown,
};
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), switchReason);
_logger.LogInformation( _logger.LogInformation(
"Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}", "Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
@@ -0,0 +1,20 @@
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
/// <summary>
/// Low-cardinality category for an alarm-provider mode switch. Its value is what gets
/// emitted as the <c>reason</c> tag of the <c>mxgateway.alarms.provider_switches</c>
/// counter. Free-text reasons from the worker (for example
/// <c>"primary PollOnce failed"</c>) are kept in the structured log only, so the set of
/// tag values, and therefore the number of time series, stays fixed.
/// </summary>
public enum AlarmProviderSwitchReason
{
    /// <summary>Fallback used when the direction of the switch cannot be determined.</summary>
    Unknown = 0,

    /// <summary>Degraded: the primary (alarmmgr) provider was replaced by the subtag standby.</summary>
    Failover = 1,

    /// <summary>Recovered: the subtag standby handed back to the primary (alarmmgr) provider.</summary>
    Failback = 2,
}
@@ -50,6 +50,7 @@ public sealed class GatewayMetrics : IDisposable
private long _heartbeatFailures; private long _heartbeatFailures;
private long _streamDisconnects; private long _streamDisconnects;
private long _retryAttempts; private long _retryAttempts;
private long _alarmProviderSwitches;
private bool _disposed; private bool _disposed;
/// <summary> /// <summary>
@@ -383,25 +384,34 @@ public sealed class GatewayMetrics : IDisposable
} }
/// <summary> /// <summary>
/// Records that the alarm provider switched modes and updates the current provider mode gauge. /// Records that the alarm provider switched modes, increments the switch count, and updates the
/// current provider mode gauge.
/// </summary> /// </summary>
/// <param name="fromMode">Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown).</param> /// <param name="fromMode">Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="toMode">Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown).</param> /// <param name="toMode">Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="reason">Human-readable reason for the switch.</param> /// <param name="reason">Bounded switch classification used as the counter's <c>reason</c> tag.</param>
public void AlarmProviderSwitched(int fromMode, int toMode, string reason) public void AlarmProviderSwitched(int fromMode, int toMode, AlarmProviderSwitchReason reason)
{ {
lock (_syncRoot) lock (_syncRoot)
{ {
_alarmProviderMode = toMode; _alarmProviderMode = toMode;
_alarmProviderSwitches++;
} }
_alarmProviderSwitchesCounter.Add( _alarmProviderSwitchesCounter.Add(
1, 1,
new KeyValuePair<string, object?>("from", fromMode.ToString(CultureInfo.InvariantCulture)), new KeyValuePair<string, object?>("from", fromMode.ToString(CultureInfo.InvariantCulture)),
new KeyValuePair<string, object?>("to", toMode.ToString(CultureInfo.InvariantCulture)), new KeyValuePair<string, object?>("to", toMode.ToString(CultureInfo.InvariantCulture)),
new KeyValuePair<string, object?>("reason", reason)); new KeyValuePair<string, object?>("reason", ReasonTag(reason)));
} }
/// <summary>
/// Converts a switch classification into the fixed lowercase string used as the
/// counter's <c>reason</c> tag. Any value other than failover or failback yields
/// <c>"unknown"</c>.
/// </summary>
/// <param name="reason">Classification of the provider switch.</param>
/// <returns><c>"failover"</c>, <c>"failback"</c>, or <c>"unknown"</c>.</returns>
private static string ReasonTag(AlarmProviderSwitchReason reason)
{
    switch (reason)
    {
        case AlarmProviderSwitchReason.Failover:
            return "failover";
        case AlarmProviderSwitchReason.Failback:
            return "failback";
        default:
            // Covers Unknown as well as any out-of-range numeric value cast to the enum.
            return "unknown";
    }
}
/// <summary>Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline).</summary> /// <summary>Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline).</summary>
public void SetAlarmProviderMode(int mode) public void SetAlarmProviderMode(int mode)
{ {
@@ -433,6 +443,7 @@ public sealed class GatewayMetrics : IDisposable
HeartbeatFailures: _heartbeatFailures, HeartbeatFailures: _heartbeatFailures,
StreamDisconnects: _streamDisconnects, StreamDisconnects: _streamDisconnects,
RetryAttempts: _retryAttempts, RetryAttempts: _retryAttempts,
AlarmProviderSwitchCount: _alarmProviderSwitches,
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase), CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase), EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal), EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
@@ -18,6 +18,7 @@ public sealed record GatewayMetricsSnapshot(
long HeartbeatFailures, long HeartbeatFailures,
long StreamDisconnects, long StreamDisconnects,
long RetryAttempts, long RetryAttempts,
long AlarmProviderSwitchCount,
IReadOnlyDictionary<string, long> CommandFailuresByMethod, IReadOnlyDictionary<string, long> CommandFailuresByMethod,
IReadOnlyDictionary<string, long> EventsByFamily, IReadOnlyDictionary<string, long> EventsByFamily,
IReadOnlyDictionary<string, long> EventsBySession, IReadOnlyDictionary<string, long> EventsBySession,
@@ -111,12 +111,13 @@ public sealed class GatewayMetricsTests
}); });
listener.Start(); listener.Start();
metrics.AlarmProviderSwitched(1, 2, "test"); metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
Assert.Equal(1, capturedValue); Assert.Equal(1, capturedValue);
Assert.Equal("1", capturedFrom); Assert.Equal("1", capturedFrom);
Assert.Equal("2", capturedTo); Assert.Equal("2", capturedTo);
Assert.Equal("test", capturedReason); Assert.Equal("failover", capturedReason);
Assert.Equal(1, metrics.GetSnapshot().AlarmProviderSwitchCount);
} }
/// <summary> /// <summary>
@@ -150,7 +151,7 @@ public sealed class GatewayMetricsTests
}); });
listener.Start(); listener.Start();
metrics.AlarmProviderSwitched(1, 2, "test"); metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
listener.RecordObservableInstruments(); listener.RecordObservableInstruments();
Assert.Equal(2, capturedMode); Assert.Equal(2, capturedMode);