metrics(alarms): expose provider-switch count in snapshot, bound the reason tag

B1: add AlarmProviderSwitchCount to GatewayMetricsSnapshot so the switch total is
readable without scraping the OTEL counter.
B2: replace the free-text reason tag on mxgateway.alarms.provider_switches with a
bounded AlarmProviderSwitchReason enum (failover/failback/unknown); the human-readable
reason stays in the structured log.
This commit is contained in:
Joseph Doherty
2026-06-14 02:33:02 -04:00
parent 5b31e99ab6
commit 56abd64c6c
5 changed files with 47 additions and 8 deletions
@@ -399,7 +399,13 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status });
}
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), reason);
AlarmProviderSwitchReason switchReason = toMode switch
{
AlarmProviderMode.Subtag => AlarmProviderSwitchReason.Failover,
AlarmProviderMode.Alarmmgr => AlarmProviderSwitchReason.Failback,
_ => AlarmProviderSwitchReason.Unknown,
};
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), switchReason);
_logger.LogInformation(
"Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
@@ -0,0 +1,20 @@
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
/// <summary>
/// Bounded classification of an alarm-provider switch, used as the low-cardinality
/// <c>reason</c> tag on the <c>mxgateway.alarms.provider_switches</c> counter. The
/// worker supplies a free-text reason (e.g. <c>"primary PollOnce failed"</c>) that
/// stays in the structured log; only this bounded value reaches the metric tag so the
/// time series cannot fan out on operation-specific text.
/// </summary>
/// <remarks>
/// The metrics layer renders each member as a fixed lowercase tag value
/// (<c>"unknown"</c>, <c>"failover"</c>, <c>"failback"</c>); any value that is not
/// <see cref="Failover"/> or <see cref="Failback"/> is reported as <c>"unknown"</c>.
/// </remarks>
public enum AlarmProviderSwitchReason
{
/// <summary>The switch direction could not be classified. Reported with the tag value <c>"unknown"</c>.</summary>
Unknown = 0,
/// <summary>
/// Switched from the primary (alarmmgr) provider to the subtag standby — degraded.
/// Reported with the tag value <c>"failover"</c>.
/// </summary>
Failover = 1,
/// <summary>
/// Switched back from the subtag standby to the primary (alarmmgr) provider — recovered.
/// Reported with the tag value <c>"failback"</c>.
/// </summary>
Failback = 2,
}
@@ -50,6 +50,7 @@ public sealed class GatewayMetrics : IDisposable
private long _heartbeatFailures;
private long _streamDisconnects;
private long _retryAttempts;
private long _alarmProviderSwitches;
private bool _disposed;
/// <summary>
@@ -383,25 +384,34 @@ public sealed class GatewayMetrics : IDisposable
}
/// <summary>
/// Records that the alarm provider switched modes and updates the current provider mode gauge.
/// Records that the alarm provider switched modes, increments the switch count, and updates the
/// current provider mode gauge.
/// </summary>
/// <param name="fromMode">Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="toMode">Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="reason">Human-readable reason for the switch.</param>
public void AlarmProviderSwitched(int fromMode, int toMode, string reason)
/// <param name="reason">Bounded switch classification used as the counter's <c>reason</c> tag.</param>
public void AlarmProviderSwitched(int fromMode, int toMode, AlarmProviderSwitchReason reason)
{
// The current-mode value and the running switch total are updated together under _syncRoot.
lock (_syncRoot)
{
_alarmProviderMode = toMode;
_alarmProviderSwitches++;
}
// The counter measurement is emitted after the lock is released; the from/to modes are
// tagged as invariant-culture strings alongside the reason tag.
_alarmProviderSwitchesCounter.Add(
1,
new KeyValuePair<string, object?>("from", fromMode.ToString(CultureInfo.InvariantCulture)),
new KeyValuePair<string, object?>("to", toMode.ToString(CultureInfo.InvariantCulture)),
new KeyValuePair<string, object?>("reason", reason));
new KeyValuePair<string, object?>("reason", ReasonTag(reason)));
}
/// <summary>
/// Converts a switch classification into the fixed lowercase string emitted as the
/// counter's <c>reason</c> tag.
/// </summary>
/// <param name="reason">Switch classification to render.</param>
/// <returns>
/// <c>"failover"</c> or <c>"failback"</c> for the matching members; <c>"unknown"</c> for
/// every other value.
/// </returns>
private static string ReasonTag(AlarmProviderSwitchReason reason)
{
    switch (reason)
    {
        case AlarmProviderSwitchReason.Failover:
            return "failover";

        case AlarmProviderSwitchReason.Failback:
            return "failback";

        default:
            // Covers Unknown as well as any value outside the declared members.
            return "unknown";
    }
}
/// <summary>Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline).</summary>
public void SetAlarmProviderMode(int mode)
{
@@ -433,6 +443,7 @@ public sealed class GatewayMetrics : IDisposable
HeartbeatFailures: _heartbeatFailures,
StreamDisconnects: _streamDisconnects,
RetryAttempts: _retryAttempts,
AlarmProviderSwitchCount: _alarmProviderSwitches,
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
@@ -18,6 +18,7 @@ public sealed record GatewayMetricsSnapshot(
long HeartbeatFailures,
long StreamDisconnects,
long RetryAttempts,
long AlarmProviderSwitchCount,
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
IReadOnlyDictionary<string, long> EventsByFamily,
IReadOnlyDictionary<string, long> EventsBySession,
@@ -111,12 +111,13 @@ public sealed class GatewayMetricsTests
});
listener.Start();
metrics.AlarmProviderSwitched(1, 2, "test");
metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
Assert.Equal(1, capturedValue);
Assert.Equal("1", capturedFrom);
Assert.Equal("2", capturedTo);
Assert.Equal("test", capturedReason);
Assert.Equal("failover", capturedReason);
Assert.Equal(1, metrics.GetSnapshot().AlarmProviderSwitchCount);
}
/// <summary>
@@ -150,7 +151,7 @@ public sealed class GatewayMetricsTests
});
listener.Start();
metrics.AlarmProviderSwitched(1, 2, "test");
metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
listener.RecordObservableInstruments();
Assert.Equal(2, capturedMode);