diff --git a/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs b/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs
index fdbfc89..a353d9c 100644
--- a/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs
+++ b/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs
@@ -399,7 +399,13 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status });
}
- _metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), reason);
+ AlarmProviderSwitchReason switchReason = toMode switch
+ {
+ AlarmProviderMode.Subtag => AlarmProviderSwitchReason.Failover,
+ AlarmProviderMode.Alarmmgr => AlarmProviderSwitchReason.Failback,
+ _ => AlarmProviderSwitchReason.Unknown,
+ };
+ _metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), switchReason);
_logger.LogInformation(
"Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
diff --git a/src/ZB.MOM.WW.MxGateway.Server/Metrics/AlarmProviderSwitchReason.cs b/src/ZB.MOM.WW.MxGateway.Server/Metrics/AlarmProviderSwitchReason.cs
new file mode 100644
index 0000000..da024c8
--- /dev/null
+++ b/src/ZB.MOM.WW.MxGateway.Server/Metrics/AlarmProviderSwitchReason.cs
@@ -0,0 +1,20 @@
+namespace ZB.MOM.WW.MxGateway.Server.Metrics;
+
+/// <summary>
+/// Bounded classification of an alarm-provider switch, used as the low-cardinality
+/// <c>reason</c> tag on the <c>mxgateway.alarms.provider_switches</c> counter. The
+/// worker supplies a free-text reason (e.g. "primary PollOnce failed") that
+/// stays in the structured log; only this bounded value reaches the metric tag so the
+/// time series cannot fan out on operation-specific text.
+/// </summary>
+public enum AlarmProviderSwitchReason
+{
+ /// <summary>The switch direction could not be classified.</summary>
+ Unknown = 0,
+
+ /// <summary>Switched from the primary (alarmmgr) provider to the subtag standby — degraded.</summary>
+ Failover = 1,
+
+ /// <summary>Switched back from the subtag standby to the primary (alarmmgr) provider — recovered.</summary>
+ Failback = 2,
+}
diff --git a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs
index 959ef41..15fad05 100644
--- a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs
+++ b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs
@@ -50,6 +50,7 @@ public sealed class GatewayMetrics : IDisposable
private long _heartbeatFailures;
private long _streamDisconnects;
private long _retryAttempts;
+ private long _alarmProviderSwitches;
private bool _disposed;
///
@@ -383,25 +384,34 @@ public sealed class GatewayMetrics : IDisposable
}
/// <summary>
- /// Records that the alarm provider switched modes and updates the current provider mode gauge.
+ /// Records that the alarm provider switched modes, increments the switch count, and updates the
+ /// current provider mode gauge.
/// </summary>
/// <param name="fromMode">Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="toMode">Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
- /// <param name="reason">Human-readable reason for the switch.</param>
- public void AlarmProviderSwitched(int fromMode, int toMode, string reason)
+ /// <param name="reason">Bounded switch classification used as the counter's reason tag.</param>
+ public void AlarmProviderSwitched(int fromMode, int toMode, AlarmProviderSwitchReason reason)
{
lock (_syncRoot)
{
_alarmProviderMode = toMode;
+ _alarmProviderSwitches++;
}
_alarmProviderSwitchesCounter.Add(
1,
new KeyValuePair("from", fromMode.ToString(CultureInfo.InvariantCulture)),
new KeyValuePair("to", toMode.ToString(CultureInfo.InvariantCulture)),
- new KeyValuePair("reason", reason));
+ new KeyValuePair("reason", ReasonTag(reason)));
}
+ private static string ReasonTag(AlarmProviderSwitchReason reason) => reason switch // enum value -> text of the "reason" tag
+ {
+ AlarmProviderSwitchReason.Failover => "failover",
+ AlarmProviderSwitchReason.Failback => "failback",
+ _ => "unknown", // Unknown and any undefined enum value collapse to one tag value
+ };
+
/// <summary>Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline).</summary>
public void SetAlarmProviderMode(int mode)
{
@@ -433,6 +443,7 @@ public sealed class GatewayMetrics : IDisposable
HeartbeatFailures: _heartbeatFailures,
StreamDisconnects: _streamDisconnects,
RetryAttempts: _retryAttempts,
+ AlarmProviderSwitchCount: _alarmProviderSwitches,
CommandFailuresByMethod: new Dictionary(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
EventsByFamily: new Dictionary(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
EventsBySession: new Dictionary(_eventsBySession, StringComparer.Ordinal),
diff --git a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs
index c96f570..996d44a 100644
--- a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs
+++ b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs
@@ -18,6 +18,7 @@ public sealed record GatewayMetricsSnapshot(
long HeartbeatFailures,
long StreamDisconnects,
long RetryAttempts,
+ long AlarmProviderSwitchCount,
IReadOnlyDictionary CommandFailuresByMethod,
IReadOnlyDictionary EventsByFamily,
IReadOnlyDictionary EventsBySession,
diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs
index 2ac7ff1..3fa974c 100644
--- a/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs
+++ b/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs
@@ -111,12 +111,13 @@ public sealed class GatewayMetricsTests
});
listener.Start();
- metrics.AlarmProviderSwitched(1, 2, "test");
+ metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
Assert.Equal(1, capturedValue);
Assert.Equal("1", capturedFrom);
Assert.Equal("2", capturedTo);
- Assert.Equal("test", capturedReason);
+ Assert.Equal("failover", capturedReason);
+ Assert.Equal(1, metrics.GetSnapshot().AlarmProviderSwitchCount);
}
///
@@ -150,7 +151,7 @@ public sealed class GatewayMetricsTests
});
listener.Start();
- metrics.AlarmProviderSwitched(1, 2, "test");
+ metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
listener.RecordObservableInstruments();
Assert.Equal(2, capturedMode);