diff --git a/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs b/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs index fdbfc89..a353d9c 100644 --- a/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs +++ b/src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs @@ -399,7 +399,13 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status }); } - _metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), reason); + AlarmProviderSwitchReason switchReason = toMode switch + { + AlarmProviderMode.Subtag => AlarmProviderSwitchReason.Failover, + AlarmProviderMode.Alarmmgr => AlarmProviderSwitchReason.Failback, + _ => AlarmProviderSwitchReason.Unknown, + }; + _metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), switchReason); _logger.LogInformation( "Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}", diff --git a/src/ZB.MOM.WW.MxGateway.Server/Metrics/AlarmProviderSwitchReason.cs b/src/ZB.MOM.WW.MxGateway.Server/Metrics/AlarmProviderSwitchReason.cs new file mode 100644 index 0000000..da024c8 --- /dev/null +++ b/src/ZB.MOM.WW.MxGateway.Server/Metrics/AlarmProviderSwitchReason.cs @@ -0,0 +1,20 @@ +namespace ZB.MOM.WW.MxGateway.Server.Metrics; + +/// +/// Bounded classification of an alarm-provider switch, used as the low-cardinality +/// reason tag on the mxgateway.alarms.provider_switches counter. The +/// worker supplies a free-text reason (e.g. "primary PollOnce failed") that +/// stays in the structured log; only this bounded value reaches the metric tag so the +/// time series cannot fan out on operation-specific text. +/// +public enum AlarmProviderSwitchReason +{ + /// The switch direction could not be classified. + Unknown = 0, + + /// Switched from the primary (alarmmgr) provider to the subtag standby — degraded. + Failover = 1, + + /// Switched back from the subtag standby to the primary (alarmmgr) provider — recovered. + Failback = 2, +} diff --git a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs index 959ef41..15fad05 100644 --- a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs +++ b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs @@ -50,6 +50,7 @@ public sealed class GatewayMetrics : IDisposable private long _heartbeatFailures; private long _streamDisconnects; private long _retryAttempts; + private long _alarmProviderSwitches; private bool _disposed; /// @@ -383,25 +384,34 @@ public sealed class GatewayMetrics : IDisposable } /// - /// Records that the alarm provider switched modes and updates the current provider mode gauge. + /// Records that the alarm provider switched modes, increments the switch count, and updates the + /// current provider mode gauge. /// /// Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown). /// Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown). - /// Human-readable reason for the switch. - public void AlarmProviderSwitched(int fromMode, int toMode, string reason) + /// Bounded switch classification used as the counter's reason tag. + public void AlarmProviderSwitched(int fromMode, int toMode, AlarmProviderSwitchReason reason) { lock (_syncRoot) { _alarmProviderMode = toMode; + _alarmProviderSwitches++; } _alarmProviderSwitchesCounter.Add( 1, new KeyValuePair("from", fromMode.ToString(CultureInfo.InvariantCulture)), new KeyValuePair("to", toMode.ToString(CultureInfo.InvariantCulture)), - new KeyValuePair("reason", reason)); + new KeyValuePair("reason", ReasonTag(reason))); } + private static string ReasonTag(AlarmProviderSwitchReason reason) => reason switch + { + AlarmProviderSwitchReason.Failover => "failover", + AlarmProviderSwitchReason.Failback => "failback", + _ => "unknown", + }; + /// Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline). public void SetAlarmProviderMode(int mode) { @@ -433,6 +443,7 @@ public sealed class GatewayMetrics : IDisposable HeartbeatFailures: _heartbeatFailures, StreamDisconnects: _streamDisconnects, RetryAttempts: _retryAttempts, + AlarmProviderSwitchCount: _alarmProviderSwitches, CommandFailuresByMethod: new Dictionary(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase), EventsByFamily: new Dictionary(_eventsByFamily, StringComparer.OrdinalIgnoreCase), EventsBySession: new Dictionary(_eventsBySession, StringComparer.Ordinal), diff --git a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs index c96f570..996d44a 100644 --- a/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs +++ b/src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetricsSnapshot.cs @@ -18,6 +18,7 @@ public sealed record GatewayMetricsSnapshot( long HeartbeatFailures, long StreamDisconnects, long RetryAttempts, + long AlarmProviderSwitchCount, IReadOnlyDictionary CommandFailuresByMethod, IReadOnlyDictionary EventsByFamily, IReadOnlyDictionary EventsBySession, diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs index 2ac7ff1..3fa974c 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs @@ -111,12 +111,13 @@ public sealed class GatewayMetricsTests }); listener.Start(); - metrics.AlarmProviderSwitched(1, 2, "test"); + metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover); Assert.Equal(1, capturedValue); Assert.Equal("1", capturedFrom); Assert.Equal("2", capturedTo); - Assert.Equal("test", capturedReason); + Assert.Equal("failover", capturedReason); + Assert.Equal(1, metrics.GetSnapshot().AlarmProviderSwitchCount); } /// @@ -150,7 +151,7 @@ public sealed class GatewayMetricsTests }); listener.Start(); - metrics.AlarmProviderSwitched(1, 2, "test"); + metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover); listener.RecordObservableInstruments(); Assert.Equal(2, capturedMode);