metrics(alarms): expose provider-switch count in snapshot, bound the reason tag
B1: add AlarmProviderSwitchCount to GatewayMetricsSnapshot so the switch total is readable without scraping the OTEL counter. B2: replace the free-text reason tag on mxgateway.alarms.provider_switches with a bounded AlarmProviderSwitchReason enum (failover/failback/unknown); the human-readable reason stays in the structured log.
This commit is contained in:
@@ -399,7 +399,13 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
|
||||
BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status });
|
||||
}
|
||||
|
||||
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), reason);
|
||||
AlarmProviderSwitchReason switchReason = toMode switch
|
||||
{
|
||||
AlarmProviderMode.Subtag => AlarmProviderSwitchReason.Failover,
|
||||
AlarmProviderMode.Alarmmgr => AlarmProviderSwitchReason.Failback,
|
||||
_ => AlarmProviderSwitchReason.Unknown,
|
||||
};
|
||||
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), switchReason);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
|
||||
|
||||
/// <summary>
/// Low-cardinality classification of an alarm-provider switch. This value is what is
/// emitted as the <c>reason</c> tag on the <c>mxgateway.alarms.provider_switches</c>
/// counter. The free-text reason supplied by the worker (e.g.
/// <c>"primary PollOnce failed"</c>) stays in the structured log, so operation-specific
/// text can never fan the time series out.
/// </summary>
public enum AlarmProviderSwitchReason
{
    /// <summary>The direction of the switch could not be classified.</summary>
    Unknown = 0,

    /// <summary>Degraded: switched from the primary (alarmmgr) provider to the subtag standby.</summary>
    Failover = 1,

    /// <summary>Recovered: switched back from the subtag standby to the primary (alarmmgr) provider.</summary>
    Failback = 2,
}
|
||||
@@ -50,6 +50,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
private long _heartbeatFailures;
|
||||
private long _streamDisconnects;
|
||||
private long _retryAttempts;
|
||||
private long _alarmProviderSwitches;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>
|
||||
@@ -383,25 +384,34 @@ public sealed class GatewayMetrics : IDisposable
|
||||
}
|
||||
|
||||
/// <summary>
/// Records that the alarm provider switched modes, increments the switch count, and updates the
/// current provider mode gauge.
/// </summary>
/// <param name="fromMode">Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="toMode">Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="reason">Bounded switch classification used as the counter's <c>reason</c> tag.</param>
public void AlarmProviderSwitched(int fromMode, int toMode, AlarmProviderSwitchReason reason)
{
    // Update the stored mode and the running switch total together under the same lock.
    lock (_syncRoot)
    {
        _alarmProviderMode = toMode;
        _alarmProviderSwitches++;
    }

    // The counter measurement is recorded after the lock is released. Mode tags are
    // formatted with the invariant culture, and the reason tag is limited to the fixed
    // strings produced by ReasonTag rather than free text.
    _alarmProviderSwitchesCounter.Add(
        1,
        new KeyValuePair<string, object?>("from", fromMode.ToString(CultureInfo.InvariantCulture)),
        new KeyValuePair<string, object?>("to", toMode.ToString(CultureInfo.InvariantCulture)),
        new KeyValuePair<string, object?>("reason", ReasonTag(reason)));
}
|
||||
|
||||
/// <summary>
/// Maps a switch classification to the lowercase string used as the counter's
/// <c>reason</c> tag.
/// </summary>
/// <param name="reason">Classification to convert.</param>
/// <returns>
/// <c>"failover"</c> or <c>"failback"</c> for the matching members; <c>"unknown"</c> for
/// every other value, so the tag can only ever take one of three strings.
/// </returns>
private static string ReasonTag(AlarmProviderSwitchReason reason) => reason switch
{
    AlarmProviderSwitchReason.Failover => "failover",
    AlarmProviderSwitchReason.Failback => "failback",
    _ => "unknown",
};
|
||||
|
||||
/// <summary>Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline).</summary>
|
||||
public void SetAlarmProviderMode(int mode)
|
||||
{
|
||||
@@ -433,6 +443,7 @@ public sealed class GatewayMetrics : IDisposable
|
||||
HeartbeatFailures: _heartbeatFailures,
|
||||
StreamDisconnects: _streamDisconnects,
|
||||
RetryAttempts: _retryAttempts,
|
||||
AlarmProviderSwitchCount: _alarmProviderSwitches,
|
||||
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
|
||||
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
|
||||
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
|
||||
|
||||
@@ -18,6 +18,7 @@ public sealed record GatewayMetricsSnapshot(
|
||||
long HeartbeatFailures,
|
||||
long StreamDisconnects,
|
||||
long RetryAttempts,
|
||||
long AlarmProviderSwitchCount,
|
||||
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
|
||||
IReadOnlyDictionary<string, long> EventsByFamily,
|
||||
IReadOnlyDictionary<string, long> EventsBySession,
|
||||
|
||||
@@ -111,12 +111,13 @@ public sealed class GatewayMetricsTests
|
||||
});
|
||||
listener.Start();
|
||||
|
||||
metrics.AlarmProviderSwitched(1, 2, "test");
|
||||
metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
|
||||
|
||||
Assert.Equal(1, capturedValue);
|
||||
Assert.Equal("1", capturedFrom);
|
||||
Assert.Equal("2", capturedTo);
|
||||
Assert.Equal("test", capturedReason);
|
||||
Assert.Equal("failover", capturedReason);
|
||||
Assert.Equal(1, metrics.GetSnapshot().AlarmProviderSwitchCount);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -150,7 +151,7 @@ public sealed class GatewayMetricsTests
|
||||
});
|
||||
listener.Start();
|
||||
|
||||
metrics.AlarmProviderSwitched(1, 2, "test");
|
||||
metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
|
||||
listener.RecordObservableInstruments();
|
||||
|
||||
Assert.Equal(2, capturedMode);
|
||||
|
||||
Reference in New Issue
Block a user