metrics(alarms): expose provider-switch count in snapshot, bound the reason tag
B1: add AlarmProviderSwitchCount to GatewayMetricsSnapshot so the switch total is readable without scraping the OTEL counter. B2: replace the free-text reason tag on mxgateway.alarms.provider_switches with a bounded AlarmProviderSwitchReason enum (failover/failback/unknown); the human-readable reason stays in the structured log.
This commit is contained in:
@@ -399,7 +399,13 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
|
|||||||
BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status });
|
BroadcastToAll(new AlarmFeedMessage { ProviderStatus = status });
|
||||||
}
|
}
|
||||||
|
|
||||||
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), reason);
|
AlarmProviderSwitchReason switchReason = toMode switch
|
||||||
|
{
|
||||||
|
AlarmProviderMode.Subtag => AlarmProviderSwitchReason.Failover,
|
||||||
|
AlarmProviderMode.Alarmmgr => AlarmProviderSwitchReason.Failback,
|
||||||
|
_ => AlarmProviderSwitchReason.Unknown,
|
||||||
|
};
|
||||||
|
_metrics.AlarmProviderSwitched(fromModeInt, ModeToInt(toMode), switchReason);
|
||||||
|
|
||||||
_logger.LogInformation(
|
_logger.LogInformation(
|
||||||
"Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
|
"Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
|
||||||
|
|||||||
@@ -0,0 +1,20 @@
|
|||||||
|
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
|
||||||
|
|
||||||
|
/// <summary>
/// Closed set of categories describing an alarm-provider switch. The value is emitted as
/// the <c>reason</c> tag of the <c>mxgateway.alarms.provider_switches</c> counter, which
/// keeps that tag low-cardinality.
/// </summary>
/// <remarks>
/// The free-text reason supplied by the worker (for example <c>"primary PollOnce failed"</c>)
/// is written to the structured log only. It never becomes a metric tag, so
/// operation-specific text cannot multiply the number of time series.
/// </remarks>
public enum AlarmProviderSwitchReason
{
    /// <summary>Direction of the switch could not be determined.</summary>
    Unknown = 0,

    /// <summary>The primary (alarmmgr) provider was left for the subtag standby; the gateway is degraded.</summary>
    Failover = 1,

    /// <summary>The subtag standby was left and the primary (alarmmgr) provider resumed; the gateway has recovered.</summary>
    Failback = 2,
}
|
||||||
@@ -50,6 +50,7 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
private long _heartbeatFailures;
|
private long _heartbeatFailures;
|
||||||
private long _streamDisconnects;
|
private long _streamDisconnects;
|
||||||
private long _retryAttempts;
|
private long _retryAttempts;
|
||||||
|
private long _alarmProviderSwitches;
|
||||||
private bool _disposed;
|
private bool _disposed;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -383,25 +384,34 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Records that the alarm provider switched modes and updates the current provider mode gauge.
|
/// Records that the alarm provider switched modes, increments the switch count, and updates the
|
||||||
|
/// current provider mode gauge.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="fromMode">Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
|
/// <param name="fromMode">Provider mode before the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
|
||||||
/// <param name="toMode">Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
|
/// <param name="toMode">Provider mode after the switch (1=alarmmgr, 2=subtag, 0=unknown).</param>
|
||||||
/// <param name="reason">Human-readable reason for the switch.</param>
|
/// <param name="reason">Bounded switch classification used as the counter's <c>reason</c> tag.</param>
|
||||||
public void AlarmProviderSwitched(int fromMode, int toMode, string reason)
|
public void AlarmProviderSwitched(int fromMode, int toMode, AlarmProviderSwitchReason reason)
|
||||||
{
|
{
|
||||||
lock (_syncRoot)
|
lock (_syncRoot)
|
||||||
{
|
{
|
||||||
_alarmProviderMode = toMode;
|
_alarmProviderMode = toMode;
|
||||||
|
_alarmProviderSwitches++;
|
||||||
}
|
}
|
||||||
|
|
||||||
_alarmProviderSwitchesCounter.Add(
|
_alarmProviderSwitchesCounter.Add(
|
||||||
1,
|
1,
|
||||||
new KeyValuePair<string, object?>("from", fromMode.ToString(CultureInfo.InvariantCulture)),
|
new KeyValuePair<string, object?>("from", fromMode.ToString(CultureInfo.InvariantCulture)),
|
||||||
new KeyValuePair<string, object?>("to", toMode.ToString(CultureInfo.InvariantCulture)),
|
new KeyValuePair<string, object?>("to", toMode.ToString(CultureInfo.InvariantCulture)),
|
||||||
new KeyValuePair<string, object?>("reason", reason));
|
new KeyValuePair<string, object?>("reason", ReasonTag(reason)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Converts a switch classification into the lowercase string emitted as the counter's
/// <c>reason</c> tag.
/// </summary>
/// <param name="reason">Bounded switch classification to translate.</param>
/// <returns>
/// <c>"failover"</c> or <c>"failback"</c> for the matching members; <c>"unknown"</c> for
/// every other value, including values outside the declared enum members.
/// </returns>
private static string ReasonTag(AlarmProviderSwitchReason reason)
{
    switch (reason)
    {
        case AlarmProviderSwitchReason.Failover:
            return "failover";

        case AlarmProviderSwitchReason.Failback:
            return "failback";

        default:
            // Catch-all keeps the tag bounded even if an undeclared value is passed in.
            return "unknown";
    }
}
|
||||||
|
|
||||||
/// <summary>Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline).</summary>
|
/// <summary>Sets the current alarm provider-mode gauge without recording a switch (e.g. startup baseline).</summary>
|
||||||
public void SetAlarmProviderMode(int mode)
|
public void SetAlarmProviderMode(int mode)
|
||||||
{
|
{
|
||||||
@@ -433,6 +443,7 @@ public sealed class GatewayMetrics : IDisposable
|
|||||||
HeartbeatFailures: _heartbeatFailures,
|
HeartbeatFailures: _heartbeatFailures,
|
||||||
StreamDisconnects: _streamDisconnects,
|
StreamDisconnects: _streamDisconnects,
|
||||||
RetryAttempts: _retryAttempts,
|
RetryAttempts: _retryAttempts,
|
||||||
|
AlarmProviderSwitchCount: _alarmProviderSwitches,
|
||||||
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
|
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
|
||||||
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
|
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
|
||||||
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
|
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ public sealed record GatewayMetricsSnapshot(
|
|||||||
long HeartbeatFailures,
|
long HeartbeatFailures,
|
||||||
long StreamDisconnects,
|
long StreamDisconnects,
|
||||||
long RetryAttempts,
|
long RetryAttempts,
|
||||||
|
long AlarmProviderSwitchCount,
|
||||||
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
|
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
|
||||||
IReadOnlyDictionary<string, long> EventsByFamily,
|
IReadOnlyDictionary<string, long> EventsByFamily,
|
||||||
IReadOnlyDictionary<string, long> EventsBySession,
|
IReadOnlyDictionary<string, long> EventsBySession,
|
||||||
|
|||||||
@@ -111,12 +111,13 @@ public sealed class GatewayMetricsTests
|
|||||||
});
|
});
|
||||||
listener.Start();
|
listener.Start();
|
||||||
|
|
||||||
metrics.AlarmProviderSwitched(1, 2, "test");
|
metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
|
||||||
|
|
||||||
Assert.Equal(1, capturedValue);
|
Assert.Equal(1, capturedValue);
|
||||||
Assert.Equal("1", capturedFrom);
|
Assert.Equal("1", capturedFrom);
|
||||||
Assert.Equal("2", capturedTo);
|
Assert.Equal("2", capturedTo);
|
||||||
Assert.Equal("test", capturedReason);
|
Assert.Equal("failover", capturedReason);
|
||||||
|
Assert.Equal(1, metrics.GetSnapshot().AlarmProviderSwitchCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -150,7 +151,7 @@ public sealed class GatewayMetricsTests
|
|||||||
});
|
});
|
||||||
listener.Start();
|
listener.Start();
|
||||||
|
|
||||||
metrics.AlarmProviderSwitched(1, 2, "test");
|
metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
|
||||||
listener.RecordObservableInstruments();
|
listener.RecordObservableInstruments();
|
||||||
|
|
||||||
Assert.Equal(2, capturedMode);
|
Assert.Equal(2, capturedMode);
|
||||||
|
|||||||
Reference in New Issue
Block a user