server(alarms): monitor resolves watch-list, sends ForcedMode/failover, reflects provider mode into feed + metrics

This commit is contained in:
Joseph Doherty
2026-06-13 10:20:03 -04:00
parent 3ccf0b5f9e
commit ebf1d95f72
3 changed files with 574 additions and 3 deletions
@@ -13,6 +13,7 @@ public static class AlarmsServiceCollectionExtensions
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddGatewayAlarms(this IServiceCollection services)
{
services.AddSingleton<IAlarmWatchListResolver, AlarmWatchListResolver>();
services.AddSingleton<GatewayAlarmMonitor>();
services.AddSingleton<IGatewayAlarmService>(provider => provider.GetRequiredService<GatewayAlarmMonitor>());
services.AddHostedService(provider => provider.GetRequiredService<GatewayAlarmMonitor>());
@@ -1,7 +1,9 @@
using System.Threading.Channels;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using ZB.MOM.WW.MxGateway.Server.Sessions;
namespace ZB.MOM.WW.MxGateway.Server.Alarms;
@@ -23,6 +25,8 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
// NOTE(review): presumably a grace period applied while the monitor starts;
// the code that consumes it is outside this view — confirm.
private static readonly TimeSpan StartupGrace = TimeSpan.FromSeconds(2);
// Collaborators captured by the constructor (each is null-checked there).
// Used to open the monitor's session and to invoke worker commands on it.
private readonly ISessionManager _sessionManager;
// Resolves the watch-list that is attached to the SubscribeAlarms command.
private readonly IAlarmWatchListResolver _watchListResolver;
// Receives the provider-switch metric when the provider mode changes.
private readonly GatewayMetrics _metrics;
// The Alarms section of the gateway options (fallback mode and failover settings are read from it).
private readonly AlarmsOptions _options;
// Diagnostic logger for this monitor.
private readonly ILogger<GatewayAlarmMonitor> _logger;
@@ -30,20 +34,34 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
// Active-alarm cache, keyed with ordinal string comparison; read under _sync
// when a new subscriber is given its initial snapshot.
// NOTE(review): the key is presumably the alarm's full reference — confirm.
private readonly Dictionary<string, ActiveAlarmSnapshot> _alarms = new(StringComparer.Ordinal);
// Live feed subscribers; entries are added and pruned while holding _sync.
private readonly List<Subscriber> _subscribers = [];
// Current provider status (mode + degraded + reason + since), guarded by _sync.
// Initialized to the alarm-manager, not-degraded baseline so a late joiner sees
// a sensible status even before any OnAlarmProviderModeChanged event arrives.
private AlarmProviderMode _providerMode = AlarmProviderMode.Alarmmgr;
private bool _providerDegraded;
private string _providerReason = string.Empty;
private DateTimeOffset _providerSince = DateTimeOffset.UtcNow;
// Monitor lifecycle state; starts as Disabled and is set to Starting when a
// monitor run begins. Volatile: it is written without taking _sync.
private volatile GatewayAlarmMonitorState _state = GatewayAlarmMonitorState.Disabled;
// NOTE(review): presumably the most recent failure text; its writers are
// outside this view — confirm.
private volatile string? _lastError;
// NOTE(review): presumably the session opened by the current monitor run;
// assignments are outside this view — confirm.
private GatewaySession? _session;
/// <summary>
/// Creates the gateway alarm monitor and stores its collaborators. Every
/// argument is required.
/// </summary>
/// <param name="sessionManager">Gateway session manager.</param>
/// <param name="watchListResolver">Resolver that supplies the subtag-fallback watch-list.</param>
/// <param name="metrics">Gateway metrics sink.</param>
/// <param name="options">Gateway options; only the alarm configuration section is retained.</param>
/// <param name="logger">Diagnostic logger.</param>
/// <exception cref="ArgumentNullException">Thrown when any argument is <see langword="null"/>.</exception>
public GatewayAlarmMonitor(
    ISessionManager sessionManager,
    IAlarmWatchListResolver watchListResolver,
    GatewayMetrics metrics,
    IOptions<GatewayOptions> options,
    ILogger<GatewayAlarmMonitor> logger)
{
    // Validate in declaration order so the first missing argument is the one reported.
    if (sessionManager is null)
    {
        throw new ArgumentNullException(nameof(sessionManager));
    }

    if (watchListResolver is null)
    {
        throw new ArgumentNullException(nameof(watchListResolver));
    }

    if (metrics is null)
    {
        throw new ArgumentNullException(nameof(metrics));
    }

    if (options is null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _sessionManager = sessionManager;
    _watchListResolver = watchListResolver;
    _metrics = metrics;
    _options = options.Value.Alarms;
    _logger = logger;
}
@@ -139,6 +157,16 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
private async Task RunMonitorAsync(string subscription, CancellationToken stoppingToken)
{
_state = GatewayAlarmMonitorState.Starting;
lock (_sync)
{
// Re-baseline the provider status for this lifecycle so a restarted
// monitor advertises alarm-manager/not-degraded until told otherwise.
_providerMode = AlarmProviderMode.Alarmmgr;
_providerDegraded = false;
_providerReason = string.Empty;
_providerSince = DateTimeOffset.UtcNow;
}
GatewaySession session = await _sessionManager.OpenSessionAsync(
new SessionOpenRequest(BackendName, MonitorClientName, Guid.NewGuid().ToString("N"), CommandTimeout: null),
MonitorClientName,
@@ -173,6 +201,15 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
{
ApplyTransition(mxEvent.OnAlarmTransition);
}
else if (mxEvent is { BodyCase: MxEvent.BodyOneofCase.OnAlarmProviderModeChanged }
&& mxEvent.OnAlarmProviderModeChanged is not null)
{
await ApplyProviderModeChangeAsync(
session.SessionId,
mxEvent.OnAlarmProviderModeChanged,
linked.Token)
.ConfigureAwait(false);
}
}
}
finally
@@ -209,6 +246,29 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
private async Task SubscribeAlarmsAsync(string sessionId, string subscription, CancellationToken cancellationToken)
{
IReadOnlyList<AlarmSubtagTarget> watchList = await _watchListResolver
.ResolveAsync(_options, cancellationToken)
.ConfigureAwait(false);
AlarmProviderMode forcedMode = MapForcedMode(_options.Fallback.Mode);
// When the forced mode is Unspecified (the "Auto" case) and the resolved
// watch-list is empty — the common alarmmgr-only deployment — the command
// is identical-in-effect to the historical SubscribeAlarms (wnwrap only):
// the worker builds the wnwrap consumer and no subtag watch-list.
SubscribeAlarmsCommand command = new()
{
SubscriptionExpression = subscription,
ForcedMode = forcedMode,
Failover = new AlarmFailoverConfig
{
ConsecutiveFailureThreshold = _options.Fallback.ConsecutiveFailureThreshold,
FailbackProbeIntervalSeconds = _options.Fallback.FailbackProbeIntervalSeconds,
FailbackStableProbes = _options.Fallback.FailbackStableProbes,
},
};
command.WatchList.AddRange(watchList);
WorkerCommandReply reply = await _sessionManager.InvokeAsync(
sessionId,
new WorkerCommand
@@ -216,7 +276,7 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
Command = new MxCommand
{
Kind = MxCommandKind.SubscribeAlarms,
SubscribeAlarms = new SubscribeAlarmsCommand { SubscriptionExpression = subscription },
SubscribeAlarms = command,
},
},
cancellationToken)
@@ -310,6 +370,94 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
}
}
// Reacts to a provider-mode-change event from the worker. Under _sync it
// records the new mode/degraded/reason/since values and pushes the resulting
// status to every subscriber (provider status is global, not alarm-scoped).
// Outside the lock it records the switch metric, logs the change, and then
// runs a reconcile for the session. A reconcile failure other than
// cancellation is logged and swallowed; cancellation propagates to the caller.
private async Task ApplyProviderModeChangeAsync(
    string sessionId,
    OnAlarmProviderModeChangedEvent change,
    CancellationToken cancellationToken)
{
    AlarmProviderMode newMode = change.Mode;
    string newReason = change.Reason ?? string.Empty;

    int previousModeCode;
    AlarmProviderStatus published;

    lock (_sync)
    {
        // Capture the outgoing mode before it is overwritten; the metric needs both ends.
        previousModeCode = ModeToInt(_providerMode);

        _providerMode = newMode;
        _providerDegraded = newMode == AlarmProviderMode.Subtag;
        _providerReason = newReason;
        _providerSince = DateTimeOffset.UtcNow;

        // Snapshot and broadcast inside the same critical section that updated the fields.
        published = BuildProviderStatus();
        BroadcastToAll(new AlarmFeedMessage { ProviderStatus = published });
    }

    _metrics.AlarmProviderSwitched(previousModeCode, ModeToInt(newMode), newReason);
    _logger.LogInformation(
        "Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
        newMode,
        published.Degraded,
        newReason);

    try
    {
        await ReconcileAsync(sessionId, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception exception) when (exception is not OperationCanceledException)
    {
        // Best effort: the existing cache stays in place when the reconcile fails.
        _logger.LogDebug(
            exception,
            "Reconcile after alarm provider mode change failed; keeping the current cache.");
    }
}
// Caller holds _sync. Copies the current provider fields (mode, degraded flag,
// reason, and the since-time converted to a protobuf Timestamp) into a new
// AlarmProviderStatus message.
private AlarmProviderStatus BuildProviderStatus() => new()
{
    Mode = _providerMode,
    Degraded = _providerDegraded,
    Reason = _providerReason,
    Since = Timestamp.FromDateTimeOffset(_providerSince),
};
// Translates the configured fallback mode string into the forced provider mode
// sent to the worker. Matching ignores case. "ForceAlarmManager" maps to
// Alarmmgr and "ForceSubtag" maps to Subtag; every other value, including null,
// maps to Unspecified.
private static AlarmProviderMode MapForcedMode(string? mode)
{
    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;

    return string.Equals(mode, "ForceAlarmManager", ignoreCase)
        ? AlarmProviderMode.Alarmmgr
        : string.Equals(mode, "ForceSubtag", ignoreCase)
            ? AlarmProviderMode.Subtag
            : AlarmProviderMode.Unspecified;
}
// Converts the provider-mode enum into the integer code passed to the metric:
// Alarmmgr is 1, Subtag is 2, and any other value is 0.
private static int ModeToInt(AlarmProviderMode mode)
{
    switch (mode)
    {
        case AlarmProviderMode.Alarmmgr:
            return 1;
        case AlarmProviderMode.Subtag:
            return 2;
        default:
            return 0;
    }
}
// Replaces the cache with the worker's authoritative snapshot, broadcasting
// a synthetic transition for any alarm the live stream missed.
private void ApplyReconcile(IEnumerable<ActiveAlarmSnapshot> snapshots)
@@ -374,6 +522,23 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
}
}
// Caller holds _sync. Offers the message to every subscriber, ignoring each
// subscriber's alarm-filter prefix; used for provider-status messages, which
// are global rather than tied to one alarm reference. A subscriber whose
// channel rejects the write is completed with an error and removed.
private void BroadcastToAll(AlarmFeedMessage message)
{
    // Walk from the end so RemoveAt never shifts an entry that is still to be visited.
    int position = _subscribers.Count;
    while (--position >= 0)
    {
        var writer = _subscribers[position].Channel.Writer;
        if (writer.TryWrite(message))
        {
            continue;
        }

        writer.TryComplete(new InvalidOperationException(
            "Alarm feed subscriber fell behind and was dropped; reconnect to re-snapshot."));
        _subscribers.RemoveAt(position);
    }
}
private void ClearCache()
{
lock (_sync)
@@ -398,11 +563,14 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
Subscriber subscriber = new(channel, prefix);
ActiveAlarmSnapshot[] snapshot;
AlarmProviderStatus providerStatus;
lock (_sync)
{
// Register before snapshotting under the same lock so no transition
// can slip between the snapshot and the live stream.
// Register before snapshotting under the same lock so neither a
// transition nor a provider-mode change can slip between the snapshot
// and the live stream.
_subscribers.Add(subscriber);
providerStatus = BuildProviderStatus();
snapshot = _alarms.Values
.Where(alarm => prefix.Length == 0
|| alarm.AlarmFullReference.StartsWith(prefix, StringComparison.Ordinal))
@@ -412,6 +580,10 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
try
{
// Emit the current provider status first so a late joiner immediately
// learns the mode (and whether the feed is degraded) before any alarms.
yield return new AlarmFeedMessage { ProviderStatus = providerStatus };
foreach (ActiveAlarmSnapshot alarm in snapshot)
{
yield return new AlarmFeedMessage { ActiveAlarm = alarm };