diff --git a/code-reviews/Server/findings.md b/code-reviews/Server/findings.md index db1c4a4..4c6c11b 100644 --- a/code-reviews/Server/findings.md +++ b/code-reviews/Server/findings.md @@ -4,8 +4,8 @@ |---|---| | Module | `src/ZB.MOM.WW.MxGateway.Server` | | Reviewer | Claude Code | -| Review date | 2026-05-24 | -| Commit reviewed | `42b0037` | +| Review date | 2026-06-15 | +| Commit reviewed | `410acc9` | | Status | Re-reviewed | | Open findings | 0 | @@ -120,6 +120,38 @@ contention nor the bounded `_events` channel saw any changes in this wave. | 9 | Testing coverage | No issues found in this module — see Tests-026 in the Tests module for the missing EventsHub broadcast coverage. | | 10 | Documentation & comments | Issues found: Server-040, Server-043 (both documentation gaps). | +### 2026-06-15 re-review (commit 410acc9) + +Re-review pass at `410acc9` over the `42b0037..HEAD` diff. The diff is large (~137 files) +but the bulk is vendored theme/CSS/font asset swaps (`wwwroot`), generated code, and the +shared-library auth refactor / TLS cert-autogen / lazy-browse / canonical-audit waves that +each carry their own design+plan and were verified in passing only. This pass is scoped to +the **alarm-provider subtag-fallback** wave the task called out: the central +`GatewayAlarmMonitor` provider-mode seeding + failover/failback handling, the new +`AlarmWatchListResolver` / `IAlarmWatchListResolver`, `AlarmFallbackOptions` / +`AlarmDiscoveryOptions` / `AlarmSubtagNameOptions` and their `GatewayOptionsValidator` +wiring, the `DashboardAlarmProviderStatus` badge + `AlarmsPage.razor` hub attach, the +provider-mode gauge + `provider_switches` counter (`GatewayMetrics`, +`AlarmProviderSwitchReason`), the Galaxy alarm-attribute discovery query +(`GalaxyRepository.GetAlarmAttributesAsync` / `AlarmAttributesSql` / `GalaxyAlarmAttributeRow`), +the `/auth/login` POST move + configurable `Dashboard:CookieName`, and the +`BrowseChildrenRequest` scope-resolver entry. Prior findings Server-044 through Server-050 +are confirmed resolved by the SessionManager/GatewaySession changes in range and remain +closed. New findings filed against this pass: Server-051..053. + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | Issues found: Server-051 (`AlarmWatchListResolver.ResolveAsync`'s broad `catch (Exception)` swallows `OperationCanceledException`, contradicting the `IAlarmWatchListResolver` cancellation contract). | +| 2 | mxaccessgw conventions | No issues found — file-scoped namespaces, `sealed`, `Async` suffix, Options pattern, MXAccess-aligned naming all hold; no UI component libraries (badge is Bootstrap-only); the alarm SQL is a parameterless constant; no secret/tag-value logging; well-known reason strings centralised in `AlarmProviderReasons`. | +| 3 | Concurrency & thread safety | No issues found — `_providerMode`/`_providerDegraded`/`_providerReason`/`_providerSince` are read/written only under `_sync`; `BroadcastToAll` runs under `_sync`; the reconcile after a mode change is intentionally awaited outside `_sync` to avoid the documented self-deadlock; the provider-mode gauge is serialized on `GatewayMetrics._syncRoot`. | +| 4 | Error handling & resilience | Issues found: Server-051 (cancellation swallowed in the resolver — also an error-handling/contract concern). | +| 5 | Security | No issues found — `BrowseChildren` runs the same `ResolveBrowseSubtrees()` constraint scoping and `MetadataRead` scope as `DiscoverHierarchy`; the configurable `Dashboard:CookieName` falls back to the canonical default and cannot be blanked; the `/auth/login` POST keeps antiforgery + return-URL sanitisation. | +| 6 | Performance & resource management | No issues found in the alarm-fallback code — discovery is a one-shot per subscribe lifecycle; the watch-list is composed once. | +| 7 | Design-document adherence | No issues found — `docs/GatewayConfiguration.md`, `docs/Metrics.md`, `docs/GalaxyRepository.md`, and the `docs/plans/2026-06-13-alarm-subtag-fallback*` / `2026-06-15-forced-subtag-mode-fix.md` plans were landed in the same range and match the code. | +| 8 | Code organization & conventions | No issues found — new alarm types live under `Alarms/`, options under `Configuration/`, metric helper under `Metrics/`, registered via `AddGatewayAlarms`. | +| 9 | Testing coverage | Issues found: Server-053 (`AlarmWatchListResolver` `ExcludeAttributes`-vs-`IncludeAttributes` precedence and the resolver's cancellation contract are untested; no redundant-mode-change guard test). | +| 10 | Documentation & comments | Issues found: Server-052 (`IAlarmWatchListResolver` XML contract claims cancellation propagates while the implementation swallows it; the `Discovery:ExcludeAttributes` doc says "Repository-derived watch-list" while the code also removes matching explicit `IncludeAttributes`). | + ## Findings ### Server-001 @@ -929,3 +961,64 @@ Today neither call site has a Blazor error boundary, so an unhandled exception l **Recommendation:** Add a general `catch (Exception exception)` after the `SessionManagerException` catch in both `CloseSessionAsync` and `KillWorkerAsync`, log a warning (matching the SessionManagerException pattern), and return `DashboardSessionAdminResult.Fail($"{operation} failed unexpectedly. See the gateway log for details.")`. This makes the result type truly the only output the page sees. Add a regression test using a `ThrowingSessionManager` that throws e.g. `InvalidOperationException` from `KillWorkerAsync` and asserts the service returns a failing result rather than propagating. **Resolution:** 2026-05-24 — Added the recommended general `catch (Exception)` arms to both `DashboardSessionAdminService.CloseSessionAsync` and `KillWorkerAsync` (`src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardSessionAdminService.cs`), placed after the `SessionManagerException` catches and behind a `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) throw;` so caller cancellation still propagates cleanly. The new catches log a warning with actor + session id and return `DashboardSessionAdminResult.Fail("{Operation} failed unexpectedly for session {SessionId}. See the gateway log for details.")`, mirroring the SessionManagerException pattern. Regression tests in `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardSessionAdminServiceTests.cs`: `CloseSessionAsync_WhenManagerThrowsUnexpected_ReturnsFriendlyFail` (the `ISessionManager` throws `InvalidOperationException("unexpected")`) and `KillWorkerAsync_WhenManagerThrowsUnexpected_ReturnsFriendlyFail` (throws `IOException("pipe broken")`); both assert the service returns a failing result with a non-blank message rather than propagating. The fake's new `CloseThrowsUnexpected` / `KillThrowsUnexpected` properties hold the configured exception. Confirmed to fail before the fix (raw exception propagated) and pass after. + +### Server-051 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Error handling & resilience | +| Location | `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:64-78` | +| Status | Resolved | + +**Description:** `AlarmWatchListResolver.ResolveAsync` wraps the Galaxy Repository discovery call in a bare `catch (Exception ex)` that logs a warning and continues with an empty (config-only) discovery set: + +```csharp +try { rows = await _repository.GetAlarmAttributesAsync(cancellationToken)...; } +catch (Exception ex) { _logger.LogWarning(ex, "...continuing with configuration-only watch-list."); rows = []; } +``` + +`OperationCanceledException` / `TaskCanceledException` derive from `Exception`, so a cancellation triggered while `GetAlarmAttributesAsync` is awaiting SQL is **swallowed**, not propagated. The resolver then returns a (config-only or empty) watch-list as though the call completed normally. This directly contradicts the `IAlarmWatchListResolver.ResolveAsync` XML contract, which states: *"Cancellation is the one exception: a triggered cancellationToken still propagates an OperationCanceledException."* In practice the resolver is called from `GatewayAlarmMonitor.SubscribeAlarmsAsync` on the monitor's lifecycle token; if the gateway is shutting down (or the monitor lifecycle is being torn down) mid-discovery, the resolver hides the cancellation and the monitor proceeds to issue `SubscribeAlarms` with a wrong (empty) watch-list instead of unwinding promptly. The `GalaxyRepository.GetAlarmAttributesAsync` SQL path does honour the token (`OpenAsync(ct)` / `ExecuteReaderAsync(ct)` / `ReadAsync(ct)`), so a real cancellation can land inside this catch. + +**Recommendation:** Add a `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { throw; }` ahead of the general catch (or filter the general catch with `when (ex is not OperationCanceledException)`), so cancellation propagates per the documented contract while genuine discovery failures still degrade to a config-only list. Add a regression test that cancels the token mid-`GetAlarmAttributesAsync` and asserts `OperationCanceledException` propagates. + +**Resolution:** Resolved 2026-06-15. Confirmed against source: the bare `catch (Exception ex)` swallowed `OperationCanceledException`. Filtered the catch with `when (ex is not OperationCanceledException)` so a real cancellation propagates per the `IAlarmWatchListResolver` contract while genuine discovery failures still degrade to a config-only list. Regression test: `AlarmWatchListResolverTests.ResolveAsync_RepositoryCancelled_PropagatesOperationCanceled` (failed before the fix, passes after). + +### Server-052 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/ZB.MOM.WW.MxGateway.Server/Alarms/IAlarmWatchListResolver.cs:24-30`, `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:101-114`, `docs/GatewayConfiguration.md:247` | +| Status | Resolved | + +**Description:** Two prose-vs-code mismatches in the watch-list resolver: + +1. The `IAlarmWatchListResolver.ResolveAsync` XML `` promises that a triggered `cancellationToken` propagates an `OperationCanceledException`, but the implementation swallows it (see Server-051). Whichever way Server-051 is resolved, exactly one of the doc or the code is currently wrong; right now the doc over-promises. + +2. `AlarmDiscoveryOptions.ExcludeAttributes` and `docs/GatewayConfiguration.md:247` both describe `ExcludeAttributes` as removing entries from the **"Repository-derived"** watch-list. The implementation's `ordered.RemoveAll(e => excluded.Contains(e.Reference))` runs over the combined list — Galaxy-Repository rows **and** the explicit `Discovery:IncludeAttributes` entries appended just above it — so an exclude entry that matches an explicit include silently removes that include too. The behaviour is defensible (excludes win) but is not what the "Repository-derived" wording says, and an operator who adds an attribute via `IncludeAttributes` and also lists it in `ExcludeAttributes` would be surprised it disappears. + +**Recommendation:** For (1), align the `IAlarmWatchListResolver` doc with whatever Server-051 settles on. For (2), either restrict the exclude to GR-discovered rows (apply `RemoveAll` before appending the `IncludeAttributes` entries) or update the option XML doc and `GatewayConfiguration.md` to say excludes are applied to the merged GR-plus-include list and therefore also suppress matching explicit includes. + +**Resolution:** Resolved 2026-06-15. (1) No longer over-promises: the Server-051 fix makes the implementation propagate `OperationCanceledException`, so the `IAlarmWatchListResolver.ResolveAsync` `` doc is now accurate and was left unchanged. (2) Kept the "excludes win" code behaviour (excludes applied to the merged GR-plus-include list) and corrected the prose to match: `AlarmDiscoveryOptions.ExcludeAttributes` XML doc and `docs/GatewayConfiguration.md:247` now state the exclude runs after the GR rows and explicit `IncludeAttributes` are combined, so an exclude matching an explicit include suppresses it too. The "excludes win" precedence is pinned by `AlarmWatchListResolverTests.ResolveAsync_ExcludeAlsoSuppressesMatchingExplicitInclude`. + +### Server-053 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs` | +| Status | Resolved | + +**Description:** The new alarm-fallback surface is broadly well-tested (`AlarmWatchListResolverTests`, `GatewayAlarmMonitorProviderModeTests`, `DashboardBrowseAndAlarmModelTests`, `GalaxyAlarmAttributeMappingTests`, `GatewayOptionsValidatorTests`), but two behaviours that the diff introduced have no coverage: + +- **Resolver cancellation contract (Server-051):** no test cancels the token mid-discovery and asserts `OperationCanceledException` propagates. Because the existing `ResolveAsync_RepositoryThrows_LogsAndReturnsConfigOnlySet` asserts the swallow path, the cancellation regression is precisely the case that would catch the Server-051 bug — and its absence is why the contract violation went unnoticed. +- **Exclude-vs-include precedence (Server-052 item 2):** no test exercises a `Discovery:IncludeAttributes` entry that also appears in `ExcludeAttributes`, so the "excludes also drop explicit includes" behaviour is unpinned and would silently change if the merge order were edited. + +Additionally, `GatewayAlarmMonitor.ApplyProviderModeChangeAsync` increments the `mxgateway.alarms.provider_switches` counter and resets `_providerSince` unconditionally on every `OnAlarmProviderModeChanged` event, with no guard for a redundant event whose `toMode` equals the current mode; there is no test asserting the from==to / no-op behaviour either way. + +**Recommendation:** Add resolver tests for (a) cancellation propagation and (b) an include that is also excluded; and a `GatewayAlarmMonitorProviderMode` test pinning the provider-switch counter behaviour for a same-mode repeat event (whichever semantics the team intends). These lock down the contracts the Server-051/052 findings expose. + +**Resolution:** Resolved 2026-06-15. Added all three missing tests: (a) `AlarmWatchListResolverTests.ResolveAsync_RepositoryCancelled_PropagatesOperationCanceled` (cancellation propagation, also covers Server-051); (b) `AlarmWatchListResolverTests.ResolveAsync_ExcludeAlsoSuppressesMatchingExplicitInclude` (exclude-vs-include precedence, also Server-052 item 2); and (c) `GatewayAlarmMonitorProviderModeTests.ProviderModeChange_RepeatedSameMode_RecordsASwitchForEachEvent`, which pins the existing semantics — each worker-reported `OnAlarmProviderModeChanged` event records a `provider_switches` increment (and resets `_providerSince`) even when `toMode` equals the current mode, since the worker is the authority on when a mode change occurred and the gateway does not synthesize or suppress it. diff --git a/docs/GatewayConfiguration.md b/docs/GatewayConfiguration.md index 7d2523b..b4f45a5 100644 --- a/docs/GatewayConfiguration.md +++ b/docs/GatewayConfiguration.md @@ -244,7 +244,7 @@ native wnwrap alarm-manager provider and the subtag-monitoring fallback. | `MxGateway:Alarms:Fallback:Discovery:UseGalaxyRepository` | `true` | When `true`, the monitor queries the Galaxy Repository SQL database to build the subtag watch-list for the configured area. | | `MxGateway:Alarms:Fallback:Discovery:Area` | _(empty)_ | Galaxy area to scope the Repository query to. Falls back to `MxGateway:Alarms:DefaultArea` when empty. Ignored when `UseGalaxyRepository` is `false`. This area is **not** used to compose a Repository-derived alarm's canonical `Galaxy!{area}.{reference}`: each discovered alarm uses its object's real Galaxy area (discovered via `gobject.area_gobject_id`), so the reference's group matches what the native alarmmgr emits. `Discovery:Area` / `DefaultArea` is used as the composition area only for explicit `IncludeAttributes` entries, which carry no discovered area. | | `MxGateway:Alarms:Fallback:Discovery:IncludeAttributes` | _(empty)_ | Explicit MXAccess attribute paths to add to the subtag watch-list, supplementing (or replacing, when `UseGalaxyRepository` is `false`) the Repository-derived list. | -| `MxGateway:Alarms:Fallback:Discovery:ExcludeAttributes` | _(empty)_ | Attribute paths to remove from the Repository-derived watch-list. Ignored when `UseGalaxyRepository` is `false`. | +| `MxGateway:Alarms:Fallback:Discovery:ExcludeAttributes` | _(empty)_ | Attribute paths to remove from the merged watch-list (case-insensitive). The exclude is applied after the Repository-derived rows and the explicit `IncludeAttributes` entries are combined, so an exclude that matches an explicit include suppresses it too — excludes win. Ignored when `UseGalaxyRepository` is `false`. | | `MxGateway:Alarms:Fallback:Subtags:Active` | `InAlarm` | Subtag name for the in-alarm boolean. Confirmed AVEVA `AlarmExtension` field name. | | `MxGateway:Alarms:Fallback:Subtags:Acked` | `Acked` | Subtag name for the acknowledged boolean. Confirmed AVEVA `AlarmExtension` field name. | | `MxGateway:Alarms:Fallback:Subtags:AckComment` | `AckMsg` | Subtag name for the acknowledgement comment write target. Writing this subtag performs the acknowledge in AVEVA. Confirmed AVEVA `AlarmExtension` field name. When empty, the ack-comment write path is disabled. | diff --git a/src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs b/src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs index 879621b..c7b159e 100644 --- a/src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs +++ b/src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs @@ -66,11 +66,13 @@ public sealed class AlarmWatchListResolver : IAlarmWatchListResolver { rows = await _repository.GetAlarmAttributesAsync(cancellationToken).ConfigureAwait(false); } - catch (Exception ex) + catch (Exception ex) when (ex is not OperationCanceledException) { // Discovery being unavailable must not crash the resolver: log and // continue with an empty discovery set. The caller decides what to - // do with the (possibly config-only) result. + // do with the (possibly config-only) result. Cancellation is the one + // exception — an OperationCanceledException propagates per the + // IAlarmWatchListResolver contract so the caller unwinds promptly. _logger.LogWarning( ex, "Galaxy Repository alarm-attribute discovery failed; continuing with configuration-only watch-list."); diff --git a/src/ZB.MOM.WW.MxGateway.Server/Configuration/AlarmFallbackOptions.cs b/src/ZB.MOM.WW.MxGateway.Server/Configuration/AlarmFallbackOptions.cs index afd3f63..7269cb9 100644 --- a/src/ZB.MOM.WW.MxGateway.Server/Configuration/AlarmFallbackOptions.cs +++ b/src/ZB.MOM.WW.MxGateway.Server/Configuration/AlarmFallbackOptions.cs @@ -86,7 +86,10 @@ public sealed class AlarmDiscoveryOptions public string[] IncludeAttributes { get; init; } = Array.Empty(); /// - /// Attribute paths to exclude from the Repository-derived poll list. + /// Attribute paths to remove from the merged poll list (case-insensitive). + /// The exclude runs after the Repository-derived rows and the explicit + /// entries are combined, so an exclude that + /// matches an explicit include suppresses it too — excludes win. /// Ignored when is false. /// Default empty. /// diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs index 86e9de3..d25dc53 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs @@ -306,6 +306,56 @@ public sealed class AlarmWatchListResolverTests Assert.Contains("Tank01", target.ActiveSubtag, StringComparison.Ordinal); } + /// + /// Server-051: a cancellation triggered while Galaxy Repository discovery is + /// awaiting must propagate as , not be + /// swallowed into a config-only watch-list, per the + /// contract. + /// + [Fact] + public async Task ResolveAsync_RepositoryCancelled_PropagatesOperationCanceled() + { + using CancellationTokenSource cts = new(); + // Repository observes the token, cancels the source, then throws the matching + // OperationCanceledException — exactly what the live SQL path does on shutdown. + CancellingGalaxyRepository repo = new(cts); + + AlarmWatchListResolver resolver = CreateResolver(repo); + + await Assert.ThrowsAnyAsync(() => + resolver.ResolveAsync( + Options(include: ["Tank01.Level.HiHi"]), + cts.Token)); + } + + /// + /// Server-052 item 2 / Server-053: an entry that appears in both + /// IncludeAttributes and ExcludeAttributes is removed — excludes + /// win over explicit includes (the documented "excludes also suppress matching + /// explicit includes" behaviour). + /// + [Fact] + public async Task ResolveAsync_ExcludeAlsoSuppressesMatchingExplicitInclude() + { + StubGalaxyRepository repo = new( + [ + new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "TestArea" }, + ]); + + AlarmWatchListResolver resolver = CreateResolver(repo); + + // Pump01.Fault is an explicit include AND an exclude (case-insensitive). It must + // be dropped; the GR row Tank01 survives. + IReadOnlyList result = await resolver.ResolveAsync(Options( + useGalaxyRepository: true, + include: ["Pump01.Fault"], + exclude: ["pump01.fault"])); + + AlarmSubtagTarget target = Assert.Single(result); + Assert.Contains("Tank01", target.ActiveSubtag, StringComparison.Ordinal); + Assert.DoesNotContain(result, t => t.ActiveSubtag.Contains("Pump01", StringComparison.OrdinalIgnoreCase)); + } + /// In-memory returning a fixed alarm rowset. private sealed class StubGalaxyRepository(List rows) : IGalaxyRepository { @@ -357,4 +407,35 @@ public sealed class AlarmWatchListResolverTests public Task> GetAlarmAttributesAsync(CancellationToken ct = default) => Task.FromException>(toThrow); } + + /// + /// whose alarm-attribute query cancels the + /// supplied source and throws a token-bound , + /// mirroring the live SQL path being cancelled mid-await. + /// + private sealed class CancellingGalaxyRepository(CancellationTokenSource source) : IGalaxyRepository + { + /// + public Task TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(true); + + /// + public Task GetLastDeployTimeAsync(CancellationToken ct = default) => + Task.FromResult(null); + + /// + public Task> GetHierarchyAsync(CancellationToken ct = default) => + Task.FromResult(new List()); + + /// + public Task> GetAttributesAsync(CancellationToken ct = default) => + Task.FromResult(new List()); + + /// + public Task> GetAlarmAttributesAsync(CancellationToken ct = default) + { + source.Cancel(); + ct.ThrowIfCancellationRequested(); + return Task.FromResult(new List()); + } + } } diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs index e36aa87..8ed5171 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs @@ -120,6 +120,104 @@ public sealed class GatewayAlarmMonitorProviderModeTests await monitor.StopAsync(CancellationToken.None); } + /// + /// Server-053: a redundant OnAlarmProviderModeChanged event whose target + /// mode equals the current mode still records a provider switch. The worker is the + /// authority on when a mode change occurred; the gateway does not second-guess it, + /// so each event the worker emits increments provider_switches (no from==to + /// suppression). This test pins that semantics so it cannot drift silently. + /// + [Fact] + public async Task ProviderModeChange_RepeatedSameMode_RecordsASwitchForEachEvent() + { + using GatewayMetrics metrics = new(); + long switchCount = 0; + using MeterListener listener = new(); + listener.InstrumentPublished = (instrument, meterListener) => + { + if (ReferenceEquals(instrument.Meter, metrics.Meter) + && instrument.Name == "mxgateway.alarms.provider_switches") + { + meterListener.EnableMeasurementEvents(instrument); + } + }; + listener.SetMeasurementEventCallback( + (instrument, measurement, _, _) => + { + if (ReferenceEquals(instrument.Meter, metrics.Meter) + && instrument.Name == "mxgateway.alarms.provider_switches") + { + Interlocked.Add(ref switchCount, measurement); + } + }); + listener.Start(); + + FakeSessionManager sessions = new(); + using GatewayAlarmMonitor monitor = CreateMonitor(sessions, metrics); + + using CancellationTokenSource cts = new(); + await monitor.StartAsync(cts.Token); + await sessions.WaitForSubscribeAsync(WaitTimeout); + + List received = []; + TaskCompletionSource baselineReceived = new(TaskCreationOptions.RunContinuationsAsynchronously); + using CancellationTokenSource streamCts = new(); + Task reader = Task.Run(async () => + { + try + { + await foreach (AlarmFeedMessage message in monitor.StreamAsync(null, streamCts.Token)) + { + lock (received) + { + received.Add(message); + if (received.Count == 1) + { + baselineReceived.TrySetResult(); + } + } + } + } + catch (OperationCanceledException) + { + // Expected when the test cancels the stream. + } + }); + + await baselineReceived.Task.WaitAsync(WaitTimeout); + + // First subtag-mode event. + sessions.EmitEvent(new MxEvent + { + OnAlarmProviderModeChanged = new OnAlarmProviderModeChangedEvent + { + Mode = AlarmProviderMode.Subtag, + Reason = "alarmmgr failed", + At = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow), + }, + }); + await WaitUntilAsync(() => Interlocked.Read(ref switchCount) >= 1, WaitTimeout); + + // Second subtag-mode event — same mode, but still a worker-reported switch. + sessions.EmitEvent(new MxEvent + { + OnAlarmProviderModeChanged = new OnAlarmProviderModeChangedEvent + { + Mode = AlarmProviderMode.Subtag, + Reason = "still degraded", + At = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow), + }, + }); + await WaitUntilAsync(() => Interlocked.Read(ref switchCount) >= 2, WaitTimeout); + + Assert.Equal(2, Interlocked.Read(ref switchCount)); + + await streamCts.CancelAsync(); + await reader; + await cts.CancelAsync(); + await monitor.StopAsync(CancellationToken.None); + } + [Fact] public async Task NewSubscriber_ReceivesProviderStatusAsFirstMessage() {