Compare commits

..

69 Commits

Author SHA1 Message Date
Joseph Doherty c7f754c77b fix(client/python): cap setuptools<77 so dist stays metadata 2.2 for Gitea PyPI feed; proprietary via classifier 2026-06-15 05:30:22 -04:00
Joseph Doherty 144c293f05 chore(clients): bump all five clients 0.1.0 -> 0.1.1 for release 2026-06-15 05:07:17 -04:00
Joseph Doherty 7c957908f8 docs(code-review): regenerate index — all re-review findings resolved 2026-06-15 03:04:37 -04:00
Joseph Doherty 6659653673 fix(client/java): handle PROVIDER_STATUS alarm-feed arm — CLI build break (Client.Java-039) 2026-06-15 03:01:13 -04:00
Joseph Doherty 75a39f5a8c fix(client/java): correct browseChildrenRaw README; CLI --require-certificate-validation (Client.Java-037,038) 2026-06-15 02:56:15 -04:00
Joseph Doherty cebe67e9bd fix(worker): resilient failover switch; FIPS-safe synthetic GUID; dup-reference guard + tests (Worker-026..028, Worker.Tests-031..033) 2026-06-15 02:56:15 -04:00
Joseph Doherty ddf2d84fbc contracts: round-trip degraded provenance/watch-list/mode-changed; proto doc (Contracts-018,019) 2026-06-15 02:46:06 -04:00
Joseph Doherty 56dd56954b test(gateway): cover failback reason, FromFeed/SinceUtc badge paths; style + bounded drain (Tests-032..035) 2026-06-15 02:46:06 -04:00
Joseph Doherty b57d02cc4d fix(client/rust): handle provider_status arm (build break); real system-roots TLS; design doc (Client.Rust-030..032) 2026-06-15 02:39:11 -04:00
Joseph Doherty 47062c1a6e fix(client/python): reachable cert-validation flag; bounded off-loop TOFU probe; license/marker fixes (Client.Python-027..031) 2026-06-15 02:39:11 -04:00
Joseph Doherty d0d1dcef15 fix(client/go): correct tag-go-module dirty-tree guard; GOPRIVATE docs (Client.Go-028,029) 2026-06-15 02:39:11 -04:00
Joseph Doherty fb2b1a4a52 fix(client/dotnet): restore warnings-as-errors floor; license metadata; LazyBrowseNode publication (Client.Dotnet-022..025) 2026-06-15 02:39:11 -04:00
Joseph Doherty d2c776901b fix(integrationtests): repair GatewayAlarmMonitor ctor build break; LDAP bind + docs (IntegrationTests-026..029) 2026-06-15 02:39:11 -04:00
Joseph Doherty 258e09e0de fix(server): propagate watch-list cancellation; doc + test gaps (Server-051..053) 2026-06-15 02:39:11 -04:00
Joseph Doherty 410acc92eb feat(dashboard): distinct 'forced' subtag provider badge
Render Fallback:Mode=ForceSubtag as a cyan 'Subtag monitoring (forced)'
badge, distinct from the amber failover 'degraded' badge, so an intentional
configuration isn't shown as a fault. Distinguished by the shared
AlarmProviderReasons.ForcedSubtag reason carried on the provider-status feed.
2026-06-15 01:43:17 -04:00
Joseph Doherty b40aaeef05 docs: forced-subtag fix resolution (#1 proto artifact, #2 fixed) 2026-06-15 01:33:14 -04:00
Joseph Doherty 9208225f9c fix: gateway reflects configured forced provider mode into gauge/feed (#2) 2026-06-15 01:10:04 -04:00
Joseph Doherty c6f17557f6 docs: forced-subtag mode fix plan 2026-06-15 01:04:46 -04:00
Joseph Doherty bbbef4d098 D2: document no current duplicate / endpoint (no-op) 2026-06-14 23:49:47 -04:00
Joseph Doherty 4af24b9518 D1: surface AlarmProviderSwitchCount on dashboard metric list 2026-06-14 23:49:02 -04:00
Joseph Doherty 371ce53409 docs: add deferred follow-ups plan (D1-D5) 2026-06-14 23:46:21 -04:00
Joseph Doherty 597677025f Merge alarm-fallback cleanup: metrics snapshot/reason, SQL prune, teardown, doc drift
Implements the actionable deferred items from pending.md (B1-B5, C6-C7):
- B1/B2 metrics: provider-switch count in snapshot + bounded reason enum
- B5: drop dead primitive branch from AlarmAttributesSql
- B3/B4 worker: UnAdvise only advised handles (+Dispose tests); remove dead field
- C6/C7: doc clarifications and design-doc superseded notes

Verified: gateway tests on macOS, net48/x86 worker suite (318 passed) on windev.
2026-06-14 02:39:10 -04:00
Joseph Doherty 393e326275 docs(alarms): note operator/IDE toggle drives the live subtag smoke test
C6a: the rig's TestAlarm attributes are object-driven; a flip script OR a manual
operator/IDE toggle drives them (confirmed live 2026-06-14). Update the how-to-run
comments and Skip reason accordingly.
2026-06-14 02:35:59 -04:00
Joseph Doherty 986dcee14a worker(alarms): UnAdvise only advised handles in LmxSubtagAlarmSource teardown
B3: track advised handles separately from added handles so Dispose only UnAdvises
items that were actually advised — a write-only subtag (e.g. ack-comment added by
Write, never advised) is removed but not unadvised. Add Dispose tests covering the
advised/write-only split, full removal, single Unregister, and double-dispose
idempotency.
2026-06-14 02:35:59 -04:00
Joseph Doherty a3752799de worker(alarms): remove dead FailoverAlarmConsumer.subscriptionExpression
B4: the field was stored in Subscribe but never read — the primary is never
re-subscribed during probing. Drop it and keep the rationale as a comment.
2026-06-14 02:35:59 -04:00
Joseph Doherty 37aadf72b3 docs(alarms): clarify resolver cancellation contract; mark design doc superseded
C6b: IAlarmWatchListResolver.ResolveAsync doc now notes that while discovery being
unavailable never throws, a triggered cancellation token still propagates.
C7: annotate the original design doc where it drifted from the shipped code — metric
names / unimplemented watch-list gauges, and the proto-type location (gateway proto, not
worker proto).
2026-06-14 02:33:14 -04:00
Joseph Doherty 5573f2a229 galaxy(alarms): drop dead primitive branch from AlarmAttributesSql
B5: the candidate CTE's src_pri=1 (primitive-instance) UNION ALL branch was always
excluded by the final WHERE r.src_pri=0, so it added work with no output change. Remove
the branch and the now-constant src_pri column/filter. An alarm anchor is always a user
attribute, so output is identical.
2026-06-14 02:33:14 -04:00
Joseph Doherty 56abd64c6c metrics(alarms): expose provider-switch count in snapshot, bound the reason tag
B1: add AlarmProviderSwitchCount to GatewayMetricsSnapshot so the switch total is
readable without scraping the OTEL counter.
B2: replace the free-text reason tag on mxgateway.alarms.provider_switches with a
bounded AlarmProviderSwitchReason enum (failover/failback/unknown); the human-readable
reason stays in the structured log.
2026-06-14 02:33:02 -04:00
Joseph Doherty 5b31e99ab6 alarms: compose subtag reference from object's real Galaxy area for exact alarmmgr parity 2026-06-14 02:12:11 -04:00
Joseph Doherty 64db828d71 docs(alarms): record live confirmation of subtag path + ack; advise-before-write requirement 2026-06-13 11:26:08 -04:00
Joseph Doherty 1a9367b5de worker(alarms): advise ack-comment subtag so the ack write targets an active MXAccess item 2026-06-13 11:23:39 -04:00
Joseph Doherty 98e997b573 test(alarms): probe writes evidence log to PROBE_LOG file 2026-06-13 11:15:05 -04:00
Joseph Doherty 0e8d911fd8 test(alarms): live runtime-path resolution probe (LiveMxAccessFact) for alarm subtags 2026-06-13 11:14:12 -04:00
Joseph Doherty e72763d703 alarms: use confirmed AVEVA AlarmExtension subtag names (InAlarm/Acked/AckMsg/Priority) 2026-06-13 11:07:22 -04:00
Joseph Doherty 3c9becc8d6 docs(plan): mark all alarm-subtag-fallback tasks completed 2026-06-13 10:55:18 -04:00
Joseph Doherty ec88532fe4 alarms: propagate degraded/source_provider through snapshot + gateway cache paths (integration fix I1/I2) 2026-06-13 10:53:55 -04:00
Joseph Doherty 2f30f0c7c0 docs(alarms): document alarmmgr->subtag fallback (providers, failover, config, contract, parity) 2026-06-13 10:43:37 -04:00
Joseph Doherty 27f6c9e6b7 dashboard(alarms): provider-status badge (alarmmgr vs degraded subtag) 2026-06-13 10:37:37 -04:00
Joseph Doherty 29bd504a99 test(alarms): end-to-end provider failover/failback lifecycle through GatewayAlarmMonitor 2026-06-13 10:34:24 -04:00
Joseph Doherty e10b252e3a test(alarms): drop unsupported Assert.Equal message args in live subtag smoke test (xUnit) 2026-06-13 10:30:39 -04:00
Joseph Doherty bcc54ca56b server(alarms): provider-mode gauge startup baseline; reconcile-lock comment; de-flake monitor test 2026-06-13 10:29:13 -04:00
Joseph Doherty ee459f43e1 test(alarms): opt-in live subtag-fallback smoke test (Skip by default)
Adds AlarmSubtagLiveSmokeTests to validate the open design item from Task 17:
confirms that LmxSubtagAlarmSource (real MxAccessComObjectFactory) wired to
SubtagAlarmConsumer synthesizes degraded Raise transitions with stable synthetic
GUIDs from Galaxy alarm subtags, and that AcknowledgeByName writes the
ack-comment subtag (rc=0). PLACEHOLDER_* subtag addresses are best-guess and
must be verified against MXAccess-Public-API.md + live Galaxy before flipping Skip.
2026-06-13 10:26:28 -04:00
Joseph Doherty ebf1d95f72 server(alarms): monitor resolves watch-list, sends ForcedMode/failover, reflects provider mode into feed + metrics 2026-06-13 10:20:03 -04:00
Joseph Doherty 3ccf0b5f9e server(alarms): honor ExcludeAttributes GR-only contract; warn on empty config-only watch-list 2026-06-13 10:12:58 -04:00
Joseph Doherty f7ccfd678e server(alarms): watch-list resolver merging GR discovery + config override 2026-06-13 10:09:10 -04:00
Joseph Doherty 3f5e5fc0b3 worker(alarms): route ForcedMode/watch-list/failover via AlarmCommandHandler; emit provider-mode-changed event 2026-06-13 10:04:33 -04:00
Joseph Doherty 7241a4fb9c worker(alarms): net48 index fix; enforce ProbeIntervalSeconds; OOM-safe catch; reset-on-failure test 2026-06-13 09:55:07 -04:00
Joseph Doherty d6c0bb41ca worker(alarms): failback probe re-polls the still-subscribed primary (no re-Subscribe) 2026-06-13 09:49:38 -04:00
Joseph Doherty 0a54c0bc4b worker(alarms): FailoverAlarmConsumer auto-failover/failback state machine 2026-06-13 09:46:47 -04:00
Joseph Doherty fd64b9260c worker(alarms): exact-match ack resolution (no substring false-match) + ack-by-guid tests 2026-06-13 09:42:00 -04:00
Joseph Doherty 4bd757a136 worker(alarms): SubtagAlarmConsumer synthesizing degraded transitions; dispatcher propagates Degraded 2026-06-13 09:35:49 -04:00
Joseph Doherty 1e2ed6d1ea worker(alarms): WriteRecord as class not positional record (net48 has no IsExternalInit) 2026-06-13 09:30:52 -04:00
Joseph Doherty 5f6655de27 server(alarms): drop redundant null-coalesce; tidy validator tests (review fixes) 2026-06-13 09:27:37 -04:00
Joseph Doherty fbc9cf56df worker(alarms): SyntheticAlarmGuid internal + alarmmgr-parity assertion (review fixes) 2026-06-13 09:26:52 -04:00
Joseph Doherty 4c0e14fc5d worker(alarms): COM-backed LmxSubtagAlarmSource advising alarm subtags 2026-06-13 09:24:09 -04:00
Joseph Doherty c75920c620 docs(plan): correct alarm proto location to mxaccess_gateway.proto (Tasks 1-2) 2026-06-13 09:18:11 -04:00
Joseph Doherty a46ce90e6f server(metrics): alarm provider mode gauge + provider switch counter (Task 13) 2026-06-13 09:18:11 -04:00
Joseph Doherty f113ca53a1 server(galaxy): GetAlarmAttributesAsync discovery query + alarm-attribute row mapping (Task 11) 2026-06-13 09:18:11 -04:00
Joseph Doherty f3616cc7fa server(alarms): AlarmFallbackOptions + ForceSubtag/threshold validation (Task 10) 2026-06-13 09:18:11 -04:00
Joseph Doherty 57d5a8725f worker(alarms): synthetic GUID + degraded/source_provider on emitted transitions 2026-06-13 09:14:23 -04:00
Joseph Doherty 60d35a914f contracts: regenerate Generated/ for alarm provider mode + subtag types
Keeps committed generated C# in sync with the .proto change in 1d85db7
(AlarmProviderMode, AlarmSubtagTarget, AlarmFailoverConfig, AlarmProviderStatus,
OnAlarmProviderModeChangedEvent, degraded/source_provider fields).
2026-06-13 09:10:08 -04:00
Joseph Doherty b10e103bcf worker(alarms): fix net48 build (init->set, usings), token-boundary name parse, acked latch, dup-address guard, tests 2026-06-13 09:05:58 -04:00
Joseph Doherty 348ab16456 worker(alarms): subtag value-source seam + synthesis state machine 2026-06-13 08:57:28 -04:00
Joseph Doherty c16f016f0a test(contracts): round-trip provider status + degraded provenance 2026-06-13 08:56:13 -04:00
Joseph Doherty 1d85db7b4e contracts(gateway): AlarmProviderMode, subtag watch-list, provider status, degraded provenance, mode-changed event 2026-06-13 08:53:02 -04:00
Joseph Doherty 5ea5618315 docs: implementation plan for alarm subtag-monitoring fallback
18 TDD tasks across contracts, worker (SubtagAlarmConsumer + FailoverAlarmConsumer),
gateway (GR-SQL watch-list discovery, monitor mode reflection, metrics, dashboard),
and docs. Grounded in current signatures; parity-preserving (worker-side synthesis).
2026-06-13 08:44:42 -04:00
Joseph Doherty 38a0ad8ab4 docs: design for alarmmgr→subtag alarm-provider fallback
Auto-failover/failback between the wnwrap alarmmgr consumer and a new
worker-side SubtagAlarmConsumer that advises alarm subtags and synthesizes
transitions. GR-SQL+config watch-list discovery, ack via ack-comment write,
degraded state surfaced in the gRPC contract and dashboard/metrics.
2026-06-13 08:35:18 -04:00
Joseph Doherty 5df2ef0d1e chore(theme): bump ZB.MOM.WW.Theme 0.3.0 -> 0.3.1 (interactive-render nav fix) 2026-06-05 07:19:11 -04:00
Joseph Doherty e5785fd769 chore(theme): consume ZB.MOM.WW.Theme 0.3.0 (nav/login kit fixes) 2026-06-05 05:13:06 -04:00
116 changed files with 14198 additions and 625 deletions
+19 -2
View File
@@ -1,4 +1,17 @@
<Project>
<PropertyGroup>
<!-- Build-quality enforcement floor, mirroring src/Directory.Build.props so the
.NET client tree is held to the same baseline CLAUDE.md mandates (warnings as
errors, code-style enforced at build, latest analyzers, deterministic builds). -->
<LangVersion>latest</LangVersion>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<AnalysisLevel>latest</AnalysisLevel>
<EnforceCodeStyleInBuild>true</EnforceCodeStyleInBuild>
<Deterministic>true</Deterministic>
</PropertyGroup>
<PropertyGroup>
<!-- Shared package metadata for clients/dotnet/. Individual projects opt in via <IsPackable>true</IsPackable>. -->
<Authors>Joseph Doherty</Authors>
@@ -10,11 +23,15 @@
<PackageProjectUrl>https://gitea.dohertylan.com/dohertj2/mxaccessgw</PackageProjectUrl>
<PackageTags>mxaccess;mxgateway;grpc;client;archestra</PackageTags>
<PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
<!-- Proprietary/internal package, consistent with the Rust ("Proprietary") and
Python ("Proprietary") client license declarations. A LicenseRef SPDX expression
is rejected by the current NuGet toolset (NU5124), so the proprietary terms ship
as a packaged license file instead. -->
<PackageLicenseFile>LICENSE.txt</PackageLicenseFile>
<!-- Versioning: bump per release. Symbols ship as snupkg. -->
<Version>0.1.0</Version>
<Version>0.1.1</Version>
<IncludeSymbols>true</IncludeSymbols>
<SymbolPackageFormat>snupkg</SymbolPackageFormat>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<!-- Default: do NOT pack. Each project opts in. -->
<IsPackable>false</IsPackable>
</PropertyGroup>
+12
View File
@@ -0,0 +1,12 @@
Proprietary License
Copyright (c) ZB MOM WW. All rights reserved.
This software and its source code are proprietary and confidential. They are
licensed, not sold, for internal use within ZB MOM WW and its authorized
partners only. No part of this package may be reproduced, distributed, or
transmitted to third parties without the prior written permission of ZB MOM WW.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT.
+1 -1
View File
@@ -328,7 +328,7 @@ dotnet nuget add source https://gitea.dohertylan.com/api/packages/dohertj2/nuget
Then add the package to your project:
````bash
dotnet add package ZB.MOM.WW.MxGateway.Client --version 0.1.0
dotnet add package ZB.MOM.WW.MxGateway.Client --version 0.1.1
````
The `ZB.MOM.WW.MxGateway.Contracts` package is pulled in transitively.
@@ -135,25 +135,35 @@ internal sealed class FakeGalaxyRepositoryTransport(MxGatewayClientOptions optio
/// <summary>Queue of exceptions to throw from BrowseChildren; dequeued in FIFO order.</summary>
public Queue<Exception> BrowseChildrenExceptions { get; } = new();
/// <summary>
/// Optional hook awaited inside BrowseChildren before the reply is produced. Lets a
/// test hold an RPC mid-flight to exercise concurrent reads of the in-progress node.
/// </summary>
public Func<Task>? BrowseChildrenGate { get; set; }
/// <summary>
/// Records the request and either throws a queued exception or returns the configured reply.
/// </summary>
/// <param name="request">The BrowseChildrenRequest to process.</param>
/// <param name="callOptions">Call options specifying RPC behavior.</param>
public Task<BrowseChildrenReply> BrowseChildrenAsync(
public async Task<BrowseChildrenReply> BrowseChildrenAsync(
BrowseChildrenRequest request,
CallOptions callOptions)
{
BrowseChildrenCalls.Add((request, callOptions));
if (BrowseChildrenExceptions.TryDequeue(out Exception? exception))
{
return Task.FromException<BrowseChildrenReply>(exception);
throw exception;
}
return Task.FromResult(
BrowseChildrenReplies.TryDequeue(out BrowseChildrenReply? reply)
? reply
: BrowseChildrenReply);
if (BrowseChildrenGate is { } gate)
{
await gate().ConfigureAwait(false);
}
return BrowseChildrenReplies.TryDequeue(out BrowseChildrenReply? reply)
? reply
: BrowseChildrenReply;
}
/// <summary>
@@ -175,6 +175,96 @@ public sealed class LazyBrowseNodeTests
Assert.Equal(2, transport.BrowseChildrenCalls.Count);
}
/// <summary>
/// Verifies the safe-publication contract of the lock-free <c>Children</c> /
/// <c>IsExpanded</c> readers (Client.Dotnet-025): while an <c>ExpandAsync</c> call is
/// held mid-flight, a background reader repeatedly enumerates <c>Children</c> and must
/// never throw; whenever it observes <c>IsExpanded</c> as <see langword="true"/> the
/// snapshot it then enumerates must already contain all five children.
/// </summary>
/// <remarks>
/// The reader is always cancelled and awaited in a <c>finally</c> block, so a failure
/// in the expand phase cannot leave it spinning on a thread-pool thread after the test
/// has returned and its <see cref="CancellationTokenSource"/> has been disposed.
/// </remarks>
[Fact]
public async Task Expand_ConcurrentReadOfChildren_NeverTearsAndPublishesAtomically()
{
    FakeGalaxyRepositoryTransport transport = CreateTransport();

    // Reply 1: the root level returned by BrowseAsync (a single expandable area).
    transport.BrowseChildrenReplies.Enqueue(BuildReply(
        children: [BuildObject(1, "Plant", isArea: true)],
        childHasChildren: [true],
        cacheSequence: 1));

    // Replies 2 and 3: a two-page child set (3 + 2 = 5 children) so the expand loop
    // spends meaningful time appending, widening the window in which a concurrent
    // reader could observe a torn list.
    BrowseChildrenReply childPage1 = BuildReply(
        children: [BuildObject(10, "A"), BuildObject(11, "B"), BuildObject(12, "C")],
        childHasChildren: [false, false, false],
        cacheSequence: 1);
    childPage1.NextPageToken = "1:p:3";
    transport.BrowseChildrenReplies.Enqueue(childPage1);
    transport.BrowseChildrenReplies.Enqueue(BuildReply(
        children: [BuildObject(13, "D"), BuildObject(14, "E")],
        childHasChildren: [false, false],
        cacheSequence: 1));

    await using GalaxyRepositoryClient client = CreateClient(transport);
    IReadOnlyList<LazyBrowseNode> roots = await client.BrowseAsync();
    LazyBrowseNode node = roots[0];

    // Gate installed only after the root browse: the first child-page RPC parks on the
    // semaphore, keeping the expand mid-flight while the reader spins. Later calls
    // pass straight through.
    using SemaphoreSlim release = new(0, 1);
    bool firstChildCall = true;
    transport.BrowseChildrenGate = async () =>
    {
        if (firstChildCall)
        {
            firstChildCall = false;
            await release.WaitAsync().ConfigureAwait(false);
        }
    };

    using CancellationTokenSource readerStop = new();
    Exception? readerFailure = null;
    Task reader = Task.Run(() =>
    {
        try
        {
            while (!readerStop.IsCancellationRequested)
            {
                // Read the flag BEFORE the snapshot: the contract under test is that
                // IsExpanded == true implies a fully-populated Children.
                bool expanded = node.IsExpanded;

                // Enumerate the snapshot; a torn/mid-append list would throw here.
                int count = 0;
                foreach (LazyBrowseNode _ in node.Children)
                {
                    count++;
                }

                // If the node reports expanded, the published snapshot must be complete.
                if (expanded)
                {
                    Assert.Equal(5, count);
                }
            }
        }
        catch (Exception ex)
        {
            // Captured rather than rethrown so the failure is asserted on the test
            // thread after the reader has been joined.
            readerFailure = ex;
        }
    });

    try
    {
        Task expand = node.ExpandAsync();

        // Let the reader spin against the empty pre-publication snapshot for a moment.
        await Task.Delay(50);
        release.Release();
        await expand;

        // Let the reader observe the post-publication state before it is stopped.
        await Task.Delay(50);
    }
    finally
    {
        // Always stop and join the reader, even when the expand phase throws;
        // otherwise it would keep spinning after the token source is disposed.
        readerStop.Cancel();
        await reader;
    }

    Assert.Null(readerFailure);
    Assert.True(node.IsExpanded);
    Assert.Equal(5, node.Children.Count);
}
/// <summary>
/// Verifies that BrowseChildrenOptions filter fields are forwarded to the BrowseChildren request.
/// </summary>
@@ -12,9 +12,14 @@ public sealed class LazyBrowseNode
{
private readonly GalaxyRepositoryClient _client;
private readonly BrowseChildrenOptions _options;
private readonly List<LazyBrowseNode> _children = [];
private readonly SemaphoreSlim _expandLock = new(1, 1);
private bool _isExpanded;
// Published once, under _expandLock, when expansion completes. Lock-free readers
// see either the empty pre-expansion snapshot or the fully-populated post-expansion
// snapshot — never a partially-filled list — because the snapshot is built in a local
// and handed off via Volatile.Write (release) paired with Volatile.Read (acquire).
private IReadOnlyList<LazyBrowseNode> _children = [];
private volatile bool _isExpanded;
internal LazyBrowseNode(
GalaxyRepositoryClient client,
@@ -35,7 +40,7 @@ public sealed class LazyBrowseNode
public bool HasChildrenHint { get; }
/// <summary>Direct children loaded by <see cref="ExpandAsync"/>; empty until then.</summary>
public IReadOnlyList<LazyBrowseNode> Children => _children;
public IReadOnlyList<LazyBrowseNode> Children => Volatile.Read(ref _children);
/// <summary>True after the first <see cref="ExpandAsync"/> call completes.</summary>
public bool IsExpanded => _isExpanded;
@@ -46,7 +51,13 @@ public sealed class LazyBrowseNode
/// </summary>
/// <remarks>
/// Thread-safe: concurrent callers see exactly one fetch; subsequent callers
/// (after the first completes) return immediately.
/// (after the first completes) return immediately. <see cref="Children"/> and
/// <see cref="IsExpanded"/> may be read concurrently with an in-flight
/// <see cref="ExpandAsync"/> on another thread; the populated children are
/// published as an immutable snapshot under a release barrier, so a reader that
/// observes <see cref="IsExpanded"/> as <see langword="true"/> always sees the
/// fully-populated <see cref="Children"/>, and a reader never enumerates a
/// partially-built list.
/// </remarks>
/// <param name="cancellationToken">Token to observe for cancellation.</param>
public async Task ExpandAsync(CancellationToken cancellationToken = default)
@@ -64,6 +75,10 @@ public sealed class LazyBrowseNode
return;
}
// Accumulate into a local list, never the published field, so a lock-free
// reader can never observe a half-populated collection or enumerate a list
// that is being mutated mid-append.
List<LazyBrowseNode> children = [];
string pageToken = string.Empty;
HashSet<string> seenPageTokens = new(StringComparer.Ordinal);
do
@@ -79,7 +94,7 @@ public sealed class LazyBrowseNode
for (int i = 0; i < reply.Children.Count; i++)
{
bool hint = i < reply.ChildHasChildren.Count && reply.ChildHasChildren[i];
_children.Add(new LazyBrowseNode(_client, reply.Children[i], hint, _options));
children.Add(new LazyBrowseNode(_client, reply.Children[i], hint, _options));
}
pageToken = reply.NextPageToken;
@@ -91,6 +106,10 @@ public sealed class LazyBrowseNode
}
while (!string.IsNullOrWhiteSpace(pageToken));
// Publish the completed, immutable snapshot (release) before marking the node
// expanded (the volatile write below). A reader that observes IsExpanded == true
// is guaranteed to also observe the fully-populated Children.
Volatile.Write(ref _children, children);
_isExpanded = true;
}
finally
@@ -21,10 +21,15 @@
<PackageId>ZB.MOM.WW.MxGateway.Client</PackageId>
<Description>.NET 10 gRPC client for the MxAccessGateway service. Provides typed wrappers, retry, and a lazy-browse walker over the Galaxy Repository hierarchy.</Description>
<PackageReadmeFile>README.md</PackageReadmeFile>
<!-- Only the shipped library generates XML docs (matching src/Contracts). The Cli and
Tests projects are not packable and do not document their public surface, so this
stays out of the shared Directory.Build.props to avoid CS1591 on test classes. -->
<GenerateDocumentationFile>true</GenerateDocumentationFile>
</PropertyGroup>
<ItemGroup>
<None Include="..\README.md" Pack="true" PackagePath="\" />
<None Include="..\LICENSE.txt" Pack="true" PackagePath="\" />
</ItemGroup>
<ItemGroup>
+6 -3
View File
@@ -288,7 +288,7 @@ go run ./cmd/mxgw-go smoke -endpoint $env:MXGATEWAY_ENDPOINT -plaintext -api-key
The module is resolved directly from the git repo — no package registry:
````bash
go get gitea.dohertylan.com/dohertj2/mxaccessgw/clients/go@v0.1.0
go get gitea.dohertylan.com/dohertj2/mxaccessgw/clients/go@v0.1.1
````
Then import:
@@ -299,8 +299,11 @@ import "gitea.dohertylan.com/dohertj2/mxaccessgw/clients/go/mxgateway"
If your build environment cannot reach `gitea.dohertylan.com` directly,
configure `GOPROXY` to point at an internal proxy that fronts the Gitea
repo, or use `GONOSUMCHECK` + `GOPRIVATE` to bypass the checksum database
for the internal module path.
repo, or set `GOPRIVATE=gitea.dohertylan.com/*` to fetch the module
straight from the VCS — this both bypasses the public module proxy and
disables checksum-database (`sum.golang.org`) verification for that path.
Add `GOINSECURE=gitea.dohertylan.com/*` if the host serves the module over
plain HTTP rather than HTTPS.
## Releasing a new version
+12 -7
View File
@@ -128,18 +128,20 @@ gradle :zb-mom-ww-mxgateway-cli:run --args="galaxy-discover --endpoint localhost
### Browsing lazily
For UI trees or OPC UA bridges, use `browseChildren` to walk one level at a
For UI trees or OPC UA bridges, use `browseChildrenRaw` to walk one level at a
time instead of loading the full hierarchy with `discoverHierarchy`. Pass a
default request for root objects; subsequent calls set `parentGobjectId`,
`parentTagName`, or `parentContainedPath`. Filter fields match
`DiscoverHierarchy`. Each response pairs `getChildrenList()` with
`getChildHasChildrenList()` so you know which nodes to expand. See
[Galaxy Repository](../../docs/GalaxyRepository.md#browsechildren) for full
request and filter semantics. This snippet documents the API as it appears once
the Java client is regenerated on the Windows host.
request and filter semantics. For most callers the high-level
`browse()`/`LazyBrowseNode` walker below is the preferred surface;
`browseChildrenRaw` exposes the single underlying RPC when you need direct
control of paging.
```java
BrowseChildrenReply reply = galaxy.browseChildren(
BrowseChildrenReply reply = galaxy.browseChildrenRaw(
BrowseChildrenRequest.newBuilder().build());
List<GalaxyObject> children = reply.getChildrenList();
@@ -248,8 +250,11 @@ gradle :zb-mom-ww-mxgateway-cli:run --args="smoke --endpoint localhost:5000 --ap
```
The CLI accepts `--api-key`, `--api-key-env`, `--plaintext`, `--ca-file`,
`--server-name-override`, `--timeout`, and `--json` on gateway commands. JSON
output redacts API keys.
`--server-name-override`, `--require-certificate-validation`, `--timeout`, and
`--json` on gateway commands. JSON output redacts API keys. TLS is lenient by
default (the certificate is not verified unless you pin a CA with `--ca-file`);
pass `--require-certificate-validation` to verify the server certificate against
the JVM trust store without pinning.
Use TLS options for a secured gateway:
@@ -311,7 +316,7 @@ repositories {
}
dependencies {
implementation 'com.zb.mom.ww.mxgateway:zb-mom-ww-mxgateway-client:0.1.0'
implementation 'com.zb.mom.ww.mxgateway:zb-mom-ww-mxgateway-client:0.1.1'
}
````
+1 -1
View File
@@ -13,7 +13,7 @@ ext {
subprojects {
group = 'com.zb.mom.ww.mxgateway'
version = '0.1.0'
version = '0.1.1'
pluginManager.withPlugin('java') {
java {
@@ -37,6 +37,7 @@ import java.util.concurrent.atomic.AtomicReference;
import mxaccess_gateway.v1.MxaccessGateway.AcknowledgeAlarmReply;
import mxaccess_gateway.v1.MxaccessGateway.AcknowledgeAlarmRequest;
import mxaccess_gateway.v1.MxaccessGateway.ActiveAlarmSnapshot;
import mxaccess_gateway.v1.MxaccessGateway.AlarmProviderStatus;
import mxaccess_gateway.v1.MxaccessGateway.AlarmFeedMessage;
import mxaccess_gateway.v1.MxaccessGateway.BulkReadResult;
import mxaccess_gateway.v1.MxaccessGateway.BulkWriteResult;
@@ -1366,6 +1367,13 @@ public final class MxGatewayCli implements Callable<Integer> {
@Option(names = "--server-name-override", description = "TLS server name override.")
String serverNameOverride = "";
@Option(
names = "--require-certificate-validation",
description =
"Verify the server certificate against the JVM trust store "
+ "(disables the lenient default; ignored with --plaintext or --ca-file pinning).")
boolean requireCertificateValidation;
@Option(names = "--timeout", defaultValue = "30s", description = "Per-call timeout.")
String timeout;
@@ -1388,6 +1396,7 @@ public final class MxGatewayCli implements Callable<Integer> {
.plaintext(plaintext)
.caCertificatePath(caFile)
.serverNameOverride(serverNameOverride)
.requireCertificateValidation(requireCertificateValidation)
.callTimeout(resolvedTimeout)
.build();
}
@@ -1400,6 +1409,7 @@ public final class MxGatewayCli implements Callable<Integer> {
values.put("plaintext", plaintext);
values.put("caFile", caFile == null ? "" : caFile.toString());
values.put("serverNameOverride", serverNameOverride);
values.put("requireCertificateValidation", requireCertificateValidation);
values.put("timeout", timeout);
return values;
}
@@ -1703,6 +1713,12 @@ public final class MxGatewayCli implements Callable<Integer> {
transition.getTransitionKind().name(),
transition.getSeverity());
}
case PROVIDER_STATUS -> {
AlarmProviderStatus status = message.getProviderStatus();
yield String.format(
"provider-status mode=%s degraded=%b reason=%s",
status.getMode().name(), status.getDegraded(), status.getReason());
}
case PAYLOAD_NOT_SET -> "unknown";
};
}
@@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import com.zb.mom.ww.mxgateway.client.MxGatewayAlarmFeedSubscription;
import com.zb.mom.ww.mxgateway.client.MxGatewayClientOptions;
import io.grpc.stub.StreamObserver;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
@@ -289,6 +290,51 @@ final class MxGatewayCliTests {
}
}
@Test
void requireCertificateValidationFlagPropagatesThroughToClientOptions() {
    // Regression guard for Client.Java-038: passing --require-certificate-validation
    // on the command line has to surface as requireCertificateValidation on the
    // MxGatewayClientOptions built by CommonOptions.toClientOptions(); that is the
    // only way a CLI user can ask for strict JVM-trust verification without a
    // pinned CA.
    CapturingClientFactory capturing = new CapturingClientFactory();

    CliRun result = execute(
            capturing,
            "acknowledge-alarm",
            "--endpoint",
            "localhost:5000",
            "--api-key-env",
            "MXGATEWAY_API_KEY",
            "--require-certificate-validation",
            "--reference",
            "Tank01.Level.HiHi");

    assertEquals(0, result.exitCode(), "errors:\n" + result.errors());

    MxGatewayClientOptions captured = capturing.capturedClientOptions;
    boolean strictValidation = captured.requireCertificateValidation();
    assertTrue(
            strictValidation,
            "--require-certificate-validation did not propagate into MxGatewayClientOptions");
}
@Test
void requireCertificateValidationDefaultsToLenientWhenFlagAbsent() {
    // Counterpart to the flag-present case: when the option is omitted the
    // captured client options must keep requireCertificateValidation == false,
    // i.e. the lenient-by-default trust posture is unchanged.
    CapturingClientFactory capturing = new CapturingClientFactory();

    CliRun result = execute(
            capturing,
            "acknowledge-alarm",
            "--endpoint",
            "localhost:5000",
            "--api-key-env",
            "MXGATEWAY_API_KEY",
            "--reference",
            "Tank01.Level.HiHi");

    assertEquals(0, result.exitCode(), "errors:\n" + result.errors());

    MxGatewayClientOptions captured = capturing.capturedClientOptions;
    boolean strictValidation = captured.requireCertificateValidation();
    assertFalse(
            strictValidation,
            "requireCertificateValidation should default to false (lenient)");
}
@Test
void streamAlarmsCommandFailsFastOnQueueOverflow() {
// Client.Java-033 regression — the CLI's stream-alarms bounded queue
@@ -435,6 +481,23 @@ final class MxGatewayCliTests {
}
}
/**
 * Test factory that records the {@link MxGatewayClientOptions} produced by
 * {@link MxGatewayCli.CommonOptions#toClientOptions()} so a test can assert
 * how CLI flags map onto the library option surface. Returns the standard
 * {@link FakeClient} so the command body still completes. Used by the
 * Client.Java-038 option-flow regression.
 *
 * <p>{@code capturedClientOptions} stays {@code null} until
 * {@link #connect(MxGatewayCli.CommonOptions)} has been invoked, and is
 * overwritten on every subsequent call.
 */
private static final class CapturingClientFactory implements MxGatewayCli.MxGatewayCliClientFactory {
// Options built from the most recent connect() call; read directly by the tests.
private MxGatewayClientOptions capturedClientOptions;
@Override
public MxGatewayCli.MxGatewayCliClient connect(MxGatewayCli.CommonOptions options) {
// Build the client options here, at connect time, so the test can inspect
// exactly what the parsed CLI flags produced.
capturedClientOptions = options.toClientOptions();
// Hand back a fake client that is given the command line's output writer.
return new FakeClient(options.spec.commandLine().getOut());
}
}
/**
* Factory whose fake client floods the {@code streamAlarms} observer with
* 2000 messages synchronously, exceeding the CLI's bounded 1024-element
+12 -1
View File
@@ -238,7 +238,11 @@ left `False`, the client fetches the gateway's presented certificate once
to `localhost` (the generated certificate always carries a `localhost` SAN) when
none was supplied. To verify instead, pass `ca_file` to verify against a specific
CA, or set `require_certificate_validation=True` to verify against the system
trust roots. See
trust roots. The strict posture is reachable through every documented entry
point: the `require_certificate_validation=True` keyword on
`GatewayClient.connect(...)` / `GalaxyRepositoryClient.connect(...)`, the
`ClientOptions(require_certificate_validation=True)` struct, and the
`--require-certificate-validation` CLI flag. See
[Gateway Configuration](../../docs/GatewayConfiguration.md#automatic-self-signed-certificate).
## CLI
@@ -267,6 +271,13 @@ Use TLS options for a secured gateway:
mxgw-py smoke --endpoint mxgateway.example.local:5001 --tls --ca-file C:\certs\mxgateway-ca.pem --server-name-override mxgateway.example.local --api-key-env MXGATEWAY_API_KEY --item Object.Attribute --json
```
To force certificate validation against the system trust store instead of the
lenient trust-on-first-use default, add `--require-certificate-validation`:
```powershell
mxgw-py smoke --endpoint mxgateway.example.local:5001 --tls --require-certificate-validation --api-key-env MXGATEWAY_API_KEY --item Object.Attribute --json
```
## Integration Checks
Run live checks only when a gateway and MXAccess-backed worker are available:
+8 -4
View File
@@ -1,10 +1,12 @@
[build-system]
requires = ["setuptools>=69", "wheel"]
# setuptools >=77 emits core-metadata 2.4 (PEP 639 License-Expression), which the
# Gitea PyPI feed does not yet accept; cap below that so the dist stays <=2.3.
requires = ["setuptools>=69,<77", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "zb-mom-ww-mxaccess-gateway-client"
version = "0.1.0"
version = "0.1.1"
description = "Async Python client scaffold for MXAccess Gateway."
readme = "README.md"
requires-python = ">=3.12"
@@ -16,11 +18,10 @@ dependencies = [
authors = [
{ name = "Joseph Doherty" },
]
license = { text = "Proprietary" }
keywords = ["mxaccess", "mxgateway", "grpc", "client", "archestra"]
classifiers = [
"Development Status :: 3 - Alpha",
"License :: Other/Proprietary License",
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
@@ -54,3 +55,6 @@ where = ["src"]
addopts = "-ra"
pythonpath = ["src"]
testpaths = ["tests"]
markers = [
"tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1",
]
@@ -40,6 +40,7 @@ class GatewayClient:
api_key: str | None = None,
plaintext: bool = False,
ca_file: str | None = None,
require_certificate_validation: bool = False,
server_name_override: str | None = None,
stub: Any | None = None,
) -> "GatewayClient":
@@ -50,13 +51,16 @@ class GatewayClient:
api_key=api_key,
plaintext=plaintext,
ca_file=ca_file,
require_certificate_validation=require_certificate_validation,
server_name_override=server_name_override,
)
if stub is not None:
return cls(options=resolved, stub=stub)
channel = create_channel(resolved)
# create_channel may perform a blocking TLS certificate probe (TOFU
# default); run it off the event loop so connect never freezes the loop.
channel = await asyncio.to_thread(create_channel, resolved)
return cls(
options=resolved,
stub=pb_grpc.MxAccessGatewayStub(channel),
@@ -52,6 +52,7 @@ class GalaxyRepositoryClient:
api_key: str | None = None,
plaintext: bool = False,
ca_file: str | None = None,
require_certificate_validation: bool = False,
server_name_override: str | None = None,
stub: Any | None = None,
) -> "GalaxyRepositoryClient":
@@ -62,13 +63,16 @@ class GalaxyRepositoryClient:
api_key=api_key,
plaintext=plaintext,
ca_file=ca_file,
require_certificate_validation=require_certificate_validation,
server_name_override=server_name_override,
)
if stub is not None:
return cls(options=resolved, stub=stub)
channel = create_channel(resolved)
# create_channel may perform a blocking TLS certificate probe (TOFU
# default); run it off the event loop so connect never freezes the loop.
channel = await asyncio.to_thread(create_channel, resolved)
return cls(
options=resolved,
stub=galaxy_pb_grpc.GalaxyRepositoryStub(channel),
@@ -12,6 +12,10 @@ import grpc
from .auth import REDACTED, ApiKey
from .errors import MxGatewayTransportError
# Fallback bound for the TOFU certificate probe when no call_timeout is set, so a
# black-holed host fails fast instead of hanging on the OS default connect timeout.
_TOFU_PROBE_TIMEOUT_SECONDS = 10.0
@dataclass(frozen=True)
class ClientOptions:
@@ -88,8 +92,17 @@ def _split_authority(endpoint: str) -> tuple[str, int]:
remainder = target[bracket_end + 1 :] # ":5120" or ""
port_str = remainder.lstrip(":")
return (host, int(port_str) if port_str else 443)
host, _, port = target.rpartition(":")
return (host or "localhost", int(port) if port else 443)
host, sep, port = target.rpartition(":")
if not sep:
# No colon at all (e.g. a bare hostname "mygateway"): the whole target
# is the host; default the port rather than raising on int("mygateway").
return (target or "localhost", 443)
if not port.isdigit():
# A colon with a non-numeric / empty tail (e.g. a trailing ":") is not
# an explicit port — keep the left side as the host and default the
# port so a typo cannot raise an uncaught ValueError on the TOFU path.
return (host or "localhost", 443)
return (host or "localhost", int(port))
def create_channel(options: ClientOptions) -> grpc.aio.Channel:
@@ -120,9 +133,15 @@ def create_channel(options: ClientOptions) -> grpc.aio.Channel:
else:
# Lenient default: grpc-python has no per-channel skip-verify, so fetch the
# server's certificate (unverified) and pin it for this channel (TOFU).
# The probe opens a real blocking TCP+TLS socket, so it MUST be bounded —
# a black-holed / firewall-drop host would otherwise hang on the OS default
# connect timeout (minutes). Bound it by call_timeout (or a short fixed
# fallback) so the dial fails fast as a transport error. The async
# `connect` classmethods run this off the event loop (asyncio.to_thread).
host, port = _split_authority(options.endpoint)
probe_timeout = options.call_timeout if options.call_timeout else _TOFU_PROBE_TIMEOUT_SECONDS
try:
presented = ssl.get_server_certificate((host, port))
presented = ssl.get_server_certificate((host, port), timeout=probe_timeout)
except OSError as error:
raise MxGatewayTransportError(
f"failed to fetch TLS certificate from {options.endpoint}: {error}"
@@ -170,6 +170,13 @@ def gateway_options(command: Callable[..., Any]) -> Callable[..., Any]:
command = click.option("--plaintext", is_flag=True, help="Use plaintext gRPC.")(command)
command = click.option("--tls", "use_tls", is_flag=True, help="Use TLS gRPC.")(command)
command = click.option("--ca-file", default=None, help="Custom root certificate file.")(command)
command = click.option(
"--require-certificate-validation",
"require_certificate_validation",
is_flag=True,
help="Verify the TLS certificate against the system trust store "
"instead of the lenient trust-on-first-use default.",
)(command)
command = click.option(
"--server-name-override",
default=None,
@@ -923,6 +930,7 @@ async def _connect(kwargs: dict[str, Any]) -> GatewayClient:
api_key=api_key,
plaintext=_use_plaintext(kwargs),
ca_file=kwargs.get("ca_file"),
require_certificate_validation=bool(kwargs.get("require_certificate_validation")),
server_name_override=kwargs.get("server_name_override"),
call_timeout=kwargs.get("call_timeout"),
stream_timeout=kwargs.get("stream_timeout"),
+50 -2
View File
@@ -1,9 +1,12 @@
"""Tests for auth metadata and connection options."""
import socket
import pytest
from zb_mom_ww_mxgateway.auth import REDACTED, ApiKey, auth_metadata, redact_secret
from zb_mom_ww_mxgateway import options as options_module
from zb_mom_ww_mxgateway.errors import MxGatewayTransportError
from zb_mom_ww_mxgateway.options import ClientOptions, create_channel
@@ -80,7 +83,9 @@ def test_create_channel_uses_tls_channel_tofu_default(monkeypatch: pytest.Monkey
_DUMMY_PEM = "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n"
get_cert_calls: list[tuple[str, int]] = []
def fake_get_server_certificate(addr: tuple[str, int]) -> str:
def fake_get_server_certificate(
addr: tuple[str, int], *, timeout: float | None = None
) -> str:
get_cert_calls.append(addr)
return _DUMMY_PEM
@@ -133,7 +138,7 @@ def test_create_channel_uses_tls_channel_tofu_respects_server_name_override(
monkeypatch.setattr(
options_module.ssl,
"get_server_certificate",
lambda addr: _DUMMY_PEM,
lambda addr, *, timeout=None: _DUMMY_PEM,
)
cred_calls: list[object] = []
@@ -276,3 +281,46 @@ def test_create_channel_uses_tls_channel_ca_file(
],
),
]
def test_tofu_probe_passes_a_bounded_timeout(monkeypatch: pytest.MonkeyPatch) -> None:
    """The TOFU certificate pre-fetch is given a finite timeout so an unreachable host fails fast."""
    seen_timeouts: list[object] = []

    def recording_get_server_certificate(
        addr: object, *, timeout: float | None = None
    ) -> str:
        # Remember the timeout the probe was invoked with and hand back a dummy PEM.
        seen_timeouts.append(timeout)
        return "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n"

    def stub_credentials(**_: object) -> str:
        return "creds"

    def stub_secure_channel(endpoint: object, credentials: object, *, options: object) -> str:
        return "tls-channel"

    monkeypatch.setattr(
        options_module.ssl, "get_server_certificate", recording_get_server_certificate
    )
    monkeypatch.setattr(options_module.grpc, "ssl_channel_credentials", stub_credentials)
    monkeypatch.setattr(options_module.grpc.aio, "secure_channel", stub_secure_channel)

    create_channel(ClientOptions(endpoint="gateway.example:5001", call_timeout=7.5))

    # The probe must receive a finite, positive timeout no larger than call_timeout.
    probe_timeout = seen_timeouts[-1]
    assert isinstance(probe_timeout, (int, float))
    assert 0 < probe_timeout <= 7.5
@pytest.mark.parametrize(
    "raised",
    [socket.timeout("timed out"), TimeoutError("timed out"), OSError("connection refused")],
)
def test_tofu_probe_timeout_raises_transport_error(
    monkeypatch: pytest.MonkeyPatch, raised: Exception
) -> None:
    """Probe failures (timeouts included) are reported as MxGatewayTransportError, never raw."""

    def failing_get_server_certificate(
        addr: object, *, timeout: float | None = None
    ) -> str:
        # Simulate the probe failing with the parametrized exception.
        raise raised

    monkeypatch.setattr(
        options_module.ssl, "get_server_certificate", failing_get_server_certificate
    )

    client_options = ClientOptions(endpoint="gateway.example:5001")
    with pytest.raises(MxGatewayTransportError) as caught:
        create_channel(client_options)

    # The wrapped error message must name the endpoint that failed.
    assert client_options.endpoint in str(caught.value)
+65
View File
@@ -2,14 +2,79 @@
import json
import pytest
from click.testing import CliRunner
from zb_mom_ww_mxgateway import __version__
from zb_mom_ww_mxgateway_cli import commands as commands_module
from zb_mom_ww_mxgateway_cli.commands import main
_BATCH_EOR = "__MXGW_BATCH_EOR__"
def test_require_certificate_validation_flag_flows_through_connect(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """--require-certificate-validation on the CLI must end up on ClientOptions (Client.Python-027)."""
    seen: dict[str, object] = {}

    async def recording_connect(options, **_ignored):
        seen["options"] = options
        # Hand back a minimal stand-in implementing the async context-manager
        # protocol that the CLI command body uses around its client.
        return _FakeAsyncClient()

    monkeypatch.setattr(commands_module.GatewayClient, "connect", recording_connect)

    argv = [
        "open-session",
        "--endpoint",
        "gateway.example:5001",
        "--require-certificate-validation",
        "--json",
    ]
    outcome = CliRunner().invoke(main, argv)

    assert outcome.exit_code == 0, outcome.output
    assert seen["options"].require_certificate_validation is True
def test_require_certificate_validation_defaults_off(monkeypatch: pytest.MonkeyPatch) -> None:
    """Omitting the flag leaves strict validation disabled (the TOFU default)."""
    seen: dict[str, object] = {}

    async def recording_connect(options, **_ignored):
        seen["options"] = options
        return _FakeAsyncClient()

    monkeypatch.setattr(commands_module.GatewayClient, "connect", recording_connect)

    argv = ["open-session", "--endpoint", "gateway.example:5001", "--plaintext", "--json"]
    outcome = CliRunner().invoke(main, argv)

    assert outcome.exit_code == 0, outcome.output
    assert seen["options"].require_certificate_validation is False
class _FakeAsyncClient:
    """Bare-bones async context manager standing in for the client in open-session CLI tests."""

    async def __aenter__(self) -> "_FakeAsyncClient":
        # Entering the context yields the fake itself.
        return self

    async def __aexit__(self, *_exc: object) -> None:
        # Nothing to release; returning None lets exceptions propagate.
        return None

    async def open_session_raw(self, *_args, **_kwargs):
        # Imported lazily so the generated protobuf module is only loaded on use.
        from zb_mom_ww_mxgateway.generated import mxaccess_gateway_pb2 as gateway_pb

        reply = gateway_pb.OpenSessionReply(session_id="cli-test-session")
        return reply
def test_version_json_is_deterministic() -> None:
runner = CliRunner()
@@ -8,9 +8,107 @@ from typing import Any
import pytest
from zb_mom_ww_mxgateway import ClientOptions, GatewayClient, MxAccessError
from zb_mom_ww_mxgateway import client as client_module
from zb_mom_ww_mxgateway import galaxy as galaxy_module
from zb_mom_ww_mxgateway.galaxy import GalaxyRepositoryClient
from zb_mom_ww_mxgateway.generated import mxaccess_gateway_pb2 as pb
@pytest.mark.asyncio
async def test_gateway_connect_forwards_require_certificate_validation(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """GatewayClient.connect's convenience kwarg must land on ClientOptions (Client.Python-027)."""
    seen: dict[str, Any] = {}

    def recording_create_channel(options: ClientOptions) -> object:
        # Keep the resolved options so the assertion below can inspect them.
        seen["options"] = options
        return object()

    def stub_factory(channel: object) -> object:
        return object()

    monkeypatch.setattr(client_module, "create_channel", recording_create_channel)
    monkeypatch.setattr(client_module.pb_grpc, "MxAccessGatewayStub", stub_factory)

    await GatewayClient.connect(
        endpoint="gateway.example:5001",
        require_certificate_validation=True,
    )

    resolved = seen["options"]
    assert resolved.require_certificate_validation is True
@pytest.mark.asyncio
async def test_galaxy_connect_forwards_require_certificate_validation(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """GalaxyRepositoryClient.connect has to pass the flag along as well (Client.Python-027)."""
    seen: dict[str, Any] = {}

    def recording_create_channel(options: ClientOptions) -> object:
        # Keep the resolved options so the assertion below can inspect them.
        seen["options"] = options
        return object()

    def stub_factory(channel: object) -> object:
        return object()

    monkeypatch.setattr(galaxy_module, "create_channel", recording_create_channel)
    monkeypatch.setattr(galaxy_module.galaxy_pb_grpc, "GalaxyRepositoryStub", stub_factory)

    await GalaxyRepositoryClient.connect(
        endpoint="gateway.example:5001",
        require_certificate_validation=True,
    )

    resolved = seen["options"]
    assert resolved.require_certificate_validation is True
@pytest.mark.asyncio
async def test_gateway_connect_runs_create_channel_off_the_event_loop(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """GatewayClient.connect must invoke the blocking channel factory off the loop (Client.Python-028)."""
    observed: dict[str, bool] = {}

    def probing_create_channel(options: ClientOptions) -> object:
        # get_running_loop() only succeeds when called on the event-loop
        # thread; a RuntimeError therefore means we were moved off the loop.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            observed["off_loop"] = True
        else:
            observed["off_loop"] = False
        return object()

    def stub_factory(channel: object) -> object:
        return object()

    monkeypatch.setattr(client_module, "create_channel", probing_create_channel)
    monkeypatch.setattr(client_module.pb_grpc, "MxAccessGatewayStub", stub_factory)

    await GatewayClient.connect(endpoint="gateway.example:5001")

    assert observed["off_loop"] is True
@pytest.mark.asyncio
async def test_galaxy_connect_runs_create_channel_off_the_event_loop(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """GalaxyRepositoryClient.connect must likewise run the channel factory off the loop (Client.Python-028)."""
    observed: dict[str, bool] = {}

    def probing_create_channel(options: ClientOptions) -> object:
        # get_running_loop() only succeeds when called on the event-loop
        # thread; a RuntimeError therefore means we were moved off the loop.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            observed["off_loop"] = True
        else:
            observed["off_loop"] = False
        return object()

    def stub_factory(channel: object) -> object:
        return object()

    monkeypatch.setattr(galaxy_module, "create_channel", probing_create_channel)
    monkeypatch.setattr(galaxy_module.galaxy_pb_grpc, "GalaxyRepositoryStub", stub_factory)

    await GalaxyRepositoryClient.connect(endpoint="gateway.example:5001")

    assert observed["off_loop"] is True
@pytest.mark.asyncio
async def test_session_helpers_send_auth_metadata_and_preserve_raw_replies() -> None:
stub = FakeGatewayStub()
+11
View File
@@ -134,6 +134,17 @@ def test_split_authority_parses_host_and_port() -> None:
assert _split_authority(":5120") == ("localhost", 5120)
def test_split_authority_defaults_port_for_portless_endpoint() -> None:
    """Endpoints lacking a usable numeric port resolve to the default port 443."""
    from zb_mom_ww_mxgateway.options import _split_authority

    expectations = (
        # Bare hostname with no ":port": default to 443 rather than crash on int("mygateway").
        ("mygateway", ("mygateway", 443)),
        # The same bare hostname behind a scheme prefix.
        ("https://mygateway", ("mygateway", 443)),
        # A colon followed by a non-numeric (here empty) tail is not an explicit port.
        ("mygateway:", ("mygateway", 443)),
    )
    for endpoint, expected in expectations:
        assert _split_authority(endpoint) == expected
def test_split_authority_strips_ipv6_brackets() -> None:
from zb_mom_ww_mxgateway.options import _split_authority
+69 -2
View File
@@ -207,6 +207,22 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
[[package]]
name = "core-foundation"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "either"
version = "1.15.0"
@@ -574,7 +590,7 @@ checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
[[package]]
name = "mxgw-cli"
version = "0.1.0"
version = "0.1.1"
dependencies = [
"clap",
"futures-util",
@@ -597,6 +613,12 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "openssl-probe"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
[[package]]
name = "percent-encoding"
version = "2.3.2"
@@ -796,6 +818,18 @@ dependencies = [
"zeroize",
]
[[package]]
name = "rustls-native-certs"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
dependencies = [
"openssl-probe",
"rustls-pki-types",
"schannel",
"security-framework",
]
[[package]]
name = "rustls-pki-types"
version = "1.14.1"
@@ -816,6 +850,38 @@ dependencies = [
"untrusted",
]
[[package]]
name = "schannel"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "security-framework"
version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
dependencies = [
"bitflags",
"core-foundation",
"core-foundation-sys",
"libc",
"security-framework-sys",
]
[[package]]
name = "security-framework-sys"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "semver"
version = "1.0.28"
@@ -1056,6 +1122,7 @@ dependencies = [
"percent-encoding",
"pin-project",
"prost",
"rustls-native-certs",
"socket2 0.5.10",
"tokio",
"tokio-rustls",
@@ -1423,7 +1490,7 @@ dependencies = [
[[package]]
name = "zb-mom-ww-mxgateway-client"
version = "0.1.0"
version = "0.1.1"
dependencies = [
"futures-core",
"futures-util",
+3 -3
View File
@@ -1,6 +1,6 @@
[package]
name = "zb-mom-ww-mxgateway-client"
version = "0.1.0"
version = "0.1.1"
edition = "2021"
authors = ["Joseph Doherty"]
description = "Async Rust client for the MxAccessGateway gRPC service, including a lazy-browse walker over the Galaxy Repository hierarchy."
@@ -20,7 +20,7 @@ resolver = "2"
[workspace.package]
edition = "2021"
version = "0.1.0"
version = "0.1.1"
authors = ["Joseph Doherty"]
license = "Proprietary"
repository = "https://gitea.dohertylan.com/dohertj2/mxaccessgw"
@@ -37,7 +37,7 @@ serde_json = "1.0.145"
thiserror = "2.0.17"
tokio = { version = "1.48.0", features = ["macros", "rt-multi-thread", "sync", "time"] }
tokio-stream = { version = "0.1.17", features = ["net"] }
tonic = { version = "0.13.1", features = ["transport", "tls-ring"] }
tonic = { version = "0.13.1", features = ["transport", "tls-ring", "tls-native-roots"] }
tonic-build = "0.13.1"
[dependencies]
+14 -6
View File
@@ -81,11 +81,19 @@ cargo run -p mxgw-cli -- smoke --endpoint https://mxgateway.example.local:5001 -
The gateway can auto-generate its own self-signed certificate (it has no PKI).
Unlike the other clients, the Rust client is **not** lenient: tonic 0.13.1
exposes no public hook to inject a custom certificate verifier, so TLS over Rust
is pin-only. A TLS connection requires either `--ca-file` /
`ClientOptions::with_ca_file(...)` to pin a CA (export the gateway's self-signed
certificate and pin it), or `--require-certificate-validation` /
`with_require_certificate_validation(true)` to verify against the system trust
roots. TLS with neither set fails `connect` with a clear, actionable error rather
cannot accept an *arbitrary* self-signed certificate. A TLS connection requires
one of two trust paths:
- `--ca-file` / `ClientOptions::with_ca_file(...)` to pin a CA (export the
gateway's self-signed certificate and pin it). This is the path for a
self-signed gateway.
- `--require-certificate-validation` / `with_require_certificate_validation(true)`
to verify against the operating system's trust roots (`tls-native-roots`). This
only succeeds for a certificate that chains to a root the host already trusts —
i.e. a gateway fronted by a publicly- or enterprise-CA-issued certificate, not a
bare self-signed one.
TLS with neither set fails `connect` with a clear, actionable error rather
than accepting the certificate. See
[Gateway Configuration](../../docs/GatewayConfiguration.md#automatic-self-signed-certificate).
@@ -271,5 +279,5 @@ Then add the dependency:
```toml
[dependencies]
zb-mom-ww-mxgateway-client = { version = "0.1.0", registry = "dohertj2-gitea" }
zb-mom-ww-mxgateway-client = { version = "0.1.1", registry = "dohertj2-gitea" }
```
+80 -12
View File
@@ -162,12 +162,73 @@ impl GatewayClient {
`stream_alarms` opens with one `active_alarm` per currently-active alarm
(the ConditionRefresh snapshot), then a single `snapshot_complete`, then a
`transition` for every subsequent raise / acknowledge / clear. The feed is
served by the gateway's always-on alarm monitor — no worker session is
opened — so any number of clients may attach. Dropping the stream cancels
the gRPC call cooperatively. `acknowledge_alarm` is idempotent at the
MxAccess layer; the returned `AcknowledgeAlarmReply` carries the native
MxStatus from the worker.
`transition` for every subsequent raise / acknowledge / clear. A fourth
`provider_status` oneof case (`AlarmProviderStatus`: `mode`, `degraded`,
`reason`, `since`) is emitted once on stream open and again on every
failover/failback so late joiners learn the current alarm-provider mode.
The CLI renders all four cases in both its one-line summary and its
protobuf-JSON output (`alarm_feed_message_summary` /
`alarm_feed_message_to_json`). The feed is served by the gateway's always-on
alarm monitor — no worker session is opened — so any number of clients may
attach. Dropping the stream cancels the gRPC call cooperatively.
`acknowledge_alarm` is idempotent at the MxAccess layer; the returned
`AcknowledgeAlarmReply` carries the native MxStatus from the worker.
## Galaxy Repository
`GalaxyClient` is a session-less metadata client (requires the
`metadata:read` API-key scope). Alongside `test_connection`,
`get_last_deploy_time`, `discover_hierarchy`, and `watch_deploy_events`, it
exposes a lazy hierarchy walker built on the `BrowseChildren` RPC:
```rust
impl GalaxyClient {
pub async fn browse(&mut self, options: Option<BrowseChildrenOptions>) -> Result<Vec<LazyBrowseNode>, Error>;
pub async fn browse_children_raw(&mut self, request: BrowseChildrenRequest) -> Result<BrowseChildrenReply, Error>;
}
pub struct BrowseChildrenOptions {
pub category_ids: Vec<i32>,
pub template_chain_contains: Vec<String>,
pub tag_name_glob: Option<String>,
pub include_attributes: Option<bool>,
pub alarm_bearing_only: bool,
pub historized_only: bool,
}
impl LazyBrowseNode {
pub fn object(&self) -> &GalaxyObject;
pub fn has_children_hint(&self) -> bool;
pub async fn children(&self) -> Vec<LazyBrowseNode>;
pub async fn is_expanded(&self) -> bool;
pub async fn expand(&self) -> Result<(), Error>;
}
```
- `browse(options)` returns the root objects as `LazyBrowseNode`s. The
supplied `BrowseChildrenOptions` filter is captured and reused when any
returned node is expanded, so a single filter set scopes the entire walk.
- `BrowseChildrenOptions` mirrors the request-level filters on the wire and
combines them with **AND**: a child appears only when it satisfies every
populated criterion (`category_ids` membership, every
`template_chain_contains` substring, the `tag_name_glob`, plus the
`alarm_bearing_only` / `historized_only` flags). `include_attributes` is a
tri-state (`None` = server default). Empty/`None` fields impose no
restriction. See
[Galaxy Repository — BrowseChildren](../../docs/GalaxyRepository.md#browsechildren)
for the wire-level semantics.
- `LazyBrowseNode` is cheap to clone — clones share state through an internal
`Arc`, so expanding one clone makes the children visible to every clone.
`has_children_hint()` exposes the server's `child_has_children` hint so a UI
can draw an expand affordance without issuing an RPC. `expand()` is
idempotent: the first call issues a paged `BrowseChildren` walk (page size
500) under an async mutex held across the await, sets the `is_expanded`
flag, and caches the children; subsequent calls are no-ops and issue no
further RPCs. The internal paged loop guards against a server returning a
repeated `next_page_token` by failing with `Error::InvalidArgument` rather
than looping forever.
- `browse_children_raw` issues a single `BrowseChildren` RPC and returns the
raw reply for callers that want to drive paging themselves.
## Authentication
@@ -200,13 +261,20 @@ Rust client is therefore **pin-only** — it requires either:
- `ClientOptions::with_ca_file(...)` to pin a CA (the supported path for the
gateway's self-signed certificate; export the certificate and pin it), or
- `ClientOptions::with_require_certificate_validation(true)` to verify against the
system trust roots.
operating system's trust roots. This enables the `tonic` `tls-native-roots`
feature and calls `ClientTlsConfig::with_native_roots()`, so the handshake
validates a certificate that chains to a root the host already trusts. It does
**not** accept a bare self-signed gateway certificate — that still needs
`with_ca_file`.
With TLS enabled (`with_plaintext(false)`), no pinned CA, and certificate
validation not required, `GatewayClient::connect` rejects the connection with a
clear, actionable error pointing at `with_ca_file` /
`require_certificate_validation` rather than silently accepting the certificate.
The CLI exposes `--ca-file` and `--require-certificate-validation`.
`build_tls_config` computes the trust posture with the pure `tls_trust_decision`
helper (`None` / `PinnedCa` / `SystemRoots` / `RejectNoCa`) so the posture is
unit-testable without a live handshake. With TLS enabled (`with_plaintext(false)`),
no pinned CA, and certificate validation not required (`RejectNoCa`),
`GatewayClient::connect` rejects the connection with a clear, actionable error
pointing at `with_ca_file` / `require_certificate_validation` rather than building
a config with zero trust anchors. The CLI exposes `--ca-file` and
`--require-certificate-validation`.
## Streaming
+63 -1
View File
@@ -1726,7 +1726,7 @@ fn event_value_to_json(value: &ProtoMxValue) -> Value {
}
/// Render a streamed [`AlarmFeedMessage`] as a terse one-line summary that
/// distinguishes the three `payload` oneof cases.
/// distinguishes the four `payload` oneof cases.
fn alarm_feed_message_summary(message: &AlarmFeedMessage) -> String {
match &message.payload {
Some(alarm_feed_message::Payload::ActiveAlarm(snapshot)) => {
@@ -1746,6 +1746,14 @@ fn alarm_feed_message_summary(message: &AlarmFeedMessage) -> String {
AlarmEnumName::transition_kind(transition.transition_kind)
)
}
Some(alarm_feed_message::Payload::ProviderStatus(status)) => {
format!(
"provider-status mode={} degraded={} reason={:?}",
AlarmEnumName::provider_mode(status.mode),
status.degraded,
status.reason
)
}
None => "(empty)".to_owned(),
}
}
@@ -1784,6 +1792,17 @@ fn alarm_feed_message_to_json(message: &AlarmFeedMessage) -> Value {
"description": transition.description,
}
}),
Some(alarm_feed_message::Payload::ProviderStatus(status)) => json!({
"providerStatus": {
"mode": AlarmEnumName::provider_mode(status.mode),
"degraded": status.degraded,
"reason": status.reason,
"since": status.since.as_ref().map(|ts| json!({
"seconds": ts.seconds,
"nanos": ts.nanos,
})),
}
}),
None => Value::Null,
}
}
@@ -1806,6 +1825,13 @@ impl AlarmEnumName {
.map(|kind| kind.as_str_name().to_owned())
.unwrap_or_else(|_| value.to_string())
}
/// Map a raw `AlarmProviderMode` wire value to its protobuf enum name,
/// falling back to the decimal number when the value is not a known variant.
fn provider_mode(value: i32) -> String {
    use zb_mom_ww_mxgateway_client::generated::mxaccess_gateway::v1::AlarmProviderMode;

    match AlarmProviderMode::try_from(value) {
        Ok(mode) => mode.as_str_name().to_owned(),
        Err(_) => value.to_string(),
    }
}
}
/// Render an [`AcknowledgeAlarmReply`] as a terse line or a JSON document.
@@ -2165,4 +2191,40 @@ mod tests {
assert_eq!(frac.seconds, utc.seconds);
assert_eq!(frac.nanos, 250_000_000);
}
/// A `provider_status` payload must show up in both the one-line summary and
/// the JSON rendering of an alarm-feed message.
#[test]
fn alarm_feed_provider_status_renders_in_summary_and_json() {
    use zb_mom_ww_mxgateway_client::generated::mxaccess_gateway::v1::{
        alarm_feed_message, AlarmFeedMessage, AlarmProviderMode, AlarmProviderStatus,
    };

    // Build the status first, then wrap it in the feed message's oneof.
    let status = AlarmProviderStatus {
        mode: AlarmProviderMode::Subtag as i32,
        degraded: true,
        reason: "alarmmgr unavailable".to_owned(),
        since: Some(prost_types::Timestamp {
            seconds: 1_777_995_000,
            nanos: 0,
        }),
    };
    let message = AlarmFeedMessage {
        payload: Some(alarm_feed_message::Payload::ProviderStatus(status)),
    };

    // One-line summary: tag, symbolic mode name and degraded flag.
    let line = super::alarm_feed_message_summary(&message);
    for needle in [
        "provider-status",
        "ALARM_PROVIDER_MODE_SUBTAG",
        "degraded=true",
    ] {
        assert!(line.contains(needle), "summary: {line}");
    }

    // JSON rendering: every field nested under `providerStatus`.
    let rendered = super::alarm_feed_message_to_json(&message);
    let provider = &rendered["providerStatus"];
    assert_eq!(provider["mode"], "ALARM_PROVIDER_MODE_SUBTAG");
    assert_eq!(provider["degraded"], true);
    assert_eq!(provider["reason"], "alarmmgr unavailable");
    assert_eq!(provider["since"]["seconds"], 1_777_995_000_i64);
}
}
+141 -44
View File
@@ -74,16 +74,22 @@ impl ClientOptions {
}
/// Require TLS certificate verification even without a pinned CA. Default
/// false: the gateway's self-signed certificate is accepted (internal-tool
/// posture). Setting a CA file always verifies.
/// false. Setting a CA file always verifies against that CA.
///
/// Note for Rust: tonic 0.13's `ClientTlsConfig` exposes no hook for a
/// custom rustls verifier, so the Rust client cannot accept an arbitrary
/// custom rustls verifier, so the Rust client cannot accept an *arbitrary*
/// self-signed certificate the way the other clients do. With the default
/// (false) and no pinned CA, [`crate::client::GatewayClient::connect`]
/// rejects the TLS connection and asks for a CA file. Either pin a CA via
/// [`ClientOptions::with_ca_file`] (the supported lenient path on Rust) or
/// set this `true` to verify against the system trust roots.
/// rejects the TLS connection and asks for a CA file. There are two
/// supported TLS paths:
///
/// - Pin the gateway certificate with [`ClientOptions::with_ca_file`] (the
/// lenient pin-only path; works for a self-signed gateway cert).
/// - Set this `true` to verify against the operating system's trust roots
/// (`tls-native-roots`). This only succeeds for a certificate that chains
/// to a root the host already trusts, so it is for gateways fronted by a
/// publicly- or enterprise-CA-issued certificate, not a bare self-signed
/// one.
pub fn with_require_certificate_validation(mut self, require: bool) -> Self {
self.require_certificate_validation = require;
self
@@ -175,26 +181,63 @@ impl ClientOptions {
}
}
/// Where the TLS handshake gets its trust anchors for a given set of options.
/// Computed by [`tls_trust_decision`] and applied by [`build_tls_config`];
/// split out so the trust posture is unit-testable without a live handshake.
///
/// The four variants mirror the four branches of [`tls_trust_decision`], which
/// checks plaintext first, then a pinned CA, then strict verification.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TlsTrustDecision {
/// Plaintext transport — no TLS, no trust anchors. [`build_tls_config`]
/// returns `Ok(None)` for this decision.
None,
/// Validate against the CA pinned with [`ClientOptions::with_ca_file`].
/// Chosen whenever a CA file is set, regardless of the strict flag.
PinnedCa,
/// Validate against the operating system's trust roots
/// (`require_certificate_validation == true`, no pinned CA).
SystemRoots,
/// Reject up front: TLS requested with neither a pinned CA nor strict
/// verification (the Rust pin-only lenient default). [`build_tls_config`]
/// turns this into an `Error::InvalidEndpoint`.
RejectNoCa,
}
/// Classify `options` into a [`TlsTrustDecision`] using only the option
/// getters — no file is read and no connection is attempted.
///
/// Precedence: plaintext wins, then a pinned CA file, then the strict
/// `require_certificate_validation` flag; anything else is rejected.
pub(crate) fn tls_trust_decision(options: &ClientOptions) -> TlsTrustDecision {
    if options.plaintext() {
        return TlsTrustDecision::None;
    }
    if options.ca_file().is_some() {
        return TlsTrustDecision::PinnedCa;
    }
    if options.require_certificate_validation() {
        TlsTrustDecision::SystemRoots
    } else {
        TlsTrustDecision::RejectNoCa
    }
}
/// Build the [`ClientTlsConfig`] for a non-plaintext connection described by
/// `options`, applying the lenient-default guard that is the **Rust
/// pin-only exception**.
///
/// Returns `Ok(None)` when `options.plaintext()` is `true` (no TLS needed).
/// Returns `Ok(Some(tls))` when a valid TLS config can be assembled.
/// Returns `Ok(Some(tls))` when a valid TLS config can be assembled — either
/// pinned to the CA from [`ClientOptions::with_ca_file`], or, when
/// `require_certificate_validation` is set with no pinned CA, verifying against
/// the operating system's trust roots (`tls-native-roots`).
/// Returns `Err(Error::InvalidEndpoint)` when TLS is requested but no pinned
/// CA was provided and `require_certificate_validation` is `false`.
///
/// # Why this guard exists
/// # Why the no-CA guard exists
///
/// `tonic` 0.13's `ClientTlsConfig` builds its rustls verifier inside a
/// crate-private connector and exposes no hook for a custom
/// `ServerCertVerifier`. The Rust client therefore cannot accept an arbitrary
/// `ServerCertVerifier`. The Rust client therefore cannot accept an *arbitrary*
/// self-signed certificate the way the other language clients do. Rather than
/// silently falling back to system-root verification (which always fails
/// against a self-signed gateway certificate), we reject the configuration
/// early with an actionable error.
/// silently falling back to a verifier with no trust anchors (which rejects
/// every certificate with a confusing handshake error), the lenient default
/// rejects the configuration early with an actionable error. The strict opt-in
/// instead loads the system trust roots so a certificate chaining to an
/// already-trusted root validates.
pub(crate) fn build_tls_config(options: &ClientOptions) -> Result<Option<ClientTlsConfig>, Error> {
if options.plaintext() {
let decision = tls_trust_decision(options);
if decision == TlsTrustDecision::None {
return Ok(None);
}
@@ -202,37 +245,46 @@ pub(crate) fn build_tls_config(options: &ClientOptions) -> Result<Option<ClientT
if let Some(server_name) = options.server_name_override() {
tls = tls.domain_name(server_name.to_owned());
}
if let Some(ca_file) = options.ca_file() {
let certificate = fs::read(ca_file).map_err(|source| Error::InvalidEndpoint {
endpoint: options.endpoint().to_owned(),
detail: format!("failed to read CA file {}: {source}", ca_file.display()),
})?;
tls = tls.ca_certificate(Certificate::from_pem(certificate));
} else if !options.require_certificate_validation() {
// Lenient-default fallback (Rust pin-only exception): tonic
// 0.13's `ClientTlsConfig` builds its rustls verifier inside a
// crate-private connector and exposes no hook for a custom
// `ServerCertVerifier`, so — unlike the other clients — the
// Rust client cannot accept an arbitrary self-signed cert. Pin
// the gateway's CA instead, or opt into strict verification
// against the system trust roots. We reject here rather than
// silently verifying against system roots (which would fail a
// self-signed gateway with a confusing handshake error).
//
// Note: a server-name override affects SNI (the hostname sent
// in the TLS ClientHello) but does NOT pin trust. Overriding
// the server name alone does not bypass certificate validation.
return Err(Error::InvalidEndpoint {
endpoint: options.endpoint().to_owned(),
detail: "TLS requested without a pinned CA. The Rust client cannot accept an \
arbitrary self-signed certificate (tonic 0.13 exposes no custom \
rustls verifier). Pin the gateway certificate with \
ClientOptions::with_ca_file, or call \
ClientOptions::with_require_certificate_validation(true) to verify \
against the system trust roots. Note: a server-name override \
affects SNI but does not pin trust."
.to_owned(),
});
match decision {
TlsTrustDecision::PinnedCa => {
let ca_file = options.ca_file().expect("PinnedCa implies a CA file");
let certificate = fs::read(ca_file).map_err(|source| Error::InvalidEndpoint {
endpoint: options.endpoint().to_owned(),
detail: format!("failed to read CA file {}: {source}", ca_file.display()),
})?;
tls = tls.ca_certificate(Certificate::from_pem(certificate));
}
TlsTrustDecision::SystemRoots => {
// Strict opt-in with no pinned CA: verify against the OS trust
// store. Without this the bare `ClientTlsConfig` carries zero
// trust anchors and rejects every certificate, so the documented
// "verify against the system trust roots" behaviour would be
// unreachable. Only a certificate chaining to an already-trusted
// root validates — a bare self-signed gateway cert still needs
// `with_ca_file`.
tls = tls.with_native_roots();
}
TlsTrustDecision::RejectNoCa => {
// Lenient-default fallback (Rust pin-only exception): the Rust
// client cannot accept an arbitrary self-signed cert. Pin the
// gateway's CA, or opt into strict verification against the
// system trust roots.
//
// Note: a server-name override affects SNI (the hostname sent in
// the TLS ClientHello) but does NOT pin trust.
return Err(Error::InvalidEndpoint {
endpoint: options.endpoint().to_owned(),
detail: "TLS requested without a pinned CA. The Rust client cannot accept an \
arbitrary self-signed certificate (tonic 0.13 exposes no custom \
rustls verifier). Pin the gateway certificate with \
ClientOptions::with_ca_file, or call \
ClientOptions::with_require_certificate_validation(true) to verify \
against the system trust roots. Note: a server-name override \
affects SNI but does not pin trust."
.to_owned(),
});
}
TlsTrustDecision::None => unreachable!("handled above"),
}
Ok(Some(tls))
}
@@ -269,6 +321,8 @@ mod tests {
use super::ClientOptions;
use crate::auth::ApiKey;
use super::{build_tls_config, tls_trust_decision, TlsTrustDecision};
#[test]
fn debug_redacts_api_key() {
let options =
@@ -279,4 +333,47 @@ mod tests {
assert!(debug.contains("<redacted>"));
assert!(!debug.contains("mxgw_secret"));
}
#[test]
fn plaintext_needs_no_tls() {
    // Plaintext transport: the decision is `None` and no TLS config is built.
    let options = ClientOptions::new("http://127.0.0.1:5000").with_plaintext(true);

    assert_eq!(tls_trust_decision(&options), TlsTrustDecision::None);

    let built = build_tls_config(&options).unwrap();
    assert!(built.is_none());
}
#[test]
fn pinned_ca_uses_pinned_trust() {
    // Only the decision is checked: the CA path is never read here, so it
    // does not have to exist on disk.
    let base = ClientOptions::new("https://127.0.0.1:5000");
    let options = base.with_plaintext(false).with_ca_file("/some/ca.pem");

    assert_eq!(tls_trust_decision(&options), TlsTrustDecision::PinnedCa);
}
#[test]
fn strict_without_ca_uses_system_roots() {
    // Regression for Client.Rust-031. With strict verification and no pinned
    // CA, the decision must be `SystemRoots` and building the config must
    // succeed with `Some(..)` rather than hit the no-CA rejection.
    let options = ClientOptions::new("https://127.0.0.1:5000")
        .with_plaintext(false)
        .with_require_certificate_validation(true);

    let decision = tls_trust_decision(&options);
    assert_eq!(
        decision,
        TlsTrustDecision::SystemRoots,
        "strict-no-CA must request the system trust roots"
    );

    let built = build_tls_config(&options).unwrap();
    assert!(
        built.is_some(),
        "strict-no-CA must build a usable TLS config"
    );
}
#[test]
fn lenient_without_ca_is_rejected() {
    // TLS with neither a pinned CA nor the strict flag: the decision is
    // `RejectNoCa` and building the config returns an error.
    let options = ClientOptions::new("https://127.0.0.1:5000").with_plaintext(false);

    assert_eq!(tls_trust_decision(&options), TlsTrustDecision::RejectNoCa);

    let outcome = build_tls_config(&options);
    assert!(outcome.is_err());
}
}
+98 -2
View File
@@ -4,8 +4,8 @@
|---|---|
| Module | `clients/dotnet` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
@@ -383,6 +383,40 @@ Re-review pass at `42b0037`. Diff against `d692232` consists of four commits:
| 9 | Testing coverage | No new issues — `RunAsync_StreamAlarms_*`, `RunAsync_AcknowledgeAlarm_*`, and `RunAsync_Batch_*` give the new surface unit coverage. `bench-read-bulk` is the same stress-harness-not-SDK shape called out in the prior re-review and is not flagged here. |
| 10 | Documentation & comments | Issue found (this review): the README examples for the two new alarm CLI subcommands cite wrong flag names and a non-existent `--session-id` (Client.Dotnet-018). The new XML docs on `StreamAlarmsAsync` / `AcknowledgeAlarmAsync` and on the bulk SDK methods are accurate and complete. |
#### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9`. The diff against `42b0037` is packaging/release metadata
(NuGet/Gitea feed), a TLS trust-posture option (`RequireCertificateValidation` + a
lenient accept-all default for the gateway's auto-generated self-signed cert), the
Galaxy `BrowseChildren` RPC plumbing plus a `LazyBrowseNode` lazy-browse walker, and
in-source resolutions of the prior pass's Client.Dotnet-018..021 (CLI flag-name README
fix, `RequireRegisterServerHandle`, `ParseTimeoutMs` negative guard, steady-state OCE
filter). The alarm-provider-fallback proto surface mentioned in the review brief is
**not** present in this diff — no `AlarmProviderMode` / `AlarmProviderStatus` /
`source_provider` / provider-mode-changed event reaches the .NET client here.
Build is green (`dotnet build … .slnx` succeeds) and all 78 unit tests pass (1 skipped
live smoke). The build now emits **10 CS1591 warnings** that do not break the build,
because the `clients/dotnet/Directory.Build.props` enforcement floor recorded as
resolved under Client.Dotnet-012 (`TreatWarningsAsErrors` / `EnforceCodeStyleInBuild` /
`AnalysisLevel` / `Deterministic`) is **absent** from the history that reaches HEAD —
the props file at HEAD is packaging-metadata-only (Client.Dotnet-022). `git merge-base
--is-ancestor a020350 HEAD` is false: the 2026-05-20 review-sweep commit that resolved
012 is not in this line of history.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | No new issues. The Galaxy `BrowseAsync` / `LazyBrowseNode.ExpandAsync` pagination correctly drains `next_page_token`, re-binds the same parent selector + filter set per page (matching the opaque-token contract), and guards against repeated tokens; the per-child `child_has_children` hint is read with an index-bounds check. The Client.Dotnet-019/021 in-source fixes (`RequireRegisterServerHandle`, `ParseTimeoutMs`) are correctly applied. |
| 2 | mxaccessgw conventions | Issue found (this review): the `clients/dotnet/Directory.Build.props` enforcement floor (warnings-as-errors / code-style enforcement) mandated by CLAUDE.md and recorded resolved under Client.Dotnet-012 is missing at HEAD; the new props file carries only packaging metadata (Client.Dotnet-022). Consumes the shared contracts project, no forked proto, `authorization: Bearer` metadata correct. |
| 3 | Concurrency & thread safety | Issue found (this review): `LazyBrowseNode.Children` and `IsExpanded` are read lock-free while `ExpandAsync` mutates `_children` and writes `_isExpanded` under `_expandLock`, with no release/acquire barrier to a concurrent reader (Client.Dotnet-025). `ExpandAsync`'s one-RPC dedup itself is correct (double-checked under the lock). |
| 4 | Error handling & resilience | No new issues — `BrowseChildrenAsync` routes `RpcException` through the shared `MapRpcException`; the bench steady-state OCE filter (Client.Dotnet-020) is correctly applied. |
| 5 | Security | No committed secret — the README Gitea-feed `dotnet nuget add source` example uses `<gitea-username>` / `<gitea-token-or-password>` placeholders. Note: TLS is lenient-by-default (accept-all callback when `UseTls` and no pinned CA), which disables certificate verification / MITM protection; this is an explicit, documented design choice for the gateway's auto-generated self-signed cert and is opt-out via `RequireCertificateValidation` or CA pinning, so not flagged as a finding. |
| 6 | Performance & resource management | No issues found — `LazyBrowseNode` holds one `SemaphoreSlim` per node (never disposed, but it owns no unmanaged handle and the node lifetime is the tree's); browse paging caps at 500/page. |
| 7 | Design-document adherence | No issues found — `BrowseChildren` / lazy-browse match `docs/GalaxyRepository.md#browsechildren`; the TLS posture matches `docs/GatewayConfiguration.md` (`RequireCertificateValidation` default `false`) and `DotnetClientDesign.md`. |
| 8 | Code organization & conventions | Issue found (this review): Client.Dotnet-022 (lost enforcement props); the new `GenerateDocumentationFile=true` in the shared props also applies to the Cli and Tests projects, surfacing CS1591 on `IMxGatewayCliClient` and every test class (Client.Dotnet-023); the client (and Contracts) NuGet package ships with no `<license>` metadata despite setting `PackageRequireLicenseAcceptance=false` (Client.Dotnet-024). The nuspec correctly emits the transitive `ZB.MOM.WW.MxGateway.Contracts 0.1.0` dependency, so the README "pulled in transitively" claim holds. |
| 9 | Testing coverage | No new issues — `LazyBrowseNodeTests` (7 cases incl. multi-page, concurrent-expand-one-RPC, filter forwarding), `MxGatewayClientTlsHandlerTests` / `GalaxyRepositoryClientTlsHandlerTests`, and the README-example parse tests give the new surface good coverage. |
| 10 | Documentation & comments | No new issues — README NuGet-install / lazy-browse / TLS-trust sections are accurate, cross-doc anchors (`#automatic-self-signed-certificate`, `#browsechildren`) resolve, and the new XML docs on `BrowseAsync` / `LazyBrowseNode` / `RequireCertificateValidation` are complete. (The CS1591-surfaced missing docs are tracked under Client.Dotnet-023.) |
### Client.Dotnet-018
| Field | Value |
@@ -507,3 +541,65 @@ uint timeoutMs = (uint)timeoutMsRaw;
A single shared helper (e.g. `ParseTimeoutMs(CliArguments, string, int)`) on `MxGatewayClientCli` would cover both call sites and remove the duplication.
**Resolution:** 2026-05-24 — Confirmed against source: both `ReadBulkAsync` (line 490) and `BenchReadBulkAsync` (line 715) cast `arguments.GetInt32("timeout-ms", ...)` straight to `uint`, so `--timeout-ms -1` silently wrapped to `0xFFFFFFFF` (~49.7 days). Added a single shared private helper `ParseTimeoutMs(CliArguments arguments, int defaultValue)` on `MxGatewayClientCli` that reads the int32, rejects negatives with a clear `ArgumentException` ("--timeout-ms must be a non-negative integer (use 0 for the gateway default)."), and returns the safe `(uint)`. Both call sites now route through the helper. Regression test `MxGatewayClientCliTests.RunAsync_TimeoutMs_NegativeValue_RejectsWithClearError` (xUnit `[Theory]` over `read-bulk` and `bench-read-bulk`) drives the CLI with `--timeout-ms -1` and asserts the exit code is non-zero, that stderr contains "timeout-ms", and that the "non-negative" guard text is present. Verified red against the original `(uint)arguments.GetInt32(...)` casts (the bench proceeded past the timeout parse and tripped a downstream "Queue empty" error rather than the descriptive guard message) and green after the helper landed.
### Client.Dotnet-022
| Field | Value |
|---|---|
| Severity | Medium |
| Category | mxaccessgw conventions |
| Location | `clients/dotnet/Directory.Build.props:1-21` |
| Status | Resolved |
**Description:** Client.Dotnet-012 was recorded resolved (2026-05-20, commit `a020350`) by adding `clients/dotnet/Directory.Build.props` mirroring `src/Directory.Build.props` — `TreatWarningsAsErrors=true`, `EnforceCodeStyleInBuild=true`, `AnalysisLevel=latest`, `Deterministic=true`, `LangVersion=latest`, `Nullable=enable`, `ImplicitUsings=enable` — to restore the build-quality floor that `CLAUDE.md` calls a baseline for the .NET client. That enforcement props file is **not present in the line of history that reaches HEAD**: `git merge-base --is-ancestor a020350 HEAD` is false (the 2026-05-20 review-sweep commit was dropped during the `ZB.MOM.WW` rename / history rebuild). At `42b0037` the file did not exist at all (`git show 42b0037:clients/dotnet/Directory.Build.props` fails), and at HEAD commit `523f944` introduced a **new** `clients/dotnet/Directory.Build.props` that carries only NuGet packaging metadata (Authors/Company/RepositoryUrl/Version/etc.) — none of the enforcement properties. None of the three client `.csproj` files set `TreatWarningsAsErrors` or `EnforceCodeStyleInBuild` independently (they set only `TargetFramework` and `Nullable`).
Net effect at HEAD: `dotnet build clients/dotnet/ZB.MOM.WW.MxGateway.Client.slnx` **succeeds with 10 CS1591 warnings** instead of failing. The mandated quality gate that would turn new warnings (missing docs, analyzer findings, code-style violations) into build breaks is gone for the entire client tree. This is a regression of the previously-closed Client.Dotnet-012; recorded as a fresh finding at the new commit per the re-review process.
**Recommendation:** Restore the enforcement properties in `clients/dotnet/Directory.Build.props` alongside the packaging metadata (they can coexist in the same `<Project>`), or add a sibling `clients/dotnet/Directory.Build.props` import. Re-run `dotnet build …slnx` and confirm 0 warnings / 0 errors (which will require closing Client.Dotnet-023 too, since the CS1591 warnings would otherwise become errors). Add a guard so the floor is not silently dropped again — e.g. assert the property is set in a small build test or CI check.
**Resolution:** 2026-06-15 — Confirmed at HEAD: `clients/dotnet/Directory.Build.props` carried only packaging metadata; none of the three client `.csproj` files set the enforcement properties, so `dotnet build …slnx` succeeded with 10 CS1591 warnings instead of failing. Restored the enforcement floor in `clients/dotnet/Directory.Build.props` mirroring `src/Directory.Build.props` (`LangVersion=latest`, `Nullable=enable`, `ImplicitUsings=enable`, `TreatWarningsAsErrors=true`, `AnalysisLevel=latest`, `EnforceCodeStyleInBuild=true`, `Deterministic=true`) in a second `<PropertyGroup>` alongside the existing packaging metadata. Resolved jointly with Client.Dotnet-023 (the CS1591 warnings would otherwise become errors under the restored `TreatWarningsAsErrors`). `dotnet build clients/dotnet/ZB.MOM.WW.MxGateway.Client.slnx -t:Rebuild` now reports 0 Warning(s) / 0 Error(s).
### Client.Dotnet-023
| Field | Value |
|---|---|
| Severity | Low |
| Category | Code organization & conventions |
| Location | `clients/dotnet/Directory.Build.props:17`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/IMxGatewayCliClient.cs:6`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Tests/*.cs` |
| Status | Resolved |
**Description:** The new shared `clients/dotnet/Directory.Build.props` sets `GenerateDocumentationFile=true` at the directory level, so it applies to all three projects — including `ZB.MOM.WW.MxGateway.Client.Cli` and `ZB.MOM.WW.MxGateway.Client.Tests`, which are not packable and were not previously generating an XML doc file. Turning it on surfaces 10 CS1591 "missing XML comment" warnings: `IMxGatewayCliClient` (the public CLI interface, never documented at the type level — note Client.Dotnet-013's resolution claimed a type-level summary was added, but it is absent in the history reaching HEAD for the same reason as Client.Dotnet-022) plus every public xUnit test class (`GalaxyRepositoryClientTests`, `MxGatewayClientTlsHandlerTests`, `GalaxyRepositoryClientTlsHandlerTests`, and six others). Today these are only warnings because the enforcement floor is missing (Client.Dotnet-022); once that floor is restored they become build-breaking errors.
**Recommendation:** Scope `GenerateDocumentationFile=true` to the packable library project only (move it from the shared props into `ZB.MOM.WW.MxGateway.Client.csproj`, which is the only project that ships a `.nupkg`), or keep it directory-wide but suppress CS1591 on the non-public test/CLI assemblies (`<NoWarn>$(NoWarn);CS1591</NoWarn>` in those two `.csproj` files) and add the one-line type summary to `IMxGatewayCliClient`. The first option is cleaner and avoids documenting test classes.
**Resolution:** 2026-06-15 — Confirmed via `-t:Rebuild`: the directory-wide `GenerateDocumentationFile=true` surfaced exactly 10 CS1591 warnings — `IMxGatewayCliClient` plus nine xUnit test classes (`GalaxyRepositoryClientTests`, `MxCommandReplyExtensionsTests`, `MxGatewayClientContractInfoTests`, `MxGatewayClientOptionsTests`, `MxGatewayClientTlsHandlerTests`, `GalaxyRepositoryClientTlsHandlerTests`, `MxGatewayGeneratedContractTests`, `MxStatusProxyExtensionsTests`, `MxValueExtensionsTests`); the shipped Client library itself emitted zero (its public surface was already fully documented). Took the first (cleaner) option, matching how `src/` handles this — only the packable `src/ZB.MOM.WW.MxGateway.Contracts.csproj` sets `GenerateDocumentationFile` directly. Removed `GenerateDocumentationFile=true` from the shared `clients/dotnet/Directory.Build.props` and moved it into the packable `ZB.MOM.WW.MxGateway.Client.csproj` only, so the Cli and Tests projects no longer generate doc files and CS1591 is not raised against them. No doc comments were added to test classes. With the Client.Dotnet-022 floor restored, the rebuild is clean (0 warnings / 0 errors).
### Client.Dotnet-024
| Field | Value |
|---|---|
| Severity | Low |
| Category | Code organization & conventions |
| Location | `clients/dotnet/Directory.Build.props:12`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client/ZB.MOM.WW.MxGateway.Client.csproj:19-24` |
| Status | Resolved |
**Description:** The client package sets `PackageRequireLicenseAcceptance=false` but declares **no license at all** — there is no `PackageLicenseExpression` and no `PackageLicenseFile` in `clients/dotnet/Directory.Build.props` or in the packable `.csproj`. Confirmed by packing: the emitted `ZB.MOM.WW.MxGateway.Client.0.1.0.nuspec` has no `<license>` element, so the produced package carries no license metadata and a NuGet feed renders it as "License: not specified." The sibling `ZB.MOM.WW.MxGateway.Contracts` package (the transitive dependency) has the same gap. `dotnet pack` does not warn (a missing license is allowed), so the omission is silent. Setting `PackageRequireLicenseAcceptance=false` while shipping no license is internally inconsistent — that flag exists to control acceptance of a license that should be present.
**Recommendation:** Add the intended license to `clients/dotnet/Directory.Build.props` (and to `ZB.MOM.WW.MxGateway.Contracts.csproj` for parity) — either `<PackageLicenseExpression>` with an SPDX id (e.g. a proprietary marker or the actual license) or `<PackageLicenseFile>` pointing at a committed `LICENSE`. If the package is intentionally unlicensed/internal-only, document that explicitly rather than leaving the field blank.
**Resolution:** 2026-06-15 — Confirmed via pack: the emitted nuspec had no `<license>` element. Marked the package "Proprietary" consistent with the other clients' decision (Rust `license = "Proprietary"`, Python `license = { text = "Proprietary" }` + `License :: Other/Proprietary License`). A `<PackageLicenseExpression>LicenseRef-Proprietary</PackageLicenseExpression>` was tried first but the current NuGet toolset rejects `LicenseRef-*` (NU5124), which the restored `TreatWarningsAsErrors` escalates to a pack failure — so the proprietary terms ship as a committed license file instead: added `clients/dotnet/LICENSE.txt` (proprietary/internal-use terms), set `<PackageLicenseFile>LICENSE.txt</PackageLicenseFile>` in the shared `clients/dotnet/Directory.Build.props`, and packed it at the package root via a `<None Include="..\LICENSE.txt" Pack="true" PackagePath="\" />` item in the packable `ZB.MOM.WW.MxGateway.Client.csproj`. `dotnet pack` now succeeds and the nuspec carries `<license type="file">LICENSE.txt</license>` with `LICENSE.txt` present in the `.nupkg`. Scope was limited to Client.Dotnet per the constraints — the sibling `ZB.MOM.WW.MxGateway.Contracts` package has the same gap and is NOT touched here (it is a different module; flagging it for that module's review).
### Client.Dotnet-025
| Field | Value |
|---|---|
| Severity | Low |
| Category | Concurrency & thread safety |
| Location | `clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs:38,41,54,82,94` |
| Status | Resolved |
**Description:** `LazyBrowseNode.ExpandAsync` is explicitly documented as thread-safe ("concurrent callers see exactly one fetch"), and its one-RPC dedup is correct: it double-checks `_isExpanded` under `_expandLock`. But the *readers* of the results are lock-free. `Children => _children` returns the live backing `List<LazyBrowseNode>` reference, and `IsExpanded => _isExpanded` reads the plain `bool` field — neither takes `_expandLock` nor uses `Volatile`. A thread that observes `IsExpanded == true` (or simply enumerates `Children`) concurrently with the writer thread inside `ExpandAsync` has no release/acquire barrier guaranteeing it sees the fully-populated `_children` contents that were appended under the lock. On x86/x64 the bool read and the list-reference read are atomic and the practical risk is low, but the published-state visibility is not guaranteed by the memory model, and a reader enumerating `Children` while a concurrent `ExpandAsync` is mid-append can throw `InvalidOperationException` ("collection was modified"). This is inconsistent with the type's own thread-safety claim.
**Recommendation:** Either (a) tighten the documented contract to "ExpandAsync is safe to call concurrently, but Children/IsExpanded must only be read after the awaited ExpandAsync completes (no concurrent reader/expander)", or (b) make the publication safe: write `_isExpanded` via `Volatile.Write` and read via `Volatile.Read`, and return an immutable snapshot from `Children` (e.g. assign a completed `IReadOnlyList` under the lock and expose that field) so lock-free readers never observe a partially-populated list. Option (a) is the smallest change and matches the realistic usage (UI thread expands then renders).
**Resolution:** 2026-06-15 — Confirmed against source: `Children => _children` returned the live mutable backing `List<LazyBrowseNode>` and `IsExpanded => _isExpanded` read a plain `bool`, while `ExpandAsync` appended to that same list under `_expandLock` with no release/acquire barrier to lock-free readers — so a concurrent reader could enumerate a mid-append list and throw `InvalidOperationException` ("collection was modified"). Applied option (b) (safe publication): `ExpandAsync` now accumulates children into a method-local `List<LazyBrowseNode>` and, only when fully drained across all pages, publishes it via `Volatile.Write(ref _children, children)` (release) immediately before setting the now-`volatile bool _isExpanded = true`. The `_children` field is an `IReadOnlyList<LazyBrowseNode>` read via `Volatile.Read` from the `Children` getter (acquire), so a reader that observes `IsExpanded == true` always sees the fully-populated snapshot and never enumerates a partially-built list. Updated the `ExpandAsync` `<remarks>` to document the strengthened concurrent-read guarantee. Regression test `LazyBrowseNodeTests.Expand_ConcurrentReadOfChildren_NeverTearsAndPublishesAtomically` gates the child-page RPCs (via a new `FakeGalaxyRepositoryTransport.BrowseChildrenGate` hook) to hold the expand mid-flight while a background reader spins enumerating `Children` and reading `IsExpanded`, asserting no exception escapes and that once `IsExpanded` is true the published snapshot has all five children. Verified red against the pre-fix code (the reader threw `InvalidOperationException: Collection was modified` deterministically across three runs) and green after the fix.
+83 -2
View File
@@ -4,8 +4,8 @@
|---|---|
| Module | `clients/go` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
@@ -83,6 +83,39 @@ that earlier commit.
| 9 | Testing coverage | New issue: the five new bulk SDK methods and `Client.StreamAlarms` have no unit tests in `mxgateway/` (Client.Go-024). |
| 10 | Documentation & comments | No issues found in this diff. README documents the new `StreamAlarms`/`AcknowledgeAlarm` SDK calls; `Session.ReadBulk` documents the cached-vs-snapshot semantics and `timeout=0` default; `WriteSecuredBulk` flags credential sensitivity. |
### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9`. The diff is larger than the brief suggested:
`82996aa` resolved Client.Go-022..027 (already closed). On top of that,
`fd2a0ac`/`4a19854`/`da3aa7b`/`92cc468`/`75610e3` added a `LazyBrowseNode`
lazy-hierarchy walker (`Browse`/`Expand`/`BrowseChildrenRaw`) over the new
`BrowseChildren` RPC and paginated `DiscoverHierarchy`; `c463b49`/`2eb8137`/
`9bdb899` made the TLS path lenient-by-default (accept the gateway's
self-signed cert unless `RequireCertificateValidation` or `CACertFile` is set);
`6df373a` added the release docs + `scripts/tag-go-module.ps1`. `gofmt -l .`,
`go vet ./...`, `go build ./...`, and `go test ./...` are all clean at HEAD.
Two new low/medium issues were found in the release helper and the install docs. The
lenient-TLS default is an intentional, documented project posture
(`docs/GatewayConfiguration.md` "clients are lenient" to pair with the
auto-generated self-signed cert) and the `//nolint:gosec` is correctly
justified — not a finding. The `LazyBrowseNode` concurrency model
(coalesced in-flight Expand, non-sticky failures, snapshot copies under
`RWMutex`) is sound and well-tested, including a 10-goroutine race test.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | New issue: `tag-go-module.ps1`'s clean-tree guard is order-dependent and silently permits tagging with uncommitted tracked changes when an untracked path sorts first (Client.Go-028). `DiscoverHierarchy`/`browseChildrenInner` pagination, the `child_has_children` hint mapping, and the duplicate-page-token guard are all correct. |
| 2 | mxaccessgw conventions | No issues found. `gofmt -l .` / `go vet ./...` clean; the `//nolint:gosec` on `InsecureSkipVerify` carries a narrow justified reason per the suppression convention. |
| 3 | Concurrency & thread safety | No issues found — `LazyBrowseNode.Expand` runs the RPC outside both locks, coalesces concurrent callers onto one in-flight RPC, publishes the result before `close(done)`, and leaves failures retryable; verified by `TestGalaxyBrowseExpandConcurrentCallersOnlyFireOneRpc` (`-race`-shaped). |
| 4 | Error handling & resilience | No issues found — `BrowseChildrenRaw` wraps transport failures in `*GatewayError`; both paginating loops guard against a repeated page token. |
| 5 | Security | No issues found — no committed secrets (only `"test"` / `"test-api-key"` fixtures); the lenient-TLS default is the documented project posture with an opt-in strict mode (`RequireCertificateValidation`). |
| 6 | Performance & resource management | No issues found — `DiscoverHierarchy` cancels each page's call context promptly inside the loop; `Children()` returns a defensive copy. |
| 7 | Design-document adherence | No issues found — lazy browse matches `docs/GalaxyRepository.md#browsechildren`; lenient TLS matches `docs/GatewayConfiguration.md`. |
| 8 | Code organization & conventions | No issues found — additive API (`Browse`/`BrowseChildrenOptions`/`RequireCertificateValidation`); `tlsConfigForOptions` cleanly extracted for testability. |
| 9 | Testing coverage | No issues found — new walker, pagination, dup-token, filter-forwarding, and TLS-posture paths are all covered. |
| 10 | Documentation & comments | New issue: README "Installing the Go client" recommends the `GONOSUMCHECK` env var, which was removed from the Go toolchain in 1.13 and is a no-op on Go 1.26 (Client.Go-029). |
## Findings
### Client.Go-001
@@ -625,3 +658,51 @@ The two cases the empty-line check seems to cover — (a) operator pressing Ente
**Recommendation:** Change `if line == "" { break }` to `if line == "" { continue }` (alongside the existing `len(args) == 0` continue, which is then redundant — keep one, drop the other for clarity). Update the `runBatch` doc-comment to read "only stdin EOF ends the session" and drop the "or an empty line" clause. If the interactive ergonomic is genuinely wanted, gate it on `isatty(stdin)` so the batch-from-pipe case isn't affected.
**Resolution:** 2026-05-24 — `runBatch` no longer treats a blank line as end-of-session. The `if line == "" { break }` early-exit was removed; blank or whitespace-only lines now fall through the existing `if len(args) == 0 { continue }` guard (kept as the single blank-line skip rule for clarity), so only stdin EOF ends the session. The doc-comment was updated to read "Blank lines are skipped; only stdin EOF ends the session." Regression test `TestRunBatchSkipsBlankLinesAndContinuesUntilEOF` in `cmd/mxgw-go/main_test.go` feeds `version --json\n\nversion --json\n` (a stray blank line between two commands) and asserts two EOR sentinels are emitted — pre-fix the test failed with "EOR sentinel count = 1, want 2" because the blank line broke the loop and the second command never ran; post-fix both commands run.
### Client.Go-028
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Location | `scripts/tag-go-module.ps1:42-46` |
| Status | Resolved |
**Description:** The release helper's clean-working-tree guard is order-dependent and can silently let a release tag be created on top of uncommitted tracked changes — the exact thing it advertises it prevents (the README at `clients/go/README.md` says "The script ... refuses to tag with uncommitted tracked changes"). The check is:
```powershell
$status = (git status --porcelain) -join "`n"
if ($status -and -not ($status -match '^\?\?')) {
throw "Working tree has tracked changes. Commit or stash before tagging."
}
```
`git status --porcelain` emits one line per path (`XY path`), with untracked entries prefixed `??`. The lines are joined into a single string and matched against `'^\?\?'` with PowerShell `-match`, which by default is single-line (no `(?m)` multiline flag), so `^` anchors to the start of the *whole* joined string. The guard therefore inspects only the **first** status line: if that first line is an untracked file (`??`), the `-not (... -match '^\?\?')` clause is false and the throw is skipped — even when later lines are tracked modifications (` M file.go`, `A file.go`, etc.). Because `git status --porcelain` orders entries by pathname, an untracked file whose name sorts ahead of a modified tracked file (e.g. an untracked `AAA-notes.md` alongside a modified `mxgateway/session.go`) puts the `??` line first and the tag is created from a dirty tree. This was confirmed empirically: with `"?? untracked.md\n M tracked.go"` the script allows the tag; with the tracked line first it correctly throws. The whole point of the guard — reproducible release tags that match a committed state — is defeated in this ordering.
**Recommendation:** Test each status entry individually rather than the first line of a joined blob. For example, iterate the porcelain lines and throw if any line does **not** start with `??`:
```powershell
$dirty = (git status --porcelain) | Where-Object { $_ -and ($_ -notmatch '^\?\?') }
if ($dirty) {
throw "Working tree has tracked changes. Commit or stash before tagging.`n$($dirty -join "`n")"
}
```
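(Equivalently, keep the joined string but split it back into lines and test each one: ``($status -split "`n") | ? { $_ -notmatch '^\?\?' }``. Adding the `(?m)` multiline flag to the original `-match` is not sufficient on its own, because the match would still succeed whenever *any* line starts with `??`.) Including the offending lines in the thrown message also helps the operator see what is dirty.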
**Resolution:** 2026-06-15 — Replaced the order-dependent joined-blob check in `tag-go-module.ps1` with a per-line filter (`git status --porcelain | Where-Object { $_ -and ($_ -notmatch '^\?\?') }`) that throws on any tracked change regardless of ordering, listing the offending lines. Verified under pwsh 7.5.4 that an untracked path sorting ahead of a modified tracked file is now correctly rejected, while untracked-only and clean trees are still allowed.
### Client.Go-029
| Field | Value |
|---|---|
| Severity | Low |
| Category | Documentation & comments |
| Location | `clients/go/README.md:300-303` |
| Status | Resolved |
**Description:** The "Installing the Go client" section advises, for build environments that cannot reach `gitea.dohertylan.com` directly, to "use `GONOSUMCHECK` + `GOPRIVATE` to bypass the checksum database for the internal module path." `GONOSUMCHECK` is a dead environment variable — it was removed from the Go toolchain in Go 1.13, and on the Go 1.26 toolchain this client targets (`go.mod` says `go 1.26`) setting it has no effect. The actual mechanism is `GOPRIVATE` (or the finer-grained `GONOSUMDB`, which covers only the checksum database, and `GONOPROXY`, which covers only the module proxy) — `GOPRIVATE=gitea.dohertylan.com/*` alone already both skips the checksum database and bypasses the public proxy for matching module paths, so the `GONOSUMCHECK` half of the recommendation is inert and misleading. A reader who copies the advice and finds checksum-db verification still failing has no working escape hatch from this doc.
**Recommendation:** Drop `GONOSUMCHECK` and document the current knobs: set `GOPRIVATE=gitea.dohertylan.com/*` (covers both sum-db bypass and direct VCS fetch), or, to skip only the checksum database, set `GONOSUMDB=gitea.dohertylan.com/*`; use `GOINSECURE` only when the host serves the module over plaintext HTTP. Concretely: "set `GOPRIVATE=gitea.dohertylan.com/*` (this disables both the checksum database and the public module proxy for that path); add `GOINSECURE=gitea.dohertylan.com/*` if the host serves the module over plain HTTP."
**Resolution:** 2026-06-15 — Dropped the dead `GONOSUMCHECK` advice from the "Installing the Go client" section of `clients/go/README.md`; it now documents `GOPRIVATE=gitea.dohertylan.com/*` (which bypasses both the public module proxy and checksum-database verification for that path) plus `GOINSECURE=gitea.dohertylan.com/*` for plain-HTTP hosts.
+83 -2
View File
@@ -4,8 +4,8 @@
|---|---|
| Module | `clients/java` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
@@ -77,6 +77,35 @@ Client.Java-001..031 are unchanged.
| 9 | Testing coverage | Issue found: the new `MxGatewayClient.streamAlarms` SDK method has no library-side test in `zb-mom-ww-mxgateway-client/src/test/...` — only the CLI test exercises the path via a `FakeClient.streamAlarms` override that bypasses the production `subscription.wrap(observer)` glue (Client.Java-035). |
| 10 | Documentation & comments | Issue found: README (`clients/java/README.md:182-183`) documents the new `stream-alarms` and `acknowledge-alarm` commands with `--session-id <id>` (neither command has that option) and `acknowledge-alarm --alarm-reference …` (actual flag is `--reference`) — every documented invocation fails at picocli parse time (Client.Java-032). |
### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9`. Diff against `42b0037` is eleven commits touching
`clients/java`: `d3cb311` (Client.Java-032..036 fixes — shared subscription
base + batch tokenizer), `0d6193c`/`803a207`/`b4bc2df`/`4a19854`/`b244851`/
`68f905a` (the `BrowseChildren` lazy-browse SDK surface: `GalaxyRepositoryClient.browse()`,
`browse(BrowseChildrenOptions)`, `browseChildrenRaw`, `browseChildrenInner`,
plus the `LazyBrowseNode` walker and `BrowseChildrenOptions`), `a276f46`/
`ba82afe`/`2eb8137` (lenient-by-default TLS: new `requireCertificateValidation`
option, `InsecureTrustManagerFactory` fallback, foojay toolchain resolver), and
`fe44e3c` (maven-publish wiring for the Gitea Maven feed). Generated
protobuf/gRPC Java is excluded. `gradle test` could not be run here — this macOS
host has no Java runtime (the module builds on the Windows host per project
memory); findings below are from source inspection. Prior findings
Client.Java-001..036 are unchanged.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | No issues found in this diff. `LazyBrowseNode.expand()` leader/coalesce logic is correct (single in-flight future, slot cleared on failure for retry); `browseChildrenInner` pagination handles the empty/null next-page token and guards against repeated page tokens; the `child_has_children` parallel array is bounds-checked (`i < getChildHasChildrenCount()`), defaulting absent hints to false. |
| 2 | mxaccessgw conventions | No issues found. No MXAccess COM, no synthesized events, generated code untouched. The lenient-TLS default is a documented repo-wide design decision (`docs/DesignDecisions.md` "TLS Auto-Certificate and Lenient Client Trust"), not a Java-specific deviation. |
| 3 | Concurrency & thread safety | No issues found. `LazyBrowseNode` does not hold the `expandLock` monitor across the BrowseChildren RPC (fixed in `68f905a`); readers use a separate `ReentrantReadWriteLock` so `getChildren()`/`isExpanded()` never block on the in-flight RPC; `BrowseChildrenOptions` is immutable. The shared `MxGatewayStreamSubscription` base (Client.Java-036) is covered. |
| 4 | Error handling & resilience | No issues found. `browseChildrenRaw` normalises non-`MxGatewayException` gRPC errors via `MxGatewayErrors.fromGrpc`; the non-leader `expand()` path rethrows the leader's `MxGatewayException`/`RuntimeException` and restores the interrupt flag on `InterruptedException`. |
| 5 | Security | No issues found. maven-publish credentials come from `GITEA_USERNAME`/`GITEA_TOKEN` env vars with empty-string fallback — no committed secrets. The lenient-TLS `InsecureTrustManagerFactory` default is the documented, intentional design for this PKI-less internal tool; strict verification is reachable via `caCertificatePath` (pin) or `requireCertificateValidation(true)`, both tested in `MxGatewayClientTlsTests`. |
| 6 | Performance & resource management | No issues found. |
| 7 | Design-document adherence | No issues found. The browse surface matches `docs/GalaxyRepository.md#browsechildren` (cache-served lazy expand, `has_children` hint, repeated-page-token → error); the TLS posture matches `docs/GatewayConfiguration.md` and `JavaClientDesign.md`. |
| 8 | Code organization & conventions | Issue found: the new `requireCertificateValidation` library option is not exposed or propagated by the CLI `CommonOptions.toClientOptions()`, so CLI users cannot opt into JVM-trust-store verification — same additive-surface gap pattern as the resolved Client.Java-025 (Client.Java-038). |
| 9 | Testing coverage | No issues found. The browse surface has thorough library tests in `GalaxyRepositoryClientTests` (roots, expand-populates, idempotent-single-RPC, unknown-parent not-found, multi-page gather, concurrent-callers-one-RPC, filter forwarding, repeated-page-token rejection); TLS lenient/strict paths are covered by `MxGatewayClientTlsTests` against a real in-process TLS server. |
| 10 | Documentation & comments | Issue found: the README "Browsing lazily" first code snippet calls `galaxy.browseChildren(BrowseChildrenRequest…)`, but no such method exists on `GalaxyRepositoryClient` — the raw single-RPC method is `browseChildrenRaw(BrowseChildrenRequest)`; the documented snippet does not compile (Client.Java-037). |
## Findings
### Client.Java-001
@@ -662,4 +691,56 @@ This is the same maintenance-hazard pattern Client.Java-009 / Client.Java-016 id
**Resolution:** 2026-05-24 — Extracted a package-private abstract base `MxGatewayStreamSubscription<TRequest, TResponse> implements AutoCloseable` (new file `clients/java/zb-mom-ww-mxgateway-client/src/main/java/com/zb/mom/ww/mxgateway/client/MxGatewayStreamSubscription.java`). It holds the shared `AtomicReference<ClientCallStreamObserver<TRequest>>` and `AtomicBoolean cancelled` pair, the `wrap(StreamObserver<TResponse>)` factory that returns a `ClientResponseObserver` with the Client.Java-014 close-before-beforeStart fix baked in, the `cancel()` / `close()` implementation, and an immutable `cancelMessage` injected by the subclass constructor. The four prior 60-line near-clones (`MxGatewayEventSubscription`, `MxGatewayAlarmFeedSubscription`, `MxGatewayActiveAlarmsSubscription`, `DeployEventSubscription`) collapse to ~10-line subclasses that only declare their `<Request, Response>` type parameters and supply the cancel-message string to `super(...)`. Public API surface is preserved: each subclass remains a `public final class` with a public no-arg constructor (the constructor was implicit on the original classes; I made it explicit `public` on the subclasses so the existing CLI `FakeClient.streamAlarms` in a different package can still `new MxGatewayAlarmFeedSubscription()`). The `wrap(...)` method is `final` and package-private on the base — same accessibility the four subclasses had before — so production callers in `MxGatewayClient`/`GalaxyRepositoryClient` see no change. 
New test file `MxGatewayStreamSubscriptionContractTests` exercises the lifecycle/cancellation contract identically across all four subclasses (16 tests, four per scenario): (a) cancel-before-beforeStart eagerly cancels the stream once it attaches with the subclass-specific message, (b) cancel-after-beforeStart forwards directly to the stream, (c) `close()` delegates to `cancel()`, (d) the wrapped observer forwards `onNext`/`onError`/`onCompleted` verbatim, and a compile-time `typeBoundsCheck` helper that asserts each subclass still binds its `<Req, Resp>` parameters to the right proto types. TDD red phase confirmed: temporarily breaking one subclass's `super(...)` message to `"BROKEN MESSAGE"` made the contract test for that subclass fail with `expected: <client cancelled alarm feed> but was: <BROKEN MESSAGE>`; restoring the correct value turned all 16 contract tests green. Future fixes to the shared lifecycle now live in one place — the next Client.Java-014/021-style race fix cannot drift across the four classes.
### Client.Java-037
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Documentation & comments |
| Location | `clients/java/README.md:138-149` |
| Status | Resolved |
**Description:** The "Browsing lazily" section's first (low-level) code snippet documents a `browseChildren` method that does not exist on the public client surface:
```java
BrowseChildrenReply reply = galaxy.browseChildren(
BrowseChildrenRequest.newBuilder().build());
```
`GalaxyRepositoryClient` exposes only `browse()`, `browse(BrowseChildrenOptions)`, and the raw single-RPC method `browseChildrenRaw(BrowseChildrenRequest)` (verified at `GalaxyRepositoryClient.java:227,238,251`). There is no `browseChildren(BrowseChildrenRequest)`, so the documented snippet fails to compile — a user copy-pasting the primary low-level example hits a missing-symbol error immediately. The README hedges the snippet with "This snippet documents the API as it appears once the Java client is regenerated on the Windows host," but the discrepancy is not a regeneration artifact: the hand-written wrapper method is named `browseChildrenRaw`, not `browseChildren`. The adjacent "High-level walker" snippet (`galaxy.browse()`, `root.expand()`, `root.getChildren()`, `child.hasChildrenHint()`, `child.getObject().getTagName()`) is correct against the actual API; only the low-level snippet is wrong.
**Recommendation:** Change `galaxy.browseChildren(` to `galaxy.browseChildrenRaw(` in the low-level snippet so it matches the real method name, or replace the low-level example with the `browse()`/`LazyBrowseNode` walker that the SDK actually intends as the primary surface. Drop the "as it appears once regenerated" caveat once the snippet compiles against the current source. Consider an `installDist`-based or compile-checked doc snippet test to prevent README API drift, mirroring the parse-only assertions added for Client.Java-032.
**Resolution:** 2026-06-15 — Confirmed against source: `GalaxyRepositoryClient` (`zb-mom-ww-mxgateway-client/.../GalaxyRepositoryClient.java:227,238,251`) exposes only `browse()`, `browse(BrowseChildrenOptions)`, and the raw single-RPC `browseChildrenRaw(BrowseChildrenRequest)` — there is no `browseChildren(BrowseChildrenRequest)`, so the documented snippet did not compile. Fixed the README "Browsing lazily" low-level snippet at `clients/java/README.md` by renaming `galaxy.browseChildren(` to `galaxy.browseChildrenRaw(`; the surrounding accessors (`BrowseChildrenReply`/`BrowseChildrenRequest` types, `getChildrenList()`, `getChildHasChildrenList()`, `getTagName()`) are all valid proto accessors and were left unchanged. Replaced the misleading "as it appears once the Java client is regenerated on the Windows host" caveat (the discrepancy was a hand-written wrapper name, not a codegen artifact) with prose steering callers to the high-level `browse()`/`LazyBrowseNode` walker as the preferred surface and `browseChildrenRaw` as the direct-paging escape hatch. Documentation-only change; no test added (no compile-checked doc-snippet harness exists yet — left as the noted future enhancement).
### Client.Java-038
| Field | Value |
|---|---|
| Severity | Low |
| Category | Code organization & conventions |
| Location | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1347-1393` |
| Status | Resolved |
**Description:** Commit `a276f46` added `requireCertificateValidation` to `MxGatewayClientOptions` as a first-class TLS-trust toggle (lenient-by-default; set `true` to verify against the JVM trust store without pinning a CA). The CLI `CommonOptions` exposes `--plaintext`, `--ca-file`, and `--server-name-override` and propagates them through `toClientOptions()`, but it neither declares a `--require-certificate-validation` option nor sets `builder.requireCertificateValidation(...)`. CLI users therefore have no way to request strict verification short of supplying a pinned CA via `--ca-file`; the lenient `InsecureTrustManagerFactory` default is forced on every non-pinned TLS CLI connection. This is the same additive-surface gap pattern as the resolved Client.Java-025 (`shutdownTimeout` not propagated to the CLI). `docs/CrossLanguageSmokeMatrix.md` documents `--require-certificate-validation` for the Rust CLI's pin-only stack but not Java, so this is not a direct README contradiction; it is a library-vs-CLI surface inconsistency. Severity is Low because the secure-by-pinning path (`--ca-file`) remains available and the lenient default is the documented intended behaviour for this internal tool.
**Recommendation:** Add a `--require-certificate-validation` boolean option to `CommonOptions` (default unset/false to preserve the lenient default) and propagate it into `toClientOptions()` via `builder.requireCertificateValidation(value)`. Include the resolved value in `redactedJsonMap()` so `--json` output reflects the effective trust posture. Add a CLI parse-only assertion exercising the flag to keep the CLI surface tracking the library surface.
**Resolution:** 2026-06-15 — Confirmed against source: `MxGatewayClientOptions` (`zb-mom-ww-mxgateway-client/.../MxGatewayClientOptions.java:108,260`) exposes `requireCertificateValidation()` and a `Builder.requireCertificateValidation(boolean)`, but the CLI `CommonOptions` in `MxGatewayCli.java` declared no flag and `toClientOptions()` never set it, forcing the lenient default on every non-pinned TLS CLI connection. Added a bare-boolean `@Option(names = "--require-certificate-validation")` field to `CommonOptions` (defaults to `false`, preserving the lenient default; mirrors the existing `--plaintext` flag-style option), propagated it through `toClientOptions()` via `.requireCertificateValidation(requireCertificateValidation)`, and added it to `redactedJsonMap()` so `--json` output reflects the effective trust posture. Documented the new flag and the lenient-by-default trust posture in `clients/java/README.md`. Note: the Client.Java-025 precedent (`shutdownTimeout`) was applied to the pre-rename `mxgateway-cli` module and is not present in this renamed `zb-mom-ww-mxgateway-cli` `toClientOptions()`; I mirrored the live `--ca-file`/`--server-name-override` TLS-option plumbing pattern instead, which is the correct precedent here. Regression tests in `MxGatewayCliTests`: `requireCertificateValidationFlagPropagatesThroughToClientOptions` (drives `acknowledge-alarm --require-certificate-validation` through a new `CapturingClientFactory` that records `options.toClientOptions()` and asserts `MxGatewayClientOptions.requireCertificateValidation()` is `true`) and `requireCertificateValidationDefaultsToLenientWhenFlagAbsent` (asserts the flag defaults to `false`). The capturing factory exercises the real `toClientOptions()` propagation, stronger than a parse-only check.
### Client.Java-039
| Field | Value |
|---|---|
| Severity | High |
| Category | Correctness & logic bugs |
| Location | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1699` (origin: `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto`, `AlarmFeedMessage.payload` provider-status arm added in commit `1d85db7`) |
| Status | Resolved |
**Description:** The Java CLI does not compile at HEAD `410acc9`. `formatAlarmFeedMessage` switches over `message.getPayloadCase()` as an exhaustive switch *expression* with no `default`, covering only `ACTIVE_ALARM`, `SNAPSHOT_COMPLETE`, `TRANSITION`, and `PAYLOAD_NOT_SET`. The alarm-provider-fallback contract change `1d85db7` added a fourth `AlarmFeedMessage.payload` oneof arm (`AlarmProviderStatus provider_status`), so the generated `PayloadCase` enum now has a `PROVIDER_STATUS` value the switch does not handle — `javac` rejects it with "the switch expression does not cover all possible input values" and `gradle :zb-mom-ww-mxgateway-cli:compileJava` fails. This is the same class of cross-component contract-propagation break as Client.Rust-030 and IntegrationTests-026: a new contract field that left a downstream exhaustive consumer uncompilable. The original re-review (Client.Java-037/038) missed it because there is no JVM on the macOS review host and `gradle` could not be run; the break surfaced when the fixes were verified on the Windows host. Because the CLI is the cross-language e2e driver, the whole Java client artifact set cannot build and no Java e2e smoke can run.
**Recommendation:** Add a `PROVIDER_STATUS` arm to `formatAlarmFeedMessage` that renders the provider status (mode / degraded / reason) consistently with the other alarm-feed arms — do not add a `default ->` that silently drops it, since the provider status is meaningful and the exhaustive switch is the compiler-enforced guard that catches exactly this kind of future contract drift.
**Resolution:** 2026-06-15 — Confirmed via `gradle :zb-mom-ww-mxgateway-cli:compileJava` failing with "the switch expression does not cover all possible input values" at `MxGatewayCli.java:1699` on the Windows host. Added a `case PROVIDER_STATUS ->` arm to `formatAlarmFeedMessage` yielding `provider-status mode=%s degraded=%b reason=%s` (from `AlarmProviderStatus.getMode().name()` / `getDegraded()` / `getReason()`), plus the `import mxaccess_gateway.v1.MxaccessGateway.AlarmProviderStatus;`. No `default` arm — the exhaustive switch expression remains the compile-time guard against future `payload` oneof additions. Verified `gradle test` builds and passes on the Windows host (Java 21).
+270 -3
View File
@@ -4,16 +4,48 @@
|---|---|
| Module | `clients/python` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
## Checklist coverage
### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9`. The diff against the previous review base
`42b0037` covers: PyPI metadata + Gitea PyPI-feed install instructions in
`pyproject.toml` / `README.md`; a new lazy Galaxy browse surface
(`GalaxyRepositoryClient.browse_children_raw` / `browse` / `_iter_browse_children`,
the `LazyBrowseNode` walker, and `BrowseChildrenOptions`); a TLS
trust-on-first-use (TOFU) default in `options.py` gated by a new
`ClientOptions.require_certificate_validation` flag; the `_use_plaintext`
TLS-default contract carried forward; and the `batch` `CliRunner`-removal
follow-through. The new browse / TOFU surface is well tested
(`tests/test_galaxy.py`, `tests/test_auth_options.py`, `tests/test_tls.py`).
`python -m pytest` passes (80 passed, 1 skipped — the loopback-TLS test is
opt-in via `MXGATEWAY_RUN_TLS_TESTS=1`). `python -m pip wheel .` builds the
wheel cleanly against the installed setuptools 82.0.1.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | Issue found: `_split_authority` raises an uncaught `ValueError` for a port-less endpoint instead of a transport error (Client.Python-029). |
| 2 | mxaccessgw conventions | No new issues found — secrets still redacted, generated code untouched, no committed tokens in the new Gitea feed URLs (placeholders only). |
| 3 | Concurrency & thread safety | No new issues found — `LazyBrowseNode.expand` uses a per-node `asyncio.Lock` with a double-checked guard and is verified concurrent-safe by `test_browse_expand_concurrent_callers_only_fire_one_rpc`. |
| 4 | Error handling & resilience | Issue found: the TOFU branch calls the blocking `ssl.get_server_certificate` with no timeout from inside the `async def connect` path, blocking the event loop and hanging indefinitely on a black-holed host (Client.Python-028). |
| 5 | Security | Issue found: the new `require_certificate_validation` security flag is not reachable through the documented `connect(...)` convenience kwargs or any CLI flag, so callers using those paths are locked into TOFU and cannot force certificate validation (Client.Python-027). TOFU itself is design-sanctioned (`docs/GatewayConfiguration.md` line 470). |
| 6 | Performance & resource management | No new issues found beyond the blocking TLS probe captured in Client.Python-028. |
| 7 | Design-document adherence | No new issues found — TOFU default, `require_certificate_validation` naming, and the BrowseChildren surface match `docs/GatewayConfiguration.md` / `docs/GalaxyRepository.md`; both README doc anchors resolve. |
| 8 | Code organization & conventions | Issue found: `pyproject.toml` uses the PEP 639-deprecated `license = { text = ... }` table form (Client.Python-030). pyproject metadata is otherwise correct and the wheel builds. |
| 9 | Testing coverage | Issue found: the `tls` pytest mark used by `tests/test_tls.py` is not registered in `[tool.pytest.ini_options]`, emitting a `PytestUnknownMarkWarning` (Client.Python-031). New browse / TOFU paths are otherwise well covered. |
| 10 | Documentation & comments | No new issues found — README TLS/browse/Gitea-feed prose matches the code; the alarm-CLI README examples corrected under Client.Python-022 remain correct. |
### Prior coverage (commit a020350)
A re-review at commit `a020350` over the same module. Prior findings
(Client.Python-001 — Client.Python-017) remain closed and are kept as
history. This section reflects categories evaluated in this pass.
history. This section reflects categories evaluated in that pass.
| # | Category | Result |
|---|---|---|
@@ -1171,3 +1203,238 @@ scope; `test_commands_module_bench_read_bulk_does_not_use_bare_except_pass`
greps the function source for the `except Exception:\n pass` pattern
and rejects it. Both tests failed against the pre-fix source and pass
against the fix.
### Client.Python-027
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Security |
| Location | `clients/python/src/zb_mom_ww_mxgateway/client.py:36-54`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:47-66`, `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:165-172,918-930` |
| Status | Resolved |
**Description:** This commit adds `ClientOptions.require_certificate_validation`
(default `False`) so a caller can force system-trust certificate verification
instead of the new lenient trust-on-first-use (TOFU) default. The flag is
honoured inside `create_channel`, but it is not surfaced through either of the
two documented ways a normal caller dials the gateway:
1. `GatewayClient.connect(...)` and `GalaxyRepositoryClient.connect(...)` accept
the convenience kwargs `endpoint` / `api_key` / `plaintext` / `ca_file` /
`server_name_override` and build the `ClientOptions` internally, but do **not**
accept or forward `require_certificate_validation`. The README's high-level
examples (e.g. the lazy-browse walker) use exactly this kwarg form
(`GalaxyRepositoryClient.connect(endpoint=..., api_key=..., plaintext=True)`),
so the kwarg path is the primary documented entry point.
2. The CLI exposes `--plaintext`, `--tls`, and `--ca-file` but no
`--require-certificate-validation` flag, and `_connect` constructs
`ClientOptions(...)` without setting the field. A CLI user connecting to a
TLS gateway is therefore locked into TOFU.
The net effect is that the *only* way to opt into real certificate validation is
to construct a `ClientOptions` instance directly and pass it as the positional
`options=` argument — a path neither the README nor the CLI documents. A
security-sensitive deployment that wants the strict (verify-against-system-trust)
posture cannot select it through the documented surface, so it silently stays on
TOFU. TOFU itself is design-sanctioned (`docs/GatewayConfiguration.md` line 470
explicitly says "Python uses trust-on-first-use"), so this is an opt-in-to-strict
reachability gap rather than an insecure default — hence Medium with a workaround.
**Recommendation:** Add a `require_certificate_validation: bool = False` kwarg to
both `GatewayClient.connect` and `GalaxyRepositoryClient.connect` and forward it
into the constructed `ClientOptions`. Add a `--require-certificate-validation`
(or `--verify-tls`) flag to the shared CLI option set and wire it through
`_connect`. Add a test asserting the flag flows through to
`ClientOptions.require_certificate_validation` and a README note documenting how
to select the strict posture.
**Resolution:** 2026-06-15 — Confirmed: `connect` built `ClientOptions` from a
fixed kwarg set that omitted `require_certificate_validation`, and the CLI had no
flag, so the strict posture was only reachable via a hand-built `options=`. Added
a `require_certificate_validation: bool = False` kwarg to both
`GatewayClient.connect` and `GalaxyRepositoryClient.connect` (forwarded into the
constructed `ClientOptions`), a `--require-certificate-validation` flag to the
shared `gateway_options` CLI option set, and wired it through `_connect`. README
TLS section now documents that the strict posture is reachable via the connect kwarg,
the options struct, and the CLI flag. Tests:
`tests/test_client_session.py::test_gateway_connect_forwards_require_certificate_validation`,
`::test_galaxy_connect_forwards_require_certificate_validation`,
`tests/test_cli.py::test_require_certificate_validation_flag_flows_through_connect`,
`::test_require_certificate_validation_defaults_off` — all failed before the fix
and pass after.
### Client.Python-028
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Error handling & resilience |
| Location | `clients/python/src/zb_mom_ww_mxgateway/options.py:120-130`, `clients/python/src/zb_mom_ww_mxgateway/client.py:59`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:71` |
| Status | Resolved |
**Description:** The TOFU branch of `create_channel` calls
`ssl.get_server_certificate((host, port))` to pre-fetch the server certificate.
`create_channel` is a synchronous function, but it is invoked exclusively from
inside the `async def connect` classmethods of `GatewayClient` and
`GalaxyRepositoryClient` (`client.py:59`, `galaxy.py:71`). `ssl.get_server_certificate`
opens a real blocking TCP+TLS socket on the calling thread, so:
1. It **blocks the asyncio event loop** for the full duration of the connect/handshake.
This is at odds with the rest of the client, which is fully `async`.
2. It passes **no `timeout`** to `ssl.get_server_certificate`. The `test_tofu_connect_failure_raises_transport_error`
test only proves the *connection-refused* case (a closed port returns fast).
A black-holed / firewall-drop host (packets silently dropped) makes the
underlying `socket.create_connection` hang on the OS default connect timeout,
which can be minutes, with the event loop frozen the whole time. A caller that
wrapped `connect` in `asyncio.wait_for(...)` cannot cancel it because the block
is in synchronous C, not at an `await` point.
The other TLS branches (`ca_file`, `require_certificate_validation`) build the
channel lazily and return immediately, so only the lenient default — the most
common path — has this hazard.
**Recommendation:** Pass an explicit `timeout=` to `ssl.get_server_certificate`
(it accepts one), bounded by `options.call_timeout` or a short fixed value, so a
black-holed host fails fast as a `MxGatewayTransportError` instead of hanging.
Better, run the synchronous probe off the event loop by making the TOFU pre-fetch
path awaitable (e.g. wrap it in `asyncio.get_running_loop().run_in_executor(...)`
from an `async` channel factory). Failing that, at minimum document that `connect`
must not be called from a running loop. Add a regression test that asserts the
probe honours a timeout.
**Resolution:** 2026-06-15 — Confirmed: the TOFU branch called
`ssl.get_server_certificate((host, port))` with no timeout from the synchronous
`create_channel`, which both `connect` classmethods invoked directly on the event
loop. Fix is two-part: (1) `create_channel` now passes
`timeout=options.call_timeout` (falling back to a fixed
`_TOFU_PROBE_TIMEOUT_SECONDS = 10.0` when no call_timeout is set) to
`ssl.get_server_certificate`, and the existing `except OSError` wraps a
timeout/connect failure into `MxGatewayTransportError` (TimeoutError/socket.timeout
are OSError subclasses); (2) both `GatewayClient.connect` and
`GalaxyRepositoryClient.connect` now run the blocking factory off the loop via
`await asyncio.to_thread(create_channel, resolved)`, so the event loop is never
frozen and a caller's `asyncio.wait_for` can cancel the connect. Tests:
`tests/test_auth_options.py::test_tofu_probe_passes_a_bounded_timeout`,
`::test_tofu_probe_timeout_raises_transport_error` (parametrized over
socket.timeout / TimeoutError / OSError), and
`tests/test_client_session.py::test_gateway_connect_runs_create_channel_off_the_event_loop`,
`::test_galaxy_connect_runs_create_channel_off_the_event_loop`. The timeout and
off-loop tests failed before the fix and pass after.
### Client.Python-029
| Field | Value |
|---|---|
| Severity | Low |
| Category | Correctness & logic bugs |
| Location | `clients/python/src/zb_mom_ww_mxgateway/options.py:78-90` |
| Status | Resolved |
**Description:** `_split_authority` parses a non-bracketed target with
`host, _, port = target.rpartition(":")` and returns
`(host or "localhost", int(port) if port else 443)`. For a port-less endpoint
such as `"mygateway"`, `rpartition(":")` returns `("", "", "mygateway")`, so
`host` becomes `""` (→ `"localhost"`) and `port` becomes `"mygateway"`, and
`int("mygateway")` raises an uncaught `ValueError: invalid literal for int()`.
Because `_split_authority` is called *before* the `try/except OSError` guard in
`create_channel`, the failure escapes as a raw `ValueError` rather than the
intended `MxGatewayTransportError`, and the message does not name the endpoint.
Verified at runtime:
`_split_authority("mygateway")` → `ValueError: invalid literal for int() with base 10: 'mygateway'`.
gRPC targets normally carry an explicit port (`host:port`), so impact is narrow,
but a typo or a bare-hostname endpoint produces a confusing crash on the TOFU
default path. The bracketed-IPv6 and `host:port` cases are covered by tests; the
port-less case is not.
**Recommendation:** Treat a non-numeric / missing port as the default (443) and
keep the whole string as the host, e.g. detect a trailing `:<digits>` explicitly
rather than assuming the `rpartition` tail is numeric, or wrap the `int(port)`
conversion so a non-numeric tail falls back to host-only with the default port.
Add a `_split_authority("mygateway")` case to `tests/test_tls.py`.
**Resolution:** 2026-06-15 — Confirmed: `_split_authority("mygateway")` raised
`ValueError: invalid literal for int() with base 10: 'mygateway'` because
`rpartition(":")` put the whole string in the port slot. Rewrote the
non-bracketed branch to inspect the `rpartition` separator and the tail: no colon
→ whole target is the host with default port 443; a colon with a non-digit/empty
tail → left side is the host with default port 443; a digit tail → parse the
port. The bare-hostname case now returns `("mygateway", 443)` instead of raising,
and the existing `":5120"` / `"localhost:5120"` / IPv6 cases are unchanged. Test:
`tests/test_tls.py::test_split_authority_defaults_port_for_portless_endpoint`
(covers `"mygateway"`, `"https://mygateway"`, and `"mygateway:"`) — failed before
the fix and passes after.
### Client.Python-030
| Field | Value |
|---|---|
| Severity | Low |
| Category | Code organization & conventions |
| Location | `clients/python/pyproject.toml:17` |
| Status | Resolved |
**Description:** This commit re-adds a `license` key to `pyproject.toml` as the
table form `license = { text = "Proprietary" }`. Under PEP 639 (active in the
installed setuptools 82.0.1), the `[project.license]` **table** forms (`text` and
`file`) are deprecated in favour of the SPDX string expression, and a future
setuptools major may reject them — the same class of regression that
Client.Python-018 (the earlier `license = "Proprietary"` string, rejected because
`Proprietary` is not a valid SPDX identifier) recorded for this exact field. The
build currently succeeds (verified: `python -m pip wheel .` produces
`zb_mom_ww_mxaccess_gateway_client-0.1.0-py3-none-any.whl` and the metadata
carries `License: Proprietary` plus the `License :: Other/Proprietary License`
classifier), so this is a forward-looking maintainability flag, not a present
breakage. Note that pairing a `license` table with a `License ::` trove
classifier is also flagged by PyPI/twine as redundant under the new metadata
rules.
**Recommendation:** Prefer the PEP 639 SPDX-string form with a `LicenseRef-*`
custom identifier for an unlisted licence (`license = "LicenseRef-Proprietary"`)
— this is the future-proof equivalent of the intent and avoids the deprecated
table form — or drop the `license` key entirely and rely on the existing
`License :: Other/Proprietary License` classifier (the Client.Python-018
resolution chose this). The `tests/test_packaging.py::test_pip_wheel_build_succeeds`
guard (added under Client.Python-020) will catch the day a setuptools upgrade
turns the deprecation into a hard error.
**Resolution:** 2026-06-15 — Switched the deprecated `license = { text =
"Proprietary" }` table form to the PEP 639 SPDX-string form
`license = "LicenseRef-Proprietary"` (the future-proof custom identifier for an
unlisted/proprietary licence). Also removed the now-redundant
`License :: Other/Proprietary License` trove classifier, which setuptools >= 77
flags as conflicting when a `License-Expression` is present. The built wheel
metadata now carries `License-Expression: LicenseRef-Proprietary` and no
`Classifier: License ::` line. Verified by `python -m pip wheel . --no-deps`,
which builds cleanly; the existing
`tests/test_packaging.py::test_pip_wheel_build_succeeds` guard exercises the same
build and passes.
### Client.Python-031
| Field | Value |
|---|---|
| Severity | Low |
| Category | Testing coverage |
| Location | `clients/python/tests/test_tls.py:34`, `clients/python/pyproject.toml:53-56` |
| Status | Resolved |
**Description:** `tests/test_tls.py` applies a module-level
`pytestmark = pytest.mark.tls`, but the `tls` marker is not registered in
`[tool.pytest.ini_options]` (which declares only `addopts`, `pythonpath`, and
`testpaths`). Every run emits a `PytestUnknownMarkWarning: Unknown
pytest.mark.tls - is this a typo?`. The warning is benign today, but (a) it is
exactly the kind of typo the warning exists to catch, so a future genuine
mistyped marker would be lost in the noise, and (b) if the suite ever adopts
`filterwarnings = ["error"]` (a common hardening step), the unregistered marker
would turn into a hard collection failure.
**Recommendation:** Register the marker, e.g.
`markers = ["tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1"]`
under `[tool.pytest.ini_options]` in `clients/python/pyproject.toml`.
**Resolution:** 2026-06-15 — Registered the `tls` marker by adding
`markers = ["tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1"]`
under `[tool.pytest.ini_options]` in `clients/python/pyproject.toml`.
`python -m pytest` now reports no `PytestUnknownMarkWarning` (full run: 91
passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked
`tests/test_tls.py` module is the guard — its run is now warning-free.
+77 -2
View File
@@ -4,8 +4,8 @@
|---|---|
| Module | `clients/rust` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
@@ -96,6 +96,25 @@ under review does not address them.
| 9 | Testing coverage | Issue found: zero tests cover `stream_alarms` on `GatewayClient`, the new bulk read/write SDK methods, or the `BenchReadBulk` flow; the fake gateway's `stream_alarms` impl drops the sender immediately (Client.Rust-024). |
| 10 | Documentation & comments | Issue found: `.cargo/config.toml`'s comment promises "Release builds are unaffected" but the `link-arg=/STACK:8388608` setting is unconditional under `cfg(windows)` and only applies to the MSVC linker (Client.Rust-027). |
### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9`. The diff against `42b0037` (`git diff 42b0037..HEAD -- clients/rust/`) covers: Cargo metadata + Gitea alternative-registry config (`Cargo.toml`, `.cargo/config.toml`, README install section); a `[registries.dohertj2-gitea]` index entry and `publish = ["dohertj2-gitea"]` with `mxgw-cli` set `publish = false`; the resolution work for Client.Rust-022..029 (malformed-reply `Result` plumbing, `next_correlation_id` re-export, clippy fixes, `read_bulk<S: AsRef<str>>`); a **new** Galaxy lazy-browse walker (`browse`, `browse_children_raw`, `browse_children_inner`, `BrowseChildrenOptions`, `LazyBrowseNode`) with six unit tests; a **new** TLS pin-only guard (`build_tls_config` + `ClientOptions::with_require_certificate_validation` + `--require-certificate-validation` CLI flag) with a new `tests/tls.rs`; and the alarm-provider-fallback proto surface (`AlarmFeedMessage.provider_status`, added contracts-side in `1d85db7`).
`cargo fmt --check` is clean. `cargo check -p zb-mom-ww-mxgateway-client`, `cargo test -p zb-mom-ww-mxgateway-client` (24 lib + integration, 4 proto-fixture, 4 tls — all pass), and the library half of the workspace are clean. **`cargo clippy --workspace --all-targets -- -D warnings` and `cargo check --workspace` both FAIL at HEAD** — not on a lint but on a hard `E0004` compile error: the `mxgw-cli` binary's two `match &message.payload` blocks (`crates/mxgw-cli/src/main.rs:1731,1757`) are non-exhaustive after the proto added `AlarmFeedMessage.payload::ProviderStatus` (Client.Rust-030). The library crate compiles and all its tests pass; the break is confined to the CLI binary. No committed registry tokens — `.cargo/config.toml` carries only the sparse index URL; the README documents the token living in `~/.cargo/credentials.toml`.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | Issue found: `mxgw-cli` fails to compile at HEAD — non-exhaustive `AlarmFeedMessage.payload` match missing the new `ProviderStatus` arm (Client.Rust-030). The library `read_bulk`/galaxy-walker/TLS-guard logic is correct and tested. |
| 2 | mxaccessgw conventions | Issue found: `cargo clippy --workspace --all-targets -- -D warnings` / `cargo check --workspace` do not pass — CLAUDE.md mandates they do (Client.Rust-030). The prior 029 clippy regressions are resolved; this is a new build break from the alarm-provider proto change. |
| 3 | Concurrency & thread safety | No issues found — `LazyBrowseNode` shares state via `Arc<…AsyncMutex<…>>`; `expand()` holds the mutex across the `browse_children_inner` await so concurrent expanders serialize and the idempotency check is race-free. `CORRELATION_SEQUENCE` is still `AtomicU64`/`Relaxed`. No `unsafe`. |
| 4 | Error handling & resilience | Issue found: the strict TLS path (`require_certificate_validation(true)` with no CA) builds a `ClientTlsConfig` with zero trust roots (no `tls-native-roots`/`tls-webpki-roots` feature, no `.with_*_roots()` call), so it cannot validate any certificate — contradicting the documented "verify against the system trust roots" behaviour (Client.Rust-031). The galaxy page-token loop has a correct repeated-token guard. |
| 5 | Security | No issues found in the registry/secret surface — `.cargo/config.toml` holds only the sparse index URL, no token; README puts the Bearer token in `~/.cargo/credentials.toml` (uncommitted). (See Client.Rust-031 for the strict-TLS validation gap, classified under error handling.) |
| 6 | Performance & resource management | No issues found — `read_bulk` is now borrow-based (`&[S]`), the bench loop reuses `tags_ref` (Client.Rust-026 resolved). The walker clones the `GalaxyClient` channel handle per node, which is the intended cheap `Channel` clone. |
| 7 | Design-document adherence | Issue found: `RustClientDesign.md` is not updated for the new Galaxy lazy-browse SDK surface (`browse` / `browse_children_raw` / `LazyBrowseNode` / `BrowseChildrenOptions`); CLAUDE.md requires docs to change with the source (Client.Rust-032). The TLS pin-only section pre-dates this diff but repeats the inaccurate "system trust roots" claim (cross-referenced from Client.Rust-031). |
| 8 | Code organization & conventions | No issues found — Cargo metadata (name/version/license/repository/keywords/categories) is well-formed; `publish = ["dohertj2-gitea"]` on the library and `publish = false` on `mxgw-cli` is the right split. `license = "Proprietary"` is non-SPDX but cargo accepts it and it is a deliberate closed-source marker. |
| 9 | Testing coverage | No issues found in the new surface — the walker has six unit tests (roots, expand, idempotency, NotFound, multi-page, filter-forwarding) and TLS has four. Gap noted: `tls_with_require_certificate_validation_does_not_short_circuit` connects to a dead address, so it only asserts the guard does not fire and never exercises a real handshake — which is why the no-trust-roots defect in Client.Rust-031 is not caught by a test. |
| 10 | Documentation & comments | Issue found: the `alarm_feed_message_summary` / `alarm_feed_message_to_json` doc comments still say "three `payload` oneof cases" (`main.rs:1729,1755`) although the proto now has four; folded into Client.Rust-030's fix. The TLS doc inaccuracy is Client.Rust-031. |
## Findings
### Client.Rust-001
@@ -687,3 +706,59 @@ The third error (`BulkReplyKind` enum-variant-names) is also touched by the diff
**Recommendation:** Re-apply Client.Rust-001 (add doc comments on `with_max_grpc_message_bytes` / `max_grpc_message_bytes` in `options.rs`), Client.Rust-002 (drop the `Bulk` suffix from `BulkReplyKind`'s variants so they become `AddItem` / `AdviseItem` / …, or add a narrowly-scoped `#[allow(clippy::enum_variant_names)]` with a reason comment), and Client.Rust-012 (replace `last_deploy.lock().unwrap().clone()` with `*last_deploy.lock().unwrap()` in `galaxy.rs:282`). Verify with `cargo clippy --workspace --all-targets -- -D warnings`. Consider adding a pre-commit / CI gate so the next reviewer never has to discover the regression by running clippy.
**Resolution:** 2026-05-24 — Re-applied all three resolutions. `clients/rust/src/options.rs` now has `///` doc comments on `with_max_grpc_message_bytes` and `max_grpc_message_bytes`. `clients/rust/src/galaxy.rs:282` uses `*self.state.last_deploy.lock().unwrap()` instead of `.clone()`. `clients/rust/src/session.rs`'s `BulkReplyKind` variants are renamed to `AddItem` / `AdviseItem` / `RemoveItem` / `UnAdviseItem` / `Subscribe` / `Unsubscribe` (no shared `Bulk` suffix), with the call sites in `add_item_bulk` / `advise_item_bulk` / `remove_item_bulk` / `un_advise_item_bulk` / `subscribe_bulk` / `unsubscribe_bulk` updated accordingly. The sibling `BulkWriteReplyKind` already had non-suffix-sharing variants (`Write` / `Write2` / `WriteSecured` / `WriteSecured2`) and required no rename. `cargo clippy --workspace --all-targets -- -D warnings` is clean at HEAD.
### Client.Rust-030
| Field | Value |
|---|---|
| Severity | High |
| Category | Correctness & logic bugs |
| Location | `clients/rust/crates/mxgw-cli/src/main.rs:1731,1757` (origin: `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:909-924`, added in commit `1d85db7`) |
| Status | Resolved |
**Description:** The `mxgw-cli` binary does not compile at HEAD `410acc9`. `cargo check --workspace`, `cargo clippy --workspace --all-targets -- -D warnings`, `cargo build --workspace`, and `cargo test --workspace` all fail with a hard `E0004` (non-exhaustive patterns), so the entire documented Rust build/test/clippy workflow that CLAUDE.md mandates is broken:
```
error[E0004]: non-exhaustive patterns: `&Some(...alarm_feed_message::Payload::ProviderStatus(_))` not covered
--> crates/mxgw-cli/src/main.rs:1731:11
error[E0004]: non-exhaustive patterns: `&Some(...alarm_feed_message::Payload::ProviderStatus(_))` not covered
--> crates/mxgw-cli/src/main.rs:1757:11
```
The alarm-provider-fallback contract change (`1d85db7`, within the reviewed range) added a fourth `AlarmFeedMessage.payload` oneof arm — `AlarmProviderStatus provider_status = 4`. tonic-build regenerates the Rust enum with the new `ProviderStatus` variant, but `alarm_feed_message_summary` (`main.rs:1731`) and `alarm_feed_message_to_json` (`main.rs:1757`) each `match &message.payload` exhaustively over only `ActiveAlarm` / `SnapshotComplete` / `Transition` / `None` with no wildcard arm. Because they are exhaustive matches on a now-larger enum, the binary fails to compile rather than silently mishandling the new variant. The library crate (`zb-mom-ww-mxgateway-client`) itself compiles cleanly and all 32 of its tests pass; the break is confined to the CLI — but the CLI is the cross-language e2e matrix driver, so the whole `clients/rust` workspace is unbuildable and no Rust e2e smoke can run against the gateway at this commit. This is the alarm-surface gap the review request asked to check: the `ProviderStatus` payload is unhandled in the only place the Rust client renders the alarm feed.
**Recommendation:** Add a `Some(alarm_feed_message::Payload::ProviderStatus(status))` arm to both `alarm_feed_message_summary` and `alarm_feed_message_to_json` (render the provider-status fields — mode, degraded/provenance, reference — consistent with how the .NET/Go/Java/Python CLIs serialise it so the cross-language parity matcher recognises the payload). While there, update the two doc comments that still say "three `payload` oneof cases" (`main.rs:1729,1755`) to four. Verify with `cargo clippy --workspace --all-targets -- -D warnings` and `cargo test --workspace`. Consider a CI gate so a contract change that adds a oneof arm cannot leave the Rust CLI unbuildable again.
**Resolution:** 2026-06-15 — Root cause confirmed: the contract's new fourth `AlarmFeedMessage.payload` oneof arm (`AlarmProviderStatus provider_status`, proto fields `mode`/`degraded`/`reason`/`since`) left both `match &message.payload` blocks non-exhaustive (`E0004`). Added a `Some(alarm_feed_message::Payload::ProviderStatus(status))` arm to both `alarm_feed_message_summary` (`mode`/`degraded`/`reason` one-liner) and `alarm_feed_message_to_json` (a `providerStatus` object with `mode`/`degraded`/`reason`/`since`), added an `AlarmEnumName::provider_mode` enum-name helper consistent with the existing `condition_state`/`transition_kind` renderers, and updated the summary doc comment to "four payload oneof cases". No `_ => {}` wildcard. Test: `alarm_feed_provider_status_renders_in_summary_and_json` (in `crates/mxgw-cli/src/main.rs`). All four cargo commands now pass.
### Client.Rust-031
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Error handling & resilience |
| Location | `clients/rust/src/options.rs:196-240` (`build_tls_config`); `clients/rust/Cargo.toml:40` (tonic features); docs: `clients/rust/src/options.rs:76-101`, `clients/rust/README.md` (TLS trust section), `clients/rust/crates/mxgw-cli/src/main.rs:429-431`, `clients/rust/RustClientDesign.md:202` |
| Status | Resolved |
**Description:** The new strict-verification escape hatch does not do what it documents. `build_tls_config` only configures trust roots when a CA file is pinned: with `require_certificate_validation(true)` and no `ca_file`, it returns a bare `ClientTlsConfig::new()` and never calls `.with_native_roots()`, `.with_webpki_roots()`, or `.with_enabled_roots()`. The crate also enables only `tonic` feature `tls-ring` (`Cargo.toml:40`) — neither `tls-native-roots` nor `tls-webpki-roots` is on, so even if the code wanted to enable system roots the methods are feature-gated out. A `ClientTlsConfig` with zero trust anchors rejects every server certificate during the rustls handshake, so the strict path cannot connect to any TLS gateway — not even one whose certificate is genuinely chained to a system root. Yet `with_require_certificate_validation`'s doc comment (`options.rs:80-89`), the README "TLS trust (pin-only)" section, the `--require-certificate-validation` CLI flag help (`main.rs:429-431`), and `RustClientDesign.md:202` all tell the user this option will "verify against the system trust roots." The documented behaviour is unreachable; the only working TLS path is CA pinning (`with_ca_file`).
This is masked by the tests: `tls_with_require_certificate_validation_does_not_short_circuit` (`tests/tls.rs`) dials a dead address (`https://127.0.0.1:1`) and only asserts the no-CA guard error does *not* fire — it never reaches a handshake, so the absent-roots defect is invisible to the suite.
**Recommendation:** Either (a) make the strict path actually load system roots — add the `tls-native-roots` (and/or `tls-webpki-roots`) feature to the `tonic` dependency and call `tls = tls.with_native_roots()` (or `.with_enabled_roots()`) in the `require_certificate_validation == true && ca_file.is_none()` branch of `build_tls_config` — and add a test that pins a self-signed cert as a CA and asserts a system-root-only connection to that same server is *rejected* (proving roots are actually consulted); or (b) if loading system roots is intentionally out of scope for v1, correct every doc site (the `with_require_certificate_validation` doc comment, README, CLI flag help, and `RustClientDesign.md`) to state that the strict flag does not currently enable any trust roots and that CA pinning is the only supported TLS path. Option (a) is the better fix because the flag otherwise has no working effect.
**Resolution:** 2026-06-15 — Took option (a). Root cause confirmed: strict-on/no-CA returned a bare `ClientTlsConfig::new()` with zero trust anchors and the crate only enabled tonic `tls-ring`, so the documented "verify against the system trust roots" path could never validate any certificate. Added `tls-native-roots` to the `tonic` features in `Cargo.toml` and refactored `build_tls_config` to compute the trust posture via a new pure `tls_trust_decision` helper returning `TlsTrustDecision::{None,PinnedCa,SystemRoots,RejectNoCa}`; the `SystemRoots` branch now calls `ClientTlsConfig::with_native_roots()` so a cert chaining to an OS-trusted root validates. Corrected every doc site to state the strict flag verifies against OS roots (not a bare self-signed cert, which still needs `with_ca_file`): the `with_require_certificate_validation` doc comment and `build_tls_config` docs (`options.rs`), README "TLS trust" section, and `RustClientDesign.md` "Trust posture"; the CLI flag help was already accurate. TDD: added failing-first unit tests then the fix — `strict_without_ca_uses_system_roots`, `lenient_without_ca_is_rejected`, `pinned_ca_uses_pinned_trust`, `plaintext_needs_no_tls` (in `src/options.rs`). All four cargo commands pass.
### Client.Rust-032
| Field | Value |
|---|---|
| Severity | Low |
| Category | Design-document adherence |
| Location | `clients/rust/RustClientDesign.md`; surface in `clients/rust/src/galaxy.rs:281-379` |
| Status | Resolved |
**Description:** The diff under review adds substantial new public Galaxy SDK surface — `GalaxyClient::browse`, `GalaxyClient::browse_children_raw`, the `BrowseChildrenOptions` filter struct, and the `LazyBrowseNode` lazy walker (`object`, `has_children_hint`, `children`, `is_expanded`, `expand`) — none of which is described in `RustClientDesign.md`. The README was updated with a "Browsing lazily" / "High-level walker" section, but CLAUDE.md requires the design docs to change in the same change as the public API. A reader consulting the detailed design to understand the Galaxy client surface will not learn that lazy browsing, sibling pagination, the `child_has_children` hint, or the idempotent `expand` contract exist.
**Recommendation:** Add a "Lazy browse" subsection to the Galaxy section of `RustClientDesign.md` enumerating `browse`, `browse_children_raw`, `BrowseChildrenOptions` (its filter fields and AND semantics), and `LazyBrowseNode` (the `Arc`-shared clone semantics, the idempotent single-RPC `expand`, the `has_children_hint`, and the internal paged `BrowseChildren` loop with its repeated-page-token guard). Cross-reference `docs/GalaxyRepository.md#browsechildren` for the wire-level request/filter semantics the README already links.
**Resolution:** 2026-06-15 — Confirmed by inspection that `RustClientDesign.md` had no Galaxy library-API coverage at all. Added a new "Galaxy Repository" section documenting `browse`, `browse_children_raw`, the `BrowseChildrenOptions` filter struct (all six fields, AND combination semantics, `include_attributes` tri-state), and `LazyBrowseNode` (`Arc`-shared clone semantics, `has_children_hint`, the idempotent single-RPC `expand` under an async mutex with page size 500, and the repeated-page-token `Error::InvalidArgument` guard), cross-referencing `docs/GalaxyRepository.md#browsechildren`. Also noted the fourth alarm `provider_status` oneof case in the Alarms section while resolving Client.Rust-030. Doc-only change verified by inspection; design-doc anchor target confirmed present.
+69 -2
View File
@@ -4,8 +4,8 @@
|---|---|
| Module | `src/ZB.MOM.WW.MxGateway.Contracts` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
@@ -50,6 +50,43 @@ Python and Go descriptors. No fields renumbered or repurposed.
| 9 | Testing coverage | No issues found — `ProtobufContractRoundTripTests` and `GatewayContractInfoTests` continue to pin the protocol version; new `QueryActiveAlarmsRequest` lacks a round-trip test but the RPC type is generated and exercised end-to-end by the gRPC client tests in each language. |
| 10 | Documentation & comments | Issues found: Contracts-017 (the `rpc QueryActiveAlarms` comment block does not mention the `alarm_filter_prefix` request field). |
#### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9` scoped to the contract changes since `42b0037`
(`git diff 42b0037..HEAD -- src/ZB.MOM.WW.MxGateway.Contracts/`). The window
contains two unrelated additive contract features. The brief targets the
**alarm-provider fallback** surface in `mxaccess_gateway.proto`: the new
`AlarmProviderMode` enum (`UNSPECIFIED=0`/`ALARMMGR=1`/`SUBTAG=2`), the
`AlarmSubtagTarget` watch-list message, `AlarmFailoverConfig`, the three new
`SubscribeAlarmsCommand` fields (`forced_mode=2`, `watch_list=3`, `failover=4`),
the `OnAlarmProviderModeChangedEvent` (`MxEvent.body` oneof tag 25,
`MxEventFamily=6`), the `degraded=14`/`source_provider=15` provenance fields on
`OnAlarmTransitionEvent` **and** `ActiveAlarmSnapshot`, and the
`AlarmFeedMessage.provider_status=4` oneof case carrying `AlarmProviderStatus`.
The same window also adds the Galaxy `BrowseChildren` lazy-browse RPC
(`galaxy_repository.proto`) and three XML doc comments on `GatewayContractInfo`
constants — both outside the brief's alarm focus but checked for additive-only
hygiene (clean). `Generated/*.cs` is build output and was not reviewed as
hand-written. `mxaccess_worker.proto` is unchanged (the alarm additions live in
the gateway proto the worker imports — matches the design doc's Superseded note).
Verified against `docs/plans/2026-06-13-alarm-subtag-fallback-design.md`,
`docs/plans/2026-06-15-forced-subtag-mode-fix.md`, and the worker/gateway source
(`AlarmDispatcher.cs:213`, `MxAccessEventMapper.cs:151`, `GatewayAlarmMonitor.cs`).
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | No issues found. Field semantics are correct against source: `AlarmProviderStatus.degraded`/`OnAlarmTransitionEvent.degraded` track `mode == SUBTAG` (worker `AlarmDispatcher.cs:213` sets `SourceProvider = Degraded ? Subtag : Alarmmgr`; gateway `GatewayAlarmMonitor._providerDegraded = toMode == Subtag`). `OnAlarmProviderModeChangedEvent.hresult` "0 on failback" matches the Auto-mode failover/failback path that emits it; forced mode is seeded gateway-side and emits no worker event, so the comment is not contradicted. |
| 2 | mxaccessgw conventions | No issues found. The subtag fallback synthesizes events **inside the worker** and marks every synthesized transition `degraded`, satisfying the CLAUDE.md "gateway forwards only worker-emitted events; synthesizing is an explicit opt-in non-parity mode" rule. `snake_case` fields, `PascalCase` messages, the `ALARM_PROVIDER_MODE_`/`MX_EVENT_FAMILY_` enum-prefix discipline, and the top-of-file wire-compatibility policy block (Contracts-005) are all honoured. Generated code regenerated, not hand-edited. |
| 3 | Concurrency & thread safety | N/A — pure contract definitions plus a static constants class. |
| 4 | Error handling & resilience | No issues found. The degraded/provider-status surface lets clients distinguish the lower-fidelity subtag feed from the authoritative alarmmgr feed; `AlarmProviderStatus` is emitted on stream open and every switch so late joiners learn the mode. |
| 5 | Security | No issues found — none of the new fields carry credentials or secrets. `AlarmSubtagTarget` carries only item-address strings. |
| 6 | Performance & resource management | No issues found. `repeated AlarmSubtagTarget watch_list` is sent once at subscribe time, not per-event; provenance fields are scalars. No hot-path bloat. |
| 7 | Design-document adherence | No drift. The shipped contract matches `docs/plans/2026-06-13-alarm-subtag-fallback-design.md` (including its Superseded notes: additions in the gateway proto, not the worker proto). |
| 8 | Code organization & conventions | No issues found. Every addition uses a new, contiguous field number — `SubscribeAlarmsCommand` 2-4, `MxEvent.body` 25, `MxEventFamily` 6, `OnAlarmTransitionEvent`/`ActiveAlarmSnapshot` 14-15, `AlarmFeedMessage.payload` 4 — with no reuse, renumbering, or type narrowing of any existing field. Enum zero-values are `UNSPECIFIED`. Additive-only invariant preserved. |
| 9 | Testing coverage | Issues found: Contracts-018 — `ProtobufContractRoundTripTests` covers the new `AlarmProviderStatus` (via `AlarmFeedMessage`) and the `OnAlarmTransitionEvent` `degraded`/`source_provider` fields, but has no round-trip coverage for the `ActiveAlarmSnapshot` provenance fields, the `SubscribeAlarmsCommand` extensions (`forced_mode`/`watch_list`/`failover`), or `OnAlarmProviderModeChangedEvent`. |
| 10 | Documentation & comments | Issues found: Contracts-019 — the `ActiveAlarmSnapshot.degraded`/`source_provider` fields carry no in-proto comment while the byte-identical fields on `OnAlarmTransitionEvent` are documented; and the `AlarmProviderMode` enum doc explains `UNSPECIFIED` only for the `forced_mode` use, not for the provenance (`source_provider`) reuse. |
## Findings
### Contracts-001
@@ -341,3 +378,33 @@ additive-only with no reuse, renumbering, or type narrowing.
Re-review: no new findings. Open finding count remains 0. All seventeen
recorded Contracts findings (Contracts-001..017) remain closed
(Resolved / Won't Fix).
### Contracts-018
| Field | Value |
|---|---|
| Severity | Low |
| Category | Testing coverage |
| Location | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs:396` (`ActiveAlarmSnapshot_RoundTripsAllFields`) |
| Status | Resolved |
**Description:** The alarm-provider fallback feature added several new wire fields to `mxaccess_gateway.proto`. `ProtobufContractRoundTripTests` was extended with `AlarmFeedMessage_RoundTripsProviderStatus` (covers `AlarmProviderStatus` + the `provider_status` oneof case) and `Transition_RoundTripsDegradedProvenance` (covers `OnAlarmTransitionEvent.degraded`/`source_provider`), but three pieces of the new contract surface have no round-trip coverage: (a) `ActiveAlarmSnapshot.degraded` (14) / `source_provider` (15) — `ActiveAlarmSnapshot_RoundTripsAllFields` stops at `OperatorComment` (field 11) and never sets or asserts the two new provenance fields, so a future renumber/type change to them would not be caught; (b) the `SubscribeAlarmsCommand` extensions `forced_mode` (2), `watch_list` (3, `repeated AlarmSubtagTarget`), and `failover` (4, `AlarmFailoverConfig`) — no test exercises these, and the live `forced_mode` enum-drop concern that prompted the `2026-06-15-forced-subtag-mode-fix` investigation is exactly the kind of wire shape prior contract tests have been written to pin; (c) `OnAlarmProviderModeChangedEvent` (the `MxEvent.body` oneof tag 25 / `MxEventFamily=6` worker→gateway event). This is the same class of gap previously flagged for the bulk family (Contracts-007 / Contracts-010): new wire shapes shipped without round-trip pinning.
**Recommendation:** Extend `ActiveAlarmSnapshot_RoundTripsAllFields` (or add a focused test) to set and assert `degraded = true` + `source_provider = AlarmProviderMode.Subtag`; add a round-trip test for `SubscribeAlarmsCommand` populating `forced_mode`, a `watch_list` entry (all six `AlarmSubtagTarget` string fields), and a `failover` `AlarmFailoverConfig`; and add a round-trip / `MxEvent` oneof-case test for `OnAlarmProviderModeChangedEvent` pinning `MxEvent.BodyCase == OnAlarmProviderModeChanged` for `MxEventFamily.OnAlarmProviderModeChanged`.
**Resolution:** _(2026-06-15)_ Verified the three coverage gaps against the proto — `ActiveAlarmSnapshot.degraded`/`source_provider` (14/15), `SubscribeAlarmsCommand.forced_mode`/`watch_list`/`failover` (2/3/4), and the `MxEvent.body` oneof tag 25 / `MxEventFamily=6` `OnAlarmProviderModeChangedEvent` were all unpinned. Added three focused round-trip tests to `ProtobufContractRoundTripTests`: `ActiveAlarmSnapshot_RoundTripsDegradedProvenance` (sets/asserts `degraded = true` + `source_provider = AlarmProviderMode.Subtag`), `SubscribeAlarmsCommand_RoundTripsForcedModeWatchListAndFailover` (populates `forced_mode`, a `watch_list` entry with all six `AlarmSubtagTarget` string fields, and a `failover` `AlarmFailoverConfig`), and `MxEvent_RoundTripsOnAlarmProviderModeChangedBody` (pins `MxEvent.BodyCase == OnAlarmProviderModeChanged` + `Family == OnAlarmProviderModeChanged`). All fields round-trip — no contract bug found. The full `ProtobufContractRoundTrip` filter is 49/49 green.
### Contracts-019
| Field | Value |
|---|---|
| Severity | Low |
| Category | Documentation & comments |
| Location | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:850-851` (`ActiveAlarmSnapshot`), `:318-324` (`AlarmProviderMode`) |
| Status | Resolved |
**Description:** Two in-proto documentation gaps on the new alarm-provider surface. (1) `OnAlarmTransitionEvent.degraded` (lines 805-808) and `source_provider` (809-810) carry clear comments ("True when this transition came from the subtag-monitoring fallback … synthesized from data changes, reduced fidelity"; "Which provider produced this transition."), but the byte-identical `ActiveAlarmSnapshot.degraded` (850) and `source_provider` (851) are declared bare with no comment. The two messages model the same provenance concept and a reader of `ActiveAlarmSnapshot` alone gets no signal that a non-`UNSPECIFIED` `source_provider` plus `degraded = true` means the snapshot came from the lower-fidelity subtag source. (2) The `AlarmProviderMode` enum comment (318-319) documents the zero value only for one use site — "UNSPECIFIED on a SubscribeAlarmsCommand means auto: alarmmgr primary with subtag fallback" — but the same enum is reused as the type of the provenance fields `OnAlarmTransitionEvent.source_provider`, `ActiveAlarmSnapshot.source_provider`, `OnAlarmProviderModeChangedEvent.mode`, and `AlarmProviderStatus.mode`. The worker always sets `source_provider` to `ALARMMGR` or `SUBTAG` (never `UNSPECIFIED`; `MxAccessEventMapper.cs:151` defaults to `Alarmmgr`, `AlarmDispatcher.cs:213` picks `Subtag`/`Alarmmgr`), so `UNSPECIFIED` as a provenance value has no defined meaning and the comment does not say so. The ProtobufStyleGuide rule "comment fields carrying MXAccess parity / non-obvious semantics" applies — this is a non-parity provenance marker.
**Recommendation:** (1) Add comments to `ActiveAlarmSnapshot.degraded` / `source_provider` mirroring the wording already on `OnAlarmTransitionEvent` (or a one-line cross-reference). (2) Extend the `AlarmProviderMode` enum comment to note that as a `source_provider` / `mode` provenance value the field is always `ALARMMGR` or `SUBTAG` on the wire and `UNSPECIFIED` should be treated as "unknown / not yet determined", so the zero value is unambiguous at every use site. Comment-only changes; no wire-format impact.
**Resolution:** _(2026-06-15)_ Confirmed both gaps in `mxaccess_gateway.proto`: `ActiveAlarmSnapshot.degraded`/`source_provider` (14/15) were bare while the byte-identical `OnAlarmTransitionEvent` fields were documented, and the `AlarmProviderMode` enum comment only explained `UNSPECIFIED` for the `forced_mode` use. (1) Added comments to `ActiveAlarmSnapshot.degraded`/`source_provider` mirroring the `OnAlarmTransitionEvent` wording (subtag-fallback / reduced-fidelity, always ALARMMGR or SUBTAG, never UNSPECIFIED). (2) Extended the `AlarmProviderMode` enum comment to distinguish its two use sites: as `forced_mode`, `UNSPECIFIED` = auto; as a provenance value (`OnAlarmTransitionEvent.source_provider`, `ActiveAlarmSnapshot.source_provider`, `OnAlarmProviderModeChangedEvent.mode`, `AlarmProviderStatus.mode`) the worker always emits ALARMMGR/SUBTAG and `UNSPECIFIED` should be read as "unknown / not yet determined". Comment-only changes; no wire-format impact. NOTE: on this dev box the `csharp` protoc generator DOES emit proto leading comments into `Generated/MxaccessGateway.cs` `<summary>` XML doc (contrary to the brief's assumption), so the build regenerated `Generated/MxaccessGateway.cs` with the new doc comments only — diff is `///`-comment lines exclusively, zero code/wire/type changes. `dotnet build -f net10.0` succeeds with 0 warnings / 0 errors.
+104 -2
View File
@@ -4,8 +4,8 @@
|---|---|
| Module | `src/ZB.MOM.WW.MxGateway.IntegrationTests` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
@@ -14,6 +14,34 @@
A comprehensive review completes every category, recording "No issues found" where
a category produced nothing rather than leaving it blank.
### 2026-06-15 re-review (commit `410acc9`)
Scope: `git diff 42b0037..HEAD -- src/ZB.MOM.WW.MxGateway.IntegrationTests/`
(5 files). The substantive change is the `DashboardLdapLiveTests` cutover to the
shared `ZB.MOM.WW.Auth.Ldap.LdapAuthService` + `DashboardGroupRoleMapper`
(matching the production `DashboardAuthenticator` ctor split); plus the
`ResolveRepositoryRoot` `stopBoundary` parameter and its new regression test
(IntegrationTests-025 resolution), and XML-doc backfill on
`LiveLdapFactAttribute` / `WorkerLiveMxAccessSmokeTests`. NOTE: the review
brief's "live alarm-subtag smoke test(s)" do not exist in this diff — no new
alarm-subtag tests landed here. Instead the in-window Server alarm-monitor
evolution (`ebf1d95`/`9208225`/`410acc9`) changed `GatewayAlarmMonitor`'s
constructor without updating its IntegrationTests caller, leaving the whole
module non-compiling (IntegrationTests-026).
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | Issue found: IntegrationTests-026 (the entire IntegrationTests project fails to compile at HEAD — `WorkerLiveMxAccessSmokeTests` constructs `GatewayAlarmMonitor` with the stale 3-arg form `(sessionManager, options, logger)` while the production ctor now requires 5 args `(ISessionManager, IAlarmWatchListResolver, GatewayMetrics, IOptions<GatewayOptions>, ILogger)`; verified by `dotnet build` → CS7036). |
| 2 | mxaccessgw conventions | No issues found. Live opt-in gating, `[Collection]`/`[Trait]` discipline, "no synthesized events", and the credential-redaction contract for the LDAP failure-path assertions are all preserved; the cutover keeps the existing skip-by-default behaviour. |
| 3 | Concurrency & thread safety | No issues found in this diff. |
| 4 | Error handling & resilience | No issues found. The `ServerUnreachable` test still asserts the connect failure is absorbed into a `Fail` result; the fail-closed contract now lives in the shared `LdapAuthService` and the test exercises it via `Port = 1`. |
| 5 | Security | No issues found. The wrong-password / unknown-user / unreachable tests still assert no credential leak into `FailureMessage`; the cutover adds no new credential surface and writes no secrets to evidence/probe logs. |
| 6 | Performance & resource management | No issues found. |
| 7 | Design-document adherence | Issue found: IntegrationTests-028 (the live test hand-rolls a field-by-field `LibraryLdapOptions` from the gateway shadow `LdapOptions` defaults instead of binding `MxGateway:Ldap` the way production's `AddZbLdapAuth(configuration, "MxGateway:Ldap")` does, so the live test no longer exercises the production option-binding path and silently omits `ConnectionTimeoutMs` / `ServerCertificateValidationCallback`). |
| 8 | Code organization & conventions | Issue found: IntegrationTests-027 (`DashboardLdapLiveTests` directly consumes `LdapAuthService` / `LdapOptions` from `ZB.MOM.WW.Auth.Ldap` but the IntegrationTests `.csproj` has no direct `PackageReference` — it compiles only via transitive flow through the Server `ProjectReference`). |
| 9 | Testing coverage | No issues found beyond IntegrationTests-026 — the role-claim and stop-boundary assertions added in this window strengthen coverage, but the module cannot build, so none of the IntegrationTests run until IntegrationTests-026 is fixed. |
| 10 | Documentation & comments | Issue found: IntegrationTests-029 (`docs/GatewayTesting.md` "Live LDAP" still describes the old in-`DashboardAuthenticator` branches — "rejected by the candidate bind", "yields no candidate" — that the library cutover moved into the shared `LdapAuthService`; the test comments were updated in this diff but the doc prose was not, contrary to CLAUDE.md's same-commit doc rule). |
### 2026-05-20 re-review (commit `a020350`)
| # | Category | Result |
@@ -506,3 +534,77 @@ The current dev box layout (`C:\Users\dohertj2\Desktop\mxaccessgw`) is safe beca
**Recommendation:** Isolate the walker from any ambient ancestor by either (a) constructing an `isolatedRoot` directly under a drive root and pointing the walker at a chain entirely under it (e.g. create `<isolatedRoot>\level1\level2\level3` and start the walk at `level3`, then assert the throw — the walker stops at the drive root regardless of what is on it), (b) refactoring `ResolveRepositoryRoot` to accept an injectable `stopBoundary` parameter for tests and pass `isolatedRoot` as the boundary, or (c) replacing the `Assert.Throws` shape with an explicit upward-walk check that the test owns. Option (a) is the smallest change: prepend a sentinel — e.g. create a dummy `<isolatedRoot>\sentinel-no-markers` and assert nothing about Temp ancestors — and pass the test only when the walker reaches that sentinel without finding a marker. The current shape is acceptable on the documented dev box but should not be the sole regression coverage for IntegrationTests-022.
**Resolution:** Resolved 2026-05-24 — Took option (b) (inject a stop-boundary) because option (a) does not actually solve the leak: a sentinel chain under `Path.GetTempPath()` still leaves the walker free to ascend past it into Temp / AppData / Users / C:\, so any ambient ancestor with `src/` + `.git`/`.sln`/`.slnx` still wins. Added an optional `stopBoundary` parameter to `IntegrationTestEnvironment.ResolveRepositoryRoot(string startDirectory, string? stopBoundary = null)`. When supplied, the walker checks the boundary for markers and then stops, refusing to ascend past it; production callers (the `MXGATEWAY_LIVE_MXACCESS_WORKER_EXE` resolution path) continue to pass `null` so the walk to drive-root behavior is unchanged. Updated both existing tests (`ResolveRepositoryRoot_AcceptsGitWorktreeFile` and `ResolveRepositoryRoot_NoMarkers_ThrowsInvalidOperationExceptionNamingStartAndMarkers`) to pass their owned temp directory as the boundary, sealing the walker inside a chain the test fully controls. Added a new regression test `ResolveRepositoryRoot_StopBoundary_IsolatesWalkerFromAmbientAncestorMarkers` that deliberately constructs an outer marker-bearing ancestor (`outerRoot/src` + `outerRoot/.git`), an inner boundary, and an isolated start beneath the boundary; first asserts that without the boundary the walker leaks up to `outerRoot` (the precise IntegrationTests-025 failure mode), then asserts that *with* the boundary the same call throws — proving the boundary is the load-bearing isolation. TDD red/green confirmed: the new regression test fails against the pre-fix walker (`Assert.Throws() Failure: No exception was thrown`) and passes once the boundary handling is restored. 
Re-ran the full `IntegrationTestEnvironmentTests` slice with `TMP` / `TEMP` redirected under a deliberately constructed `<temp>\fake-repo-ancestor` directory carrying `src/` and a `.git` file — the original flake repro from the finding — and confirmed all 5 tests pass (the same redirection produced `Assert.Throws() Failure` on the pre-fix code). Build: 0 warnings / 0 errors.
### IntegrationTests-026
| Field | Value |
|---|---|
| Severity | High |
| Category | Correctness & logic bugs |
| Location | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:1098-1101`, `src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs:55-60` |
| Status | Resolved |
**Description:** The entire IntegrationTests project fails to compile at HEAD (`410acc9`). `GatewayServiceFixture` (in `WorkerLiveMxAccessSmokeTests.cs`) constructs the `GatewayAlarmMonitor` it passes into `MxAccessGatewayService` with the stale three-argument form:
```csharp
new ZB.MOM.WW.MxGateway.Server.Alarms.GatewayAlarmMonitor(
sessionManager,
options,
_loggerFactory.CreateLogger<...GatewayAlarmMonitor>())
```
but the production constructor (evolved in-window by `ebf1d95` "monitor resolves watch-list, sends ForcedMode/failover, reflects provider mode into feed + metrics", with later refinements in `9208225` and `410acc9`) now requires **five** parameters: `GatewayAlarmMonitor(ISessionManager sessionManager, IAlarmWatchListResolver watchListResolver, GatewayMetrics metrics, IOptions<GatewayOptions> options, ILogger<GatewayAlarmMonitor> logger)`. `dotnet build src/ZB.MOM.WW.MxGateway.IntegrationTests/...` fails with `CS7036: There is no argument given that corresponds to the required parameter 'options'`. Because this is the only `MxAccessGatewayService` assembly site in the fixture, the whole module — every live opt-in test *and* the non-live `IntegrationTestEnvironmentTests` — cannot build or run. This is a CLAUDE.md "Source Update Workflow" violation: a cross-component Server alarm-monitor change was not propagated to its IntegrationTests caller in the same commit, and "build each affected component" was not honored for the IntegrationTests project. It also silently masks the verification basis for IntegrationTests-022..025's "build is green" resolution claims at this HEAD.
**Recommendation:** Update the `GatewayAlarmMonitor` construction in `GatewayServiceFixture` to the current 5-arg signature: supply an `IAlarmWatchListResolver` (a minimal test stub returning an empty/representative watch list, or the production resolver if cheap to construct), the existing `_metrics` (`GatewayMetrics`), the existing `options` wrapped as `IOptions<GatewayOptions>` (e.g. `Options.Create(...)`), and the logger. Then run `dotnet build src/ZB.MOM.WW.MxGateway.IntegrationTests/...` to confirm 0 errors and `dotnet test ... --filter FullyQualifiedName~IntegrationTestEnvironmentTests` to confirm the non-live tests pass and the live tests still skip cleanly when the env vars are unset. Add a build of the IntegrationTests project to the verification step whenever `GatewayAlarmMonitor` / `MxAccessGatewayService` constructors change.
**Resolution:** Resolved 2026-06-15: Confirmed the project failed to build at HEAD (CS7036 on the stale 3-arg `GatewayAlarmMonitor` ctor call in `GatewayServiceFixture`). Updated the construction to the current 5-arg signature — added a new `TestSupport/EmptyAlarmWatchListResolver` singleton stub (`IAlarmWatchListResolver` returning an empty watch-list, avoiding the production resolver's `IGalaxyRepository` dependency), and passed the fixture's existing `_metrics` (`GatewayMetrics`) and `options` (`IOptions<GatewayOptions>`). `dotnet build` now succeeds with 0 errors/warnings; non-live tests pass (5) and all 15 live tests skip cleanly with the env vars unset.
### IntegrationTests-027
| Field | Value |
|---|---|
| Severity | Low |
| Category | Code organization & conventions |
| Location | `src/ZB.MOM.WW.MxGateway.IntegrationTests/ZB.MOM.WW.MxGateway.IntegrationTests.csproj`, `src/ZB.MOM.WW.MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:4-5,134` |
| Status | Resolved |
**Description:** After the cutover, `DashboardLdapLiveTests` directly consumes `ZB.MOM.WW.Auth.Ldap.LdapAuthService` and `ZB.MOM.WW.Auth.Abstractions.Ldap.LdapOptions` (`using ZB.MOM.WW.Auth.Ldap; using ZB.MOM.WW.Auth.Abstractions.Ldap;` and `new LdapAuthService(ldapOptions)`). But the IntegrationTests `.csproj` declares no direct `PackageReference` to `ZB.MOM.WW.Auth.Ldap` or `ZB.MOM.WW.Auth.Abstractions` — it has only `ProjectReference`s to Contracts and Server. It compiles solely because the Server's `PackageReference`s to those packages flow transitively (the Server csproj sets no `PrivateAssets`). A project that directly references a library's public types should declare a direct dependency on it; the current shape means the build silently depends on the Server never marking those packages `PrivateAssets="compile"` and on the transitive compile-asset flow staying enabled. If either changes, the IntegrationTests build breaks with a confusing CS0246 far from the cause.
**Recommendation:** Add explicit `<PackageReference Include="ZB.MOM.WW.Auth.Ldap" Version="0.1.2" />` and `<PackageReference Include="ZB.MOM.WW.Auth.Abstractions" Version="0.1.2" />` (matching the Server's pinned versions, ideally via a shared `Directory.Packages.props` if central package management is in use) to the IntegrationTests project so its direct use of those types is backed by a direct dependency.
**Resolution:** Resolved 2026-06-15: Confirmed the csproj had only `ProjectReference`s and pulled `LdapAuthService`/`LdapOptions` transitively. Added direct `PackageReference`s `ZB.MOM.WW.Auth.Abstractions` and `ZB.MOM.WW.Auth.Ldap` at `0.1.2` (matching the Server's pinned versions; no central package management exists in this repo). Build remains clean. (The IntegrationTests-028 fix also added `Microsoft.Extensions.Configuration.Json`/`.Binder` at `10.0.7`, pinned to the resolved transitive version to avoid an NU1605 downgrade.)
### IntegrationTests-028
| Field | Value |
|---|---|
| Severity | Low |
| Category | Design-document adherence |
| Location | `src/ZB.MOM.WW.MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:120-161`, `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardServiceCollectionExtensions.cs:35` |
| Status | Resolved |
**Description:** Production wires the shared LDAP provider by binding the `MxGateway:Ldap` configuration section straight onto the shared `LdapOptions` via `AddZbLdapAuth(configuration, "MxGateway:Ldap")`. The live test instead hand-rolls a `LibraryLdapOptions` instance by copying the eleven fields of the gateway *shadow* `LdapOptions` defaults (the `LibraryOptions()` helper). Two consequences:
1. The shared `LdapOptions` actually exposes **thirteen** settable properties — the hand-copy omits `ConnectionTimeoutMs` and `ServerCertificateValidationCallback` (verified by reflecting over the `ZB.MOM.WW.Auth.Abstractions` 0.1.2 assembly). `ConnectionTimeoutMs` has a non-zero default and directly governs the `AuthenticateAsync_ServerUnreachable_FailsWithoutThrowing` (`Port = 1`) test's timing, so the live test exercises the *shared default* timeout, not whatever an operator (or the gateway config) would set — diverging from the production-bound value.
2. It adds a third manual copy of the shadow→shared field mapping on top of the documented "Review C2 DRIFT WARNING" seam in `Server/Configuration/LdapOptions.cs`. A field added to the shared type is silently dropped by this test until someone remembers to extend `LibraryOptions()`.
The prior `DashboardAuthenticator` ctor took `IOptions<GatewayOptions>`, so the old test shared the same options object production used; the cutover lost that fidelity. CLAUDE.md treats the live tests as the parity check against the real seeded directory — they should bind options the way production does.
**Recommendation:** Have the test build the shared `LdapOptions` the same way production does — bind it from the `MxGateway:Ldap` section (e.g. load the gateway `appsettings.json` / a minimal in-memory config and call the same `AddZbLdapAuth` binding path, or resolve the bound `IOptions<LdapOptions>` from a DI container that ran `AddZbLdapAuth`). At minimum, document why the two extra shared fields are intentionally left at their defaults, and add `ConnectionTimeoutMs` to the copy so the unreachable-server test's timeout matches production. Prefer eliminating the hand-copy so the shadow-drift surface does not grow.
**Resolution:** Resolved 2026-06-15: Confirmed by reflecting `ZB.MOM.WW.Auth.Abstractions` 0.1.2 that the shared `LdapOptions` exposes 13 settable properties while the hand-copy populated only 11 (omitting `ConnectionTimeoutMs` and `ServerCertificateValidationCallback`). Eliminated the field-by-field hand-copy: `LibraryOptions()` now binds the real `MxGateway:Ldap` section from the Server's `appsettings.json` (resolved via `IntegrationTestEnvironment.ResolveRepositoryRoot`) onto the shared `LdapOptions` with `configuration.GetSection("MxGateway:Ldap").Get<LdapOptions>()` — the same section/binding path production's `AddZbLdapAuth(configuration, "MxGateway:Ldap")` uses. Verified the bind yields `ConnectionTimeoutMs=10000` (the shared default the unreachable-server test relies on) and the dev directory connection (localhost:3893, Transport=None, AllowInsecure). A new shared field is now picked up automatically rather than silently dropped.
### IntegrationTests-029
| Field | Value |
|---|---|
| Severity | Low |
| Category | Documentation & comments |
| Location | `docs/GatewayTesting.md:218-224` |
| Status | Resolved |
**Description:** The "Live LDAP" section of `docs/GatewayTesting.md` still describes the failure branches in terms of the old `DashboardAuthenticator` internals: "`admin` with a wrong password is rejected by the **candidate bind**" and "an unknown username yields **no candidate**". After the cutover in this diff, the bind/search mechanics (and therefore the "candidate bind" / "candidate is null" branches) live in the shared `LdapAuthService`, not in `DashboardAuthenticator` — which is exactly why the test comments in `DashboardLdapLiveTests.cs` were reworded in this same diff from "Exercises the `LdapException` branch" / "the `candidate is null` branch" to "user-bind-failure branch" / "user-not-found branch". The doc prose was not updated to match. CLAUDE.md requires docs that describe security/auth behavior to change in the same commit as the source; the comments moved but the doc did not, leaving the doc describing branches that no longer exist in `DashboardAuthenticator`.
**Recommendation:** Reword the `docs/GatewayTesting.md` "Live LDAP" failure-branch sentences to describe observable behavior without referencing the now-internal "candidate bind" mechanics (e.g. "a wrong password is rejected without leaking the password", "an unknown username fails authentication"), and note that bind/search is delegated to the shared `ZB.MOM.WW.Auth.Ldap` provider so the prose stays accurate after the cutover.
**Resolution:** Resolved 2026-06-15: Reworded the "Live LDAP" failure-branch prose to describe observable behavior ("fails authentication without leaking the password", "an unknown username fails authentication") instead of the now-internal "candidate bind" / "no candidate" mechanics, and added a sentence noting `DashboardAuthenticator` delegates the bind/search to the shared `ZB.MOM.WW.Auth.Ldap` provider (`LdapAuthService`) and only maps groups to roles — matching the in-source test-comment cutover. Verified by inspection.
+47 -11
View File
@@ -10,17 +10,17 @@ Each module's `findings.md` is the source of truth; this file is generated from
| Module | Reviewer | Date | Commit | Status | Open | Total |
|---|---|---|---|---|---|---|
| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 21 |
| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 27 |
| [Client.Java](Client.Java/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 36 |
| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 26 |
| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 29 |
| [Contracts](Contracts/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 17 |
| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 25 |
| [Server](Server/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 50 |
| [Tests](Tests/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 31 |
| [Worker](Worker/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 25 |
| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-05-24 | `42b0037` | Re-reviewed | 0 | 30 |
| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 25 |
| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 29 |
| [Client.Java](Client.Java/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 39 |
| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 31 |
| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 32 |
| [Contracts](Contracts/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 19 |
| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 29 |
| [Server](Server/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 53 |
| [Tests](Tests/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 35 |
| [Worker](Worker/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 28 |
| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-06-15 | `410acc9` | Re-reviewed | 0 | 33 |
## Pending findings
@@ -38,6 +38,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Go-001 | High | Resolved | Correctness & logic bugs | `clients/go/mxgateway/errors.go:88-93`, `clients/go/mxgateway/errors.go:117-128` |
| Client.Java-013 | High | Resolved | Testing coverage | `clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java:212-304`, `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1214-1244` |
| Client.Java-032 | High | Resolved | Documentation & comments | `clients/java/README.md:182-183` |
| Client.Java-039 | High | Resolved | Correctness & logic bugs | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1699` (origin: `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto`, `AlarmFeedMessage.payload` provider-status arm added in commit `1d85db7`) |
| Client.Python-018 | High | Resolved | Code organization & conventions | `clients/python/pyproject.toml:11` |
| Client.Python-022 | High | Resolved | Documentation & comments | `clients/python/README.md:201-202`, `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:389-420` |
| Client.Rust-001 | High | Resolved | mxaccessgw conventions | `clients/rust/src/options.rs:98,143` |
@@ -46,8 +47,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Rust-012 | High | Resolved | mxaccessgw conventions | `clients/rust/src/galaxy.rs:282` |
| Client.Rust-013 | High | Resolved | mxaccessgw conventions | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:414-424` (origin); `clients/rust/src/generated.rs:11-31` (suppression site) |
| Client.Rust-029 | High | Resolved | mxaccessgw conventions | `clients/rust/src/options.rs:98,143`; `clients/rust/src/galaxy.rs:282`; `clients/rust/src/session.rs:664-671` |
| Client.Rust-030 | High | Resolved | Correctness & logic bugs | `clients/rust/crates/mxgw-cli/src/main.rs:1731,1757` (origin: `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:909-924`, added in commit `1d85db7`) |
| IntegrationTests-001 | High | Resolved | Design-document adherence | `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs:7`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs` |
| IntegrationTests-002 | High | Resolved | Design-document adherence | `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:13`, `src/MxGateway.Server/Configuration/LdapOptions.cs:27` |
| IntegrationTests-026 | High | Resolved | Correctness & logic bugs | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:1098-1101`, `src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs:55-60` |
| Server-003 | High | Resolved | Security | `src/MxGateway.Server/Dashboard/DashboardAuthorizationHandler.cs:39,54-59`, `src/MxGateway.Server/Dashboard/DashboardAuthenticator.cs:236-258` |
| Server-017 | High | Resolved | Security | `src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs:13-27`, `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:173-247`, `docs/Authorization.md:108-110` |
| Tests-001 | High | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:483-489` |
@@ -55,16 +58,19 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Worker-001 | High | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs:204-207` |
| Worker-002 | High | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:545-549` |
| Worker-003 | High | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:399-403`, `:416-419` |
| Worker-026 | High | Resolved | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs:289-338`, `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessStaSession.cs:307-320` |
| Worker.Tests-001 | High | Resolved | Testing coverage | `src/MxGateway.Worker.Tests/Sta/` (no `StaMessagePumpTests.cs`) |
| Worker.Tests-002 | High | Resolved | Testing coverage | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs`, `src/MxGateway.Worker.Tests/MxAccess/MxAccessEventMapperTests.cs` |
| Client.Dotnet-001 | Medium | Resolved | Error handling & resilience | `clients/dotnet/MxGateway.Client/GrpcMxGatewayClientTransport.cs:190-199`, `clients/dotnet/MxGateway.Client/GrpcGalaxyRepositoryClientTransport.cs:131-140` |
| Client.Dotnet-002 | Medium | Resolved | Testing coverage | `clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs:145-148`, `clients/dotnet/MxGateway.Client.Tests/MxGatewayClientSessionTests.cs:236-256` |
| Client.Dotnet-003 | Medium | Resolved | Concurrency & thread safety | `clients/dotnet/MxGateway.Client/MxGatewaySession.cs:659-663`, `clients/dotnet/MxGateway.Client/MxGatewayClient.cs:230-240` |
| Client.Dotnet-018 | Medium | Resolved | Documentation & comments | `clients/dotnet/README.md:137-138` |
| Client.Dotnet-022 | Medium | Resolved | mxaccessgw conventions | `clients/dotnet/Directory.Build.props:1-21` |
| Client.Go-002 | Medium | Resolved | Error handling & resilience | `clients/go/mxgateway/session.go:440-516` |
| Client.Go-003 | Medium | Resolved | Correctness & logic bugs | `clients/go/cmd/mxgw-go/main.go:517-532` |
| Client.Go-022 | Medium | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:398-412,417-519` |
| Client.Go-023 | Medium | Resolved | Concurrency & thread safety | `clients/go/cmd/mxgw-go/main.go:604-606,616-632` |
| Client.Go-028 | Medium | Resolved | Correctness & logic bugs | `scripts/tag-go-module.ps1:42-46` |
| Client.Java-001 | Medium | Resolved | Security | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java:30-32` |
| Client.Java-002 | Medium | Resolved | Concurrency & thread safety | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:31,66-92` |
| Client.Java-003 | Medium | Resolved | mxaccessgw conventions | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:119-140` |
@@ -77,12 +83,15 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Java-028 | Medium | Resolved | Documentation & comments | `clients/java/JavaClientDesign.md:23-27` |
| Client.Java-033 | Medium | Resolved | Correctness & logic bugs | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1078-1098` |
| Client.Java-034 | Medium | Resolved | Correctness & logic bugs | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:182-198` |
| Client.Java-037 | Medium | Resolved | Documentation & comments | `clients/java/README.md:138-149` |
| Client.Python-003 | Medium | Resolved | Error handling & resilience | `clients/python/src/mxgateway/client.py:125-137,155-173` |
| Client.Python-005 | Medium | Resolved | Performance & resource management | `clients/python/src/mxgateway/galaxy.py:117-140` |
| Client.Python-009 | Medium | Resolved | Testing coverage | `clients/python/tests/` |
| Client.Python-013 | Medium | Resolved | Security | `clients/python/src/mxgateway_cli/commands.py:757-762` |
| Client.Python-023 | Medium | Resolved | Security | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:901-906` |
| Client.Python-024 | Medium | Resolved | Code organization & conventions | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:13,48-119` |
| Client.Python-027 | Medium | Resolved | Security | `clients/python/src/zb_mom_ww_mxgateway/client.py:36-54`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:47-66`, `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:165-172,918-930` |
| Client.Python-028 | Medium | Resolved | Error handling & resilience | `clients/python/src/zb_mom_ww_mxgateway/options.py:120-130`, `clients/python/src/zb_mom_ww_mxgateway/client.py:59`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:71` |
| Client.Rust-005 | Medium | Resolved | Correctness & logic bugs | `clients/rust/src/session.rs:489-520` |
| Client.Rust-006 | Medium | Resolved | Error handling & resilience | `clients/rust/src/session.rs:531-555` |
| Client.Rust-015 | Medium | Resolved | Error handling & resilience | `clients/rust/crates/mxgw-cli/src/main.rs:1053-1070` |
@@ -90,6 +99,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Rust-018 | Medium | Resolved | Error handling & resilience | `clients/rust/crates/mxgw-cli/src/main.rs:1098-1170`; `scripts/bench-read-bulk.ps1:347-365`; siblings: `clients/go/cmd/mxgw-go/main.go:600-648`, `clients/python/src/mxgateway_cli/commands.py:614-662`, `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:685-770`, `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:855-940` |
| Client.Rust-022 | Medium | Resolved | Correctness & logic bugs | `clients/rust/src/session.rs:369-391,403-420,427-444,452-469,476-493,631-696,706-724` |
| Client.Rust-024 | Medium | Resolved | Testing coverage | `clients/rust/tests/client_behavior.rs:405-415`; `clients/rust/src/session.rs:369-493`; `clients/rust/src/client.rs:265-291`; `clients/rust/crates/mxgw-cli/src/main.rs:1310-1505` |
| Client.Rust-031 | Medium | Resolved | Error handling & resilience | `clients/rust/src/options.rs:196-240` (`build_tls_config`); `clients/rust/Cargo.toml:40` (tonic features); docs: `clients/rust/src/options.rs:76-101`, `clients/rust/README.md` (TLS trust section), `clients/rust/crates/mxgw-cli/src/main.rs:429-431`, `clients/rust/RustClientDesign.md:202` |
| Contracts-002 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:384-385`, `:95` |
| Contracts-009 | Medium | Resolved | Design-document adherence | `docs/Contracts.md:13-24` |
| IntegrationTests-003 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:89-97` |
@@ -113,6 +123,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Server-033 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:265-323` (`TryRestoreFromDiskAsync`), `:84-99` (`_firstLoad` / `WaitForFirstLoadAsync`); `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:141-163` (`WaitForCacheBootstrap`) |
| Server-038 | Medium | Resolved | Security | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/Hubs/EventsHub.cs:23-44` |
| Server-044 | Medium | Resolved | Correctness & logic bugs | `src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionManager.cs:216-254` |
| Server-051 | Medium | Resolved | Error handling & resilience | `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:64-78` |
| Tests-003 | Medium | Resolved | Performance & resource management | `src/MxGateway.Tests/Security/Authentication/SqliteAuthStoreTests.cs:170-176`, `src/MxGateway.Tests/Security/Authentication/ApiKeyAdminCliRunnerTests.cs:252-258` |
| Tests-004 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs` |
| Tests-005 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/EventStreamServiceTests.cs:239-261`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` |
@@ -122,6 +133,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Tests-020 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs:275-347`, `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:803-829` |
| Tests-026 | Medium | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Grpc/EventStreamServiceTests.cs`, `src/ZB.MOM.WW.MxGateway.Server/Grpc/EventStreamService.cs:123-126` |
| Tests-027 | Medium | Resolved | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:199-240`, `src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs:8,73,246-251` |
| Tests-032 | Medium | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs:435-441`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmFailoverEndToEndTests.cs` |
| Worker-004 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:565-588` |
| Worker-005 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:205-258` (production alarm poll loop) |
| Worker-006 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:117-124`, `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:386-491` |
@@ -130,6 +142,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Worker-016 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:261-265` |
| Worker-017 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/Sta/StaRuntime.cs:280-288`, `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:602-631` |
| Worker-023 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:610-668`, `src/MxGateway.Worker/MxAccess/MxAccessCommandExecutor.cs:124-153` |
| Worker-027 | Medium | Resolved | Error handling & resilience | `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SyntheticAlarmGuid.cs:38-40` |
| Worker.Tests-003 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/Sta/StaRuntimeTests.cs:46-48` |
| Worker.Tests-004 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs:281-329` |
| Worker.Tests-005 | Medium | Resolved | Performance & resource management | `src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs:20-31,103-105`, `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:28-31` |
@@ -138,6 +151,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Worker.Tests-016 | Medium | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandExecutorTests.cs:317-393` |
| Worker.Tests-017 | Medium | Resolved | Testing coverage | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs` |
| Worker.Tests-018 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker.Tests/MxAccess/MxAccessLiveComCreationTests.cs:18-31, 35-73, 75-145, 148-220, 222-342` |
| Worker.Tests-031 | Medium | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs` (all `FailoverSettings` constructions) |
| Client.Dotnet-004 | Low | Resolved | Error handling & resilience | `clients/dotnet/MxGateway.Client/MxGatewayClient.cs:283-294`, `clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs:392-403` |
| Client.Dotnet-005 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/MxGateway.Client/MxGatewaySession.cs:82,124,175` |
| Client.Dotnet-006 | Low | Resolved | Code organization & conventions | `clients/dotnet/MxGateway.Client/MxGatewayClientOptions.cs:50`, `clients/dotnet/MxGateway.Client/MxGatewayClientContractInfo.cs:10-14` |
@@ -155,6 +169,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Dotnet-019 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs:745` |
| Client.Dotnet-020 | Low | Resolved | Error handling & resilience | `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs:792-810`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs:774-780` |
| Client.Dotnet-021 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs:487`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs:715` |
| Client.Dotnet-023 | Low | Resolved | Code organization & conventions | `clients/dotnet/Directory.Build.props:17`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/IMxGatewayCliClient.cs:6`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Tests/*.cs` |
| Client.Dotnet-024 | Low | Resolved | Code organization & conventions | `clients/dotnet/Directory.Build.props:12`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client/ZB.MOM.WW.MxGateway.Client.csproj:19-24` |
| Client.Dotnet-025 | Low | Resolved | Concurrency & thread safety | `clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs:38,41,54,82,94` |
| Client.Go-004 | Low | Resolved | mxaccessgw conventions | `clients/go/mxgateway/alarms_test.go:153-154`, `clients/go/mxgateway/galaxy_test.go:58-59` |
| Client.Go-005 | Low | Resolved | Design-document adherence | `clients/go/mxgateway/client.go:64,68`, `clients/go/mxgateway/galaxy.go:83,87` |
| Client.Go-006 | Low | Resolved | Error handling & resilience | `clients/go/mxgateway/errors.go:9-130` |
@@ -177,6 +194,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Go-025 | Low | Resolved | Correctness & logic bugs | `clients/go/mxgateway/session.go:395-485,495-525` |
| Client.Go-026 | Low | Resolved | Error handling & resilience | `clients/go/cmd/mxgw-go/main.go:1196-1222` |
| Client.Go-027 | Low | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:1195-1206` |
| Client.Go-029 | Low | Resolved | Documentation & comments | `clients/go/README.md:300-303` |
| Client.Java-006 | Low | Resolved | Performance & resource management | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:323-328`, `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java:279-284` |
| Client.Java-007 | Low | Resolved | Testing coverage | `clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/` |
| Client.Java-008 | Low | Resolved | Error handling & resilience | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:298-304` |
@@ -199,6 +217,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Java-031 | Low | Resolved | mxaccessgw conventions | `clients/java/README.md:13,17,26` |
| Client.Java-035 | Low | Resolved | Testing coverage | `clients/java/zb-mom-ww-mxgateway-client/src/test/java/com/zb/mom/ww/mxgateway/client/MxGatewayClientSessionTests.java` |
| Client.Java-036 | Low | Resolved | Code organization & conventions | `clients/java/zb-mom-ww-mxgateway-client/src/main/java/com/zb/mom/ww/mxgateway/client/MxGatewayAlarmFeedSubscription.java`, `MxGatewayEventSubscription.java`, `MxGatewayActiveAlarmsSubscription.java`, `DeployEventSubscription.java` |
| Client.Java-038 | Low | Resolved | Code organization & conventions | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1347-1393` |
| Client.Python-001 | Low | Resolved | Documentation & comments | `clients/python/pyproject.toml:8,25`, `clients/python/src/mxgateway_cli/commands.py:25` |
| Client.Python-002 | Low | Resolved | Code organization & conventions | `clients/python/src/mxgateway/__init__.py:27` |
| Client.Python-004 | Low | Resolved | Correctness & logic bugs | `clients/python/src/mxgateway_cli/commands.py:386,402-404` |
@@ -217,6 +236,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Python-021 | Low | Resolved | Documentation & comments | `clients/python/src/mxgateway_cli/commands.py`, `clients/python/README.md:235-258` |
| Client.Python-025 | Low | Resolved | Testing coverage | `clients/python/tests/test_cli.py`, `clients/python/src/zb_mom_ww_mxgateway/{client.py,session.py}`, `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py` |
| Client.Python-026 | Low | Resolved | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:674-738` |
| Client.Python-029 | Low | Resolved | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway/options.py:78-90` |
| Client.Python-030 | Low | Resolved | Code organization & conventions | `clients/python/pyproject.toml:17` |
| Client.Python-031 | Low | Resolved | Testing coverage | `clients/python/tests/test_tls.py:34`, `clients/python/pyproject.toml:53-56` |
| Client.Rust-004 | Low | Resolved | Documentation & comments | `clients/rust/src/version.rs:7` |
| Client.Rust-007 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:14-55` |
| Client.Rust-008 | Low | Resolved | Performance & resource management | `clients/rust/src/value.rs:161-261` |
@@ -233,6 +255,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Client.Rust-026 | Low | Resolved | Performance & resource management | `clients/rust/crates/mxgw-cli/src/main.rs:1402-1406,1419-1423` |
| Client.Rust-027 | Low | Resolved | Documentation & comments | `clients/rust/.cargo/config.toml:1-9` |
| Client.Rust-028 | Low | Resolved | mxaccessgw conventions | `clients/rust/crates/mxgw-cli/src/main.rs:1126-1166` |
| Client.Rust-032 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md`; surface in `clients/rust/src/galaxy.rs:281-379` |
| Contracts-001 | Low | Resolved | Design-document adherence | `docs/Grpc.md:13` (and `:3`, `:32`, `:39`) |
| Contracts-003 | Low | Won't Fix | Code organization & conventions | `src/MxGateway.Contracts/MxGateway.Contracts.csproj:10` |
| Contracts-004 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/GatewayContractInfo.cs:3-6` |
@@ -248,6 +271,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Contracts-015 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:571-582` |
| Contracts-016 | Low | Resolved | Code organization & conventions | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:31-41` (`QueryActiveAlarmsRequest`) |
| Contracts-017 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:23-29` (the `rpc QueryActiveAlarms` block) |
| Contracts-018 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs:396` (`ActiveAlarmSnapshot_RoundTripsAllFields`) |
| Contracts-019 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:850-851` (`ActiveAlarmSnapshot`), `:318-324` (`AlarmProviderMode`) |
| IntegrationTests-007 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:20`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs:5`, `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:9` |
| IntegrationTests-008 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/LiveLdapFactAttribute.cs`, `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs`, `src/MxGateway.IntegrationTests/LiveMxAccessFactAttribute.cs` |
| IntegrationTests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:372-375` |
@@ -263,6 +288,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| IntegrationTests-023 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:14-29` |
| IntegrationTests-024 | Low | Resolved | Code organization & conventions | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs` (`NullDashboardEventBroadcaster` private class at end of file) |
| IntegrationTests-025 | Low | Resolved | Correctness & logic bugs | `src/ZB.MOM.WW.MxGateway.IntegrationTests/IntegrationTestEnvironmentTests.cs:57-84` (`ResolveRepositoryRoot_NoMarkers_ThrowsInvalidOperationExceptionNamingStartAndMarkers`) |
| IntegrationTests-027 | Low | Resolved | Code organization & conventions | `src/ZB.MOM.WW.MxGateway.IntegrationTests/ZB.MOM.WW.MxGateway.IntegrationTests.csproj`, `src/ZB.MOM.WW.MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:4-5,134` |
| IntegrationTests-028 | Low | Resolved | Design-document adherence | `src/ZB.MOM.WW.MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:120-161`, `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardServiceCollectionExtensions.cs:35` |
| IntegrationTests-029 | Low | Resolved | Documentation & comments | `docs/GatewayTesting.md:218-224` |
| Server-007 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Galaxy/GalaxyHierarchyProjector.cs:55-70` |
| Server-008 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:111-134,160-189` |
| Server-009 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Security/Authentication/AuthSqliteConnectionFactory.cs:15-32` |
@@ -297,6 +325,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Server-048 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:463-498` |
| Server-049 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/IDashboardSessionAdminService.cs:5-18`, `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardSessionAdminService.cs:8-25` |
| Server-050 | Low | Resolved | Error handling & resilience | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardSessionAdminService.cs:42-75,92-125` |
| Server-052 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Server/Alarms/IAlarmWatchListResolver.cs:24-30`, `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:101-114`, `docs/GatewayConfiguration.md:247` |
| Server-053 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs` |
| Tests-007 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:682`, `src/MxGateway.Tests/Gateway/Grpc/GalaxyRepositoryGrpcServiceTests.cs:324`, `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:460`, `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs:233` |
| Tests-008 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs:1-9`, `src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs:1-3`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerAlarmAutoSubscribeTests.cs:1` |
| Tests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:36-37,99,365` |
@@ -317,6 +347,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Tests-029 | Low | Resolved | Error handling & resilience | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardSessionAdminServiceTests.cs:61-106,139-222`, `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardSessionAdminService.cs:77-125` |
| Tests-030 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardApiKeyManagementServiceTests.cs:115-163`, `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardApiKeyManagementService.cs:146-177` |
| Tests-031 | Low | Resolved | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotPublisherTests.cs:22-61` |
| Tests-033 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardAlarmProviderStatus.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardBrowseAndAlarmModelTests.cs:140-195` |
| Tests-034 | Low | Resolved | mxaccessgw conventions | `src/ZB.MOM.WW.MxGateway.Tests/Diagnostics/GatewayLogRedactorSeamTests.cs:1-15` |
| Tests-035 | Low | Resolved | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmFailoverEndToEndTests.cs:315-329` |
| Worker-009 | Low | Resolved | Performance & resource management | `src/MxGateway.Worker/Ipc/WorkerFrameReader.cs:31,49`, `src/MxGateway.Worker/Ipc/WorkerFrameWriter.cs:57-58` |
| Worker-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Conversion/VariantConverter.cs:204-226` |
| Worker-011 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeClient.cs:169-171` |
@@ -331,6 +364,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Worker-022 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs:12`, `:26`, `:49` |
| Worker-024 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs:63-187`, `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:191-323` |
| Worker-025 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:111-117` |
| Worker-028 | Low | Resolved | Code organization & conventions | `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmStateMachine.cs:43-52`, `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmConsumer.cs:70-75` |
| Worker.Tests-008 | Low | Resolved | Documentation & comments | `src/MxGateway.Worker.Tests/Conversion/VariantConverterTests.cs:175-182` |
| Worker.Tests-009 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs`, `AlarmDispatcherTests.cs`, `AlarmCommandExecutorTests.cs`, `AlarmRecordTransitionMapperTests.cs`, `WnWrapAlarmConsumerXmlTests.cs` |
| Worker.Tests-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs:230-258` |
@@ -351,3 +385,5 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Worker.Tests-028 | Low | Resolved | Design-document adherence | `docs/GatewayTesting.md`, `src/MxGateway.Worker.Tests/Probes/` |
| Worker.Tests-029 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs:9`, `src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs:14`, `src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs:10` |
| Worker.Tests-030 | Low | Resolved | Documentation & comments | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:862-890` |
| Worker.Tests-032 | Low | Resolved | Error handling & resilience | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs` |
| Worker.Tests-033 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/SubtagAlarmStateMachineTests.cs` |
+95 -2
View File
@@ -4,8 +4,8 @@
|---|---|
| Module | `src/ZB.MOM.WW.MxGateway.Server` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
@@ -120,6 +120,38 @@ contention nor the bounded `_events` channel saw any changes in this wave.
| 9 | Testing coverage | No issues found in this module — see Tests-026 in the Tests module for the missing EventsHub broadcast coverage. |
| 10 | Documentation & comments | Issues found: Server-040, Server-043 (both documentation gaps). |
### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9` over the `42b0037..HEAD` diff. The diff is large (~137 files)
but the bulk is vendored theme/CSS/font asset swaps (`wwwroot`), generated code, and the
shared-library auth refactor / TLS cert-autogen / lazy-browse / canonical-audit waves that
each carry their own design+plan and were verified in passing only. This pass is scoped to
the **alarm-provider subtag-fallback** wave that the review task called out: the central
`GatewayAlarmMonitor` provider-mode seeding + failover/failback handling, the new
`AlarmWatchListResolver` / `IAlarmWatchListResolver`, `AlarmFallbackOptions` /
`AlarmDiscoveryOptions` / `AlarmSubtagNameOptions` and their `GatewayOptionsValidator`
wiring, the `DashboardAlarmProviderStatus` badge + `AlarmsPage.razor` hub attach, the
provider-mode gauge + `provider_switches` counter (`GatewayMetrics`,
`AlarmProviderSwitchReason`), the Galaxy alarm-attribute discovery query
(`GalaxyRepository.GetAlarmAttributesAsync` / `AlarmAttributesSql` / `GalaxyAlarmAttributeRow`),
the `/auth/login` POST move + configurable `Dashboard:CookieName`, and the
`BrowseChildrenRequest` scope-resolver entry. Prior findings Server-044 through Server-050
are confirmed resolved by the SessionManager/GatewaySession changes in range and remain
closed. New findings filed against this pass: Server-051..053.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | Issues found: Server-051 (`AlarmWatchListResolver.ResolveAsync`'s broad `catch (Exception)` swallows `OperationCanceledException`, contradicting the `IAlarmWatchListResolver` cancellation contract). |
| 2 | mxaccessgw conventions | No issues found — file-scoped namespaces, `sealed`, `Async` suffix, Options pattern, MXAccess-aligned naming all hold; no UI component libraries (badge is Bootstrap-only); the alarm SQL is a parameterless constant; no secret/tag-value logging; well-known reason strings centralised in `AlarmProviderReasons`. |
| 3 | Concurrency & thread safety | No issues found — `_providerMode`/`_providerDegraded`/`_providerReason`/`_providerSince` are read/written only under `_sync`; `BroadcastToAll` runs under `_sync`; the reconcile after a mode change is intentionally awaited outside `_sync` to avoid the documented self-deadlock; the provider-mode gauge is serialized on `GatewayMetrics._syncRoot`. |
| 4 | Error handling & resilience | Issues found: Server-051 (cancellation swallowed in the resolver — also an error-handling/contract concern). |
| 5 | Security | No issues found — `BrowseChildren` runs the same `ResolveBrowseSubtrees()` constraint scoping and `MetadataRead` scope as `DiscoverHierarchy`; the configurable `Dashboard:CookieName` falls back to the canonical default and cannot be blanked; the `/auth/login` POST keeps antiforgery + return-URL sanitisation. |
| 6 | Performance & resource management | No issues found in the alarm-fallback code — discovery is a one-shot per subscribe lifecycle; the watch-list is composed once. |
| 7 | Design-document adherence | No issues found — `docs/GatewayConfiguration.md`, `docs/Metrics.md`, `docs/GalaxyRepository.md`, and the `docs/plans/2026-06-13-alarm-subtag-fallback*` / `2026-06-15-forced-subtag-mode-fix.md` plans were landed in the same range and match the code. |
| 8 | Code organization & conventions | No issues found — new alarm types live under `Alarms/`, options under `Configuration/`, metric helper under `Metrics/`, registered via `AddGatewayAlarms`. |
| 9 | Testing coverage | Issues found: Server-053 (`AlarmWatchListResolver` `ExcludeAttributes`-vs-`IncludeAttributes` precedence and the resolver's cancellation contract are untested, and there is no test for the redundant-mode-change guard). |
| 10 | Documentation & comments | Issues found: Server-052 (`IAlarmWatchListResolver` XML contract claims cancellation propagates while the implementation swallows it; the `Discovery:ExcludeAttributes` doc says "Repository-derived watch-list" while the code also removes matching explicit `IncludeAttributes`). |
## Findings
### Server-001
@@ -929,3 +961,64 @@ Today neither call site has a Blazor error boundary, so an unhandled exception l
**Recommendation:** Add a general `catch (Exception exception)` after the `SessionManagerException` catch in both `CloseSessionAsync` and `KillWorkerAsync`, log a warning (matching the SessionManagerException pattern), and return `DashboardSessionAdminResult.Fail($"{operation} failed unexpectedly. See the gateway log for details.")`. This makes the result type truly the only output the page sees. Add a regression test using a `ThrowingSessionManager` that throws e.g. `InvalidOperationException` from `KillWorkerAsync` and asserts the service returns a failing result rather than propagating.
**Resolution:** 2026-05-24 — Added the recommended general `catch (Exception)` arms to both `DashboardSessionAdminService.CloseSessionAsync` and `KillWorkerAsync` (`src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardSessionAdminService.cs`), placed after the `SessionManagerException` catches and behind a `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) throw;` so caller cancellation still propagates cleanly. The new catches log a warning with actor + session id and return `DashboardSessionAdminResult.Fail("{Operation} failed unexpectedly for session {SessionId}. See the gateway log for details.")`, mirroring the SessionManagerException pattern. Regression tests in `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardSessionAdminServiceTests.cs`: `CloseSessionAsync_WhenManagerThrowsUnexpected_ReturnsFriendlyFail` (the `ISessionManager` throws `InvalidOperationException("unexpected")`) and `KillWorkerAsync_WhenManagerThrowsUnexpected_ReturnsFriendlyFail` (throws `IOException("pipe broken")`); both assert the service returns a failing result with a non-blank message rather than propagating. The fake's new `CloseThrowsUnexpected` / `KillThrowsUnexpected` properties hold the configured exception. Confirmed to fail before the fix (raw exception propagated) and pass after.
### Server-051
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Error handling & resilience |
| Location | `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:64-78` |
| Status | Resolved |
**Description:** `AlarmWatchListResolver.ResolveAsync` wraps the Galaxy Repository discovery call in a bare `catch (Exception ex)` that logs a warning and continues with an empty (config-only) discovery set:
```csharp
try { rows = await _repository.GetAlarmAttributesAsync(cancellationToken)...; }
catch (Exception ex) { _logger.LogWarning(ex, "...continuing with configuration-only watch-list."); rows = []; }
```
`OperationCanceledException` / `TaskCanceledException` derive from `Exception`, so a cancellation triggered while `GetAlarmAttributesAsync` is awaiting SQL is **swallowed**, not propagated. The resolver then returns a (config-only or empty) watch-list as though the call completed normally. This directly contradicts the `IAlarmWatchListResolver.ResolveAsync` XML contract, which states: *"Cancellation is the one exception: a triggered cancellationToken still propagates an OperationCanceledException."* In practice the resolver is called from `GatewayAlarmMonitor.SubscribeAlarmsAsync` on the monitor's lifecycle token; if the gateway is shutting down (or the monitor lifecycle is being torn down) mid-discovery, the resolver hides the cancellation and the monitor proceeds to issue `SubscribeAlarms` with a wrong (empty) watch-list instead of unwinding promptly. The `GalaxyRepository.GetAlarmAttributesAsync` SQL path does honour the token (`OpenAsync(ct)` / `ExecuteReaderAsync(ct)` / `ReadAsync(ct)`), so a real cancellation can land inside this catch.
**Recommendation:** Add a `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { throw; }` ahead of the general catch (or filter the general catch with `when (ex is not OperationCanceledException)`), so cancellation propagates per the documented contract while genuine discovery failures still degrade to a config-only list. Add a regression test that cancels the token mid-`GetAlarmAttributesAsync` and asserts `OperationCanceledException` propagates.
**Resolution:** Resolved 2026-06-15. Confirmed against source: the bare `catch (Exception ex)` swallowed `OperationCanceledException`. Filtered the catch with `when (ex is not OperationCanceledException)` so a real cancellation propagates per the `IAlarmWatchListResolver` contract while genuine discovery failures still degrade to a config-only list. Regression test: `AlarmWatchListResolverTests.ResolveAsync_RepositoryCancelled_PropagatesOperationCanceled` (failed before the fix, passes after).
### Server-052
| Field | Value |
|---|---|
| Severity | Low |
| Category | Documentation & comments |
| Location | `src/ZB.MOM.WW.MxGateway.Server/Alarms/IAlarmWatchListResolver.cs:24-30`, `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:101-114`, `docs/GatewayConfiguration.md:247` |
| Status | Resolved |
**Description:** Two prose-vs-code mismatches in the watch-list resolver:
1. The `IAlarmWatchListResolver.ResolveAsync` XML `<returns>` promises that a triggered `cancellationToken` propagates an `OperationCanceledException`, but the implementation swallows it (see Server-051). Whichever way Server-051 is resolved, exactly one of the doc or the code is currently wrong; right now the doc over-promises.
2. `AlarmDiscoveryOptions.ExcludeAttributes` and `docs/GatewayConfiguration.md:247` both describe `ExcludeAttributes` as removing entries from the **"Repository-derived"** watch-list. The implementation's `ordered.RemoveAll(e => excluded.Contains(e.Reference))` runs over the combined list — Galaxy-Repository rows **and** the explicit `Discovery:IncludeAttributes` entries appended just above it — so an exclude entry that matches an explicit include silently removes that include too. The behaviour is defensible (excludes win) but is not what the "Repository-derived" wording says, and an operator who adds an attribute via `IncludeAttributes` and also lists it in `ExcludeAttributes` would be surprised it disappears.
**Recommendation:** For (1), align the `IAlarmWatchListResolver` doc with whatever Server-051 settles on. For (2), either restrict the exclude to GR-discovered rows (apply `RemoveAll` before appending the `IncludeAttributes` entries) or update the option XML doc and `GatewayConfiguration.md` to say excludes are applied to the merged GR-plus-include list and therefore also suppress matching explicit includes.
**Resolution:** Resolved 2026-06-15. (1) No longer over-promises: the Server-051 fix makes the implementation propagate `OperationCanceledException`, so the `IAlarmWatchListResolver.ResolveAsync` `<returns>` doc is now accurate and was left unchanged. (2) Kept the "excludes win" code behaviour (excludes applied to the merged GR-plus-include list) and corrected the prose to match: `AlarmDiscoveryOptions.ExcludeAttributes` XML doc and `docs/GatewayConfiguration.md:247` now state the exclude runs after the GR rows and explicit `IncludeAttributes` are combined, so an exclude matching an explicit include suppresses it too. The "excludes win" precedence is pinned by `AlarmWatchListResolverTests.ResolveAsync_ExcludeAlsoSuppressesMatchingExplicitInclude`.
### Server-053
| Field | Value |
|---|---|
| Severity | Low |
| Category | Testing coverage |
| Location | `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs` |
| Status | Resolved |
**Description:** The new alarm-fallback surface is broadly well-tested (`AlarmWatchListResolverTests`, `GatewayAlarmMonitorProviderModeTests`, `DashboardBrowseAndAlarmModelTests`, `GalaxyAlarmAttributeMappingTests`, `GatewayOptionsValidatorTests`), but two behaviours that the diff introduced have no coverage:
- **Resolver cancellation contract (Server-051):** no test cancels the token mid-discovery and asserts `OperationCanceledException` propagates. Because the existing `ResolveAsync_RepositoryThrows_LogsAndReturnsConfigOnlySet` asserts the swallow path, a cancellation regression test is precisely the case that would catch the Server-051 bug — and its absence is why the contract violation went unnoticed.
- **Exclude-vs-include precedence (Server-052 item 2):** no test exercises a `Discovery:IncludeAttributes` entry that also appears in `ExcludeAttributes`, so the "excludes also drop explicit includes" behaviour is unpinned and would silently change if the merge order were edited.
Additionally, `GatewayAlarmMonitor.ApplyProviderModeChangeAsync` increments the `mxgateway.alarms.provider_switches` counter and resets `_providerSince` unconditionally on every `OnAlarmProviderModeChanged` event, with no guard for a redundant event whose `toMode` equals the current mode; there is no test asserting the from==to / no-op behaviour either way.
**Recommendation:** Add resolver tests for (a) cancellation propagation and (b) an include that is also excluded; and a `GatewayAlarmMonitorProviderMode` test pinning the provider-switch counter behaviour for a same-mode repeat event (whichever semantics the team intends). These lock down the contracts the Server-051/052 findings expose.
**Resolution:** Resolved 2026-06-15. Added all three missing tests: (a) `AlarmWatchListResolverTests.ResolveAsync_RepositoryCancelled_PropagatesOperationCanceled` (cancellation propagation, also covers Server-051); (b) `AlarmWatchListResolverTests.ResolveAsync_ExcludeAlsoSuppressesMatchingExplicitInclude` (exclude-vs-include precedence, also Server-052 item 2); and (c) `GatewayAlarmMonitorProviderModeTests.ProviderModeChange_RepeatedSameMode_RecordsASwitchForEachEvent`, which pins the existing semantics — each worker-reported `OnAlarmProviderModeChanged` event records a `provider_switches` increment (and resets `_providerSince`) even when `toMode` equals the current mode, since the worker is the authority on when a mode change occurred and the gateway does not synthesize or suppress it.
+92 -2
View File
@@ -4,13 +4,43 @@
|---|---|
| Module | `src/ZB.MOM.WW.MxGateway.Tests` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
## Checklist coverage
### 2026-06-15 re-review (commit `410acc9`)
Re-review of the `42b0037..410acc9` diff (≈57 files), scoped to the alarm-provider
fallback feature: the end-to-end failover/failback lifecycle test
(`AlarmFailoverEndToEndTests`), the provider-mode/metric tests
(`GatewayAlarmMonitorProviderModeTests`), the watch-list resolver tests
(`AlarmWatchListResolverTests`), the validator additions
(`GatewayOptionsValidatorTests` AlarmFallback block), the dashboard badge model
(`DashboardBrowseAndAlarmModelTests`), the alarm metric tests
(`GatewayMetricsTests`), the Galaxy alarm mapper (`GalaxyAlarmAttributeMappingTests`),
and the new `provider_status` / degraded-provenance protobuf round-trips. The
non-alarm churn in the diff (kill/shutdown SessionManager tests closing prior
Tests-028/029, XML-doc-only additions to `SessionManagerBulkTests`/`GatewaySessionTests`,
browse-tab and TLS tests) was walked but is not the review focus.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | No issues found in this diff. The lifecycle test correctly disambiguates the recovery `ProviderStatus` from the baseline by matching on `Reason == "recovered"`; the `ModeString_MapsToForcedProviderMode` `Assert.Empty(WatchList)` is weak (the stub resolver returns `[]` regardless of mode) but not wrong. |
| 2 | mxaccessgw conventions | Issue found: Tests-034 (`GatewayLogRedactorSeamTests.cs` is in the global namespace with redundant `System.Collections.Generic`/`Xunit` usings, `var`, and a non-`sealed` `public class` — the same C# style drift family as the resolved Tests-008). |
| 3 | Concurrency & thread safety | Issue found: Tests-035 (`AlarmFailoverEndToEndTests.DegradedTransition_*`'s second-subscriber `await foreach`-to-`SnapshotComplete` has no `WaitTimeout`, so a regression that never emits `SnapshotComplete` hangs the test instead of failing cleanly). The metric/feed reader races are correctly gated by `baselineReceived` TCS before emitting events. |
| 4 | Error handling & resilience | No issues found in this diff. `AlarmWatchListResolverTests.ResolveAsync_RepositoryThrows_LogsAndReturnsConfigOnlySet` covers the discovery-unavailable degradation path; validator failure paths are well covered. |
| 5 | Security | No issues found in this diff. The redaction seam assertion in `GatewayLogRedactorSeamTests` (despite its style drift) meaningfully pins API-key masking in `ClientIdentity`; secured-bulk credential round-trips are pinned. |
| 6 | Performance & resource management | No issues found in this diff. Monitors/CTSs are disposed; `using GatewayMetrics`/`using GatewayAlarmMonitor` throughout. |
| 7 | Design-document adherence | No issues found in this diff. Tests match the alarm-fallback plan and the forced-vs-failover-degraded badge distinction. |
| 8 | Code organization & conventions | See Tests-034. The two alarm-monitor test files replicate (not share) the `FakeSessionManager`/`StubWatchListResolver` harness; the in-file remark documents this is deliberate to keep the sibling untouched — acceptable, not filed. |
| 9 | Testing coverage | Issues found: Tests-032 (the monitor's `toMode→AlarmProviderSwitchReason` derivation — Subtag→Failover, Alarmmgr→Failback — is untested: `Failback` is asserted nowhere and the monitor tests check only the switch *count*, so a swapped/`Unknown` reason regression passes), Tests-033 (`DashboardAlarmProviderStatus.FromFeed` and its non-provider-status `ArgumentException` guard, the `SinceUtc` mapping, the `DegradedLabel` text, and the `Degraded && Mode==Alarmmgr` guard branch are all uncovered). |
| 10 | Documentation & comments | No issues found in this diff. New alarm test files carry orienting class-level summaries; `GalaxyAlarmAttributeMappingTests`'s "derivation" framing slightly overstates the pass-through mapper but is harmless. |
### 2026-05-24 re-review of the Tests-013..019 batch
This pass (commit `a020350`) re-reviews the module after the Tests-013..019 batch was resolved alongside Server-017, Server-021, and Contracts-010.
| # | Category | Result |
@@ -557,3 +587,63 @@ The cancellation tests for `WorkerClient` in `WorkerClientTests` *do* exercise t
**Recommendation:** (a) The cheap fix: have `ThrowOnceThenYieldSnapshotService` record `_firstThrowAt = DateTimeOffset.UtcNow` immediately before the `throw`, and change the assertion to `secondSubscribeAt - firstThrowAt >= reconnectDelay - 10ms` — the gap then measures only the reconnect delay, eliminating the variable scheduling baseline. (b) The deeper fix: extend `DashboardSnapshotPublisher` to accept an `ITimeProvider`-style delay seam (or a virtual `DelayAsync` hook) so a `ManualTimeProvider` could advance time deterministically. (a) is preferred for now; (b) belongs as a follow-up if more reconnect-loop tests are added.
**Resolution:** 2026-05-24 — Applied option (a). Added `FirstThrowAt` to `ThrowOnceThenYieldSnapshotService` and set it via `FirstThrowAt = DateTimeOffset.UtcNow;` immediately before the first-call `throw`. Removed the pre-`StartAsync` `startedAt` baseline; the assertion now reads `gap = secondSubscribeAt - firstThrowAt` (both timestamps captured inside the fake), and the 10 ms slack absorbs the Windows `Task.Delay` quantum without the variable `StartAsync` / scheduling overhead in the baseline. This is the same flake-isolation pattern Tests-006 / Tests-017 used (measuring only the production delay, not test-side setup). Suite green; the test passes deterministically across repeated runs.
### Tests-032
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Testing coverage |
| Location | `src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs:435-441`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmFailoverEndToEndTests.cs` |
| Status | Resolved |
**Description:** `GatewayAlarmMonitor.HandleProviderModeChanged` derives the provider-switch reason from the target mode: `toMode switch { Subtag => Failover, Alarmmgr => Failback, _ => Unknown }` (lines 435-439), then calls `_metrics.AlarmProviderSwitched(fromModeInt, toModeInt, switchReason)`. No test in the diff asserts this derivation. `GatewayAlarmMonitorProviderModeTests.ProviderModeChange_BroadcastsDegradedStatus_AndIncrementsSwitchMetric` only asserts the switch *count* (`switchCount == 1`) — it never inspects the `from`/`to`/`reason` tags on the measurement. `AlarmFailoverEndToEndTests.ProviderFailoverAndFailback_FullLifecycle` drives both a failover (alarmmgr→subtag) and a failback (subtag→alarmmgr) but asserts only on feed `ProviderStatus` messages, not on the metric tags. The only place the `reason` tag is read is `GatewayMetricsTests.AlarmProviderSwitched_IncrementsCounterWithExpectedTags`, which passes `AlarmProviderSwitchReason.Failover` *explicitly* to the metrics layer — that pins the metrics-side tag formatting, not the monitor's `toMode→reason` mapping. `AlarmProviderSwitchReason.Failback` is asserted nowhere in the suite. A regression that swapped the Failover/Failback arms, or collapsed them to `Unknown`, would pass every existing test while emitting wrong dashboard/observability data for every failback.
**Recommendation:** Extend `GatewayAlarmMonitorProviderModeTests` (or add a failback case) to capture the `reason` tag through a `MeterListener` and assert it equals `"failover"` on an alarmmgr→subtag change and `"failback"` on a subtag→alarmmgr change, mirroring the tag-capturing pattern already in `GatewayMetricsTests.AlarmProviderSwitched_IncrementsCounterWithExpectedTags`. This pins the monitor's `toMode→AlarmProviderSwitchReason` derivation, not just the count.
**Resolution:** 2026-06-15 — Confirmed root cause: the existing monitor tests asserted only the switch *count*, and `Failback` was asserted nowhere in the suite, so a swapped/`Unknown` reason arm would pass. Added `GatewayAlarmMonitorProviderModeTests.ProviderModeChange_FailoverThenFailback_RecordsCorrectReasonTags`, which captures the `reason` tag off the `mxgateway.alarms.provider_switches` counter via a `MeterListener` and drives an alarmmgr→subtag change then a subtag→alarmmgr change, asserting the captured reasons are exactly `["failover", "failback"]`. This pins the monitor's `toMode→AlarmProviderSwitchReason` derivation (`ApplyProviderModeChangeAsync`). Test passes against current production code (no production change); no bug found.
### Tests-033
| Field | Value |
|---|---|
| Severity | Low |
| Category | Testing coverage |
| Location | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardAlarmProviderStatus.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardBrowseAndAlarmModelTests.cs:140-195` |
| Status | Resolved |
**Description:** The three new badge-mapping tests cover `FromProviderStatus` for green (Alarmmgr/not-degraded), amber (Subtag/degraded), and cyan (Subtag/forced). Several adjacent behaviours of the same projection are uncovered: (1) `DashboardAlarmProviderStatus.FromFeed(AlarmFeedMessage)` — the public entry the dashboard SignalR snapshot path actually calls — and its `ArgumentException` thrown when the message is not a `ProviderStatus` payload have zero coverage in the suite (a grep for `FromFeed` across the test project returns no hits). (2) The `SinceUtc` field (`status.Since?.ToDateTimeOffset()`) is never asserted, so a regression dropping or mis-converting the badge timestamp would not be caught. (3) The `DegradedLabel` constant text ("Subtag monitoring (degraded)") is asserted nowhere — the amber test only checks the `bg-warning` CSS class, so a label swap would pass. (4) The `degraded = status.Degraded || status.Mode == AlarmProviderMode.Subtag` guard's second branch (`Degraded == true` while `Mode == Alarmmgr`) — an explicitly-degraded alarmmgr status — is untested, so the "guard against either being set independently" comment in the product code is unverified.
**Recommendation:** Add `FromFeed_NonProviderStatusPayload_Throws` (asserting `ArgumentException`) and `FromFeed_ProviderStatusPayload_ProjectsBadge`; assert `SinceUtc` on a status carrying a `Since` timestamp; assert `model.Label == DashboardAlarmProviderStatus.DegradedLabel` in the amber test; and add a `Degraded=true, Mode=Alarmmgr` case asserting it maps to the degraded (amber) badge per the independent-flag guard.
**Resolution:** 2026-06-15 — Confirmed the four coverage gaps against `DashboardAlarmProviderStatus`. Added to `DashboardBrowseAndAlarmModelTests`: `FromFeed_ProviderStatusPayload_ProjectsBadge` and `FromFeed_NonProviderStatusPayload_Throws` (the latter asserts `ArgumentException` for a `SnapshotComplete` feed message); `FromProviderStatus_WithSinceTimestamp_MapsSinceUtc` (pins `SinceUtc` round-trips the protobuf `Since` timestamp); `FromProviderStatus_Alarmmgr_DegradedFlagSet_WarningBadge` (the `Degraded && Mode==Alarmmgr` independent-flag branch maps to the amber degraded badge); and a `DegradedLabel` text assertion added to the existing amber `FromProviderStatus_Subtag_Degraded_WarningBadge`. All pass against current production code (no production change); no bug found.
### Tests-034
| Field | Value |
|---|---|
| Severity | Low |
| Category | mxaccessgw conventions |
| Location | `src/ZB.MOM.WW.MxGateway.Tests/Diagnostics/GatewayLogRedactorSeamTests.cs:1-15` |
| Status | Resolved |
**Description:** `GatewayLogRedactorSeamTests.cs` diverges from the project's C# style guide and the rest of the test suite: it declares no file-scoped namespace (the class lands in the global namespace, unlike every other test file which sits under `ZB.MOM.WW.MxGateway.Tests.*`); it carries redundant explicit `using System.Collections.Generic;` and `using Xunit;` (both are implicit global usings in this project, enforced elsewhere — see the resolved Tests-008); it uses `var` for `redactor`/`props` where the suite uses explicit types per `docs/style-guides/CSharpStyleGuide.md`; and it declares `public class` rather than the project's `sealed`-by-default convention. The redaction assertion itself is sound (it meaningfully pins API-key masking in `ClientIdentity`), so this is purely the same style-drift family as the previously-filed-and-resolved Tests-008, not a correctness issue.
**Recommendation:** Add `namespace ZB.MOM.WW.MxGateway.Tests.Diagnostics;` (file-scoped), drop the redundant `System.Collections.Generic`/`Xunit` usings, mark the class `public sealed class`, and replace the two `var` declarations with explicit types (`GatewayLogRedactorSeam` / `Dictionary<string, object?>`).
**Resolution:** 2026-06-15 — Confirmed the style drift. Rewrote `GatewayLogRedactorSeamTests.cs` to add the file-scoped `namespace ZB.MOM.WW.MxGateway.Tests.Diagnostics;`, dropped the redundant `using System.Collections.Generic;`/`using Xunit;` (both implicit global usings), marked the class `public sealed class`, and replaced the two `var` declarations with explicit `GatewayLogRedactorSeam` / `Dictionary<string, object?>` types. The single `Redact_MasksApiKeyInClientIdentity` assertion is unchanged and still passes.
### Tests-035
| Field | Value |
|---|---|
| Severity | Low |
| Category | Concurrency & thread safety |
| Location | `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmFailoverEndToEndTests.cs:315-329` |
| Status | Resolved |
**Description:** In `DegradedTransition_CachedThenReplayed_CarriesDegradedAndSourceProviderToNewSubscriber`, the second-subscriber loop iterates `monitor.StreamAsync(null, newStreamCts.Token)` with no timeout, breaking only when a `SnapshotComplete` payload arrives (lines 317-329). Every other wait in this file routes through `WaitForAsync(..., WaitTimeout)` or `Task.WaitAsync(WaitTimeout)`; this `await foreach` does not. If a regression caused the monitor to stop emitting `SnapshotComplete` for a new subscriber (e.g. a snapshot-replay path that throws before the terminal message), the test would hang on the `await foreach` rather than fail with a `TimeoutException`, relying on the xUnit `longRunningTestSeconds` warning or the CI hard-kill instead of a clean assertion failure. The first subscriber in the same test is correctly bounded by `WaitForAsync`.
**Recommendation:** Bound the second-subscriber drain with the same `WaitTimeout` used elsewhere — e.g. link `newStreamCts` to a `CancellationTokenSource.CreateLinkedTokenSource` plus `CancelAfter(WaitTimeout)`, or wrap the drain in a `Task` awaited via `WaitAsync(WaitTimeout)` — so a missing `SnapshotComplete` surfaces as a deterministic failure rather than a hang.
**Resolution:** 2026-06-15 — Confirmed the unbounded `await foreach` in `DegradedTransition_CachedThenReplayed_CarriesDegradedAndSourceProviderToNewSubscriber`. Bounded the second-subscriber drain with a `CancellationTokenSource.CreateLinkedTokenSource(newStreamCts.Token, drainTimeoutCts.Token)` where `drainTimeoutCts.CancelAfter(WaitTimeout)`, and wrapped the loop in a `try/catch (OperationCanceledException) when (drainTimeoutCts.IsCancellationRequested)` that rethrows a `TimeoutException`. A regression that never emits `SnapshotComplete` now fails cleanly instead of hanging. Test still passes.
+84 -2
View File
@@ -4,11 +4,48 @@
|---|---|
| Module | `src/ZB.MOM.WW.MxGateway.Worker.Tests` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
## 2026-06-15 re-review (commit `410acc9`)
Re-review of the alarm-fallback test additions in `git diff 42b0037..HEAD --
src/ZB.MOM.WW.MxGateway.Worker.Tests/`. New unit suites land for the subtag
fallback (`SubtagAlarmConsumerTests`, `SubtagAlarmStateMachineTests`,
`SyntheticAlarmGuidTests`, `LmxSubtagAlarmSourceTests`) and the auto-failover
composite (`FailoverAlarmConsumerTests`); the existing alarm suites are updated
for the `SubscribeAlarmsCommand`-based handler signature, the
`(eq, affinity, comFactory)` handler-factory delegate, and the new
degraded/source-provider fields. Most of the change is genuinely new coverage
plus a large volume of XML-doc additions on existing test doubles (benign).
Findings: the failover state-machine transitions (failover at threshold,
failback after stable probes, intermittent-failure reset, before/after-switch
forwarding, ack delegation, `ProbeOnce`-never-re-Subscribes) are all covered;
the acked latch (`OutOfOrderAckThenClear_StillEmitsAckRtn`), the dup-address
guard (`DuplicateActiveSubtag_Throws`), and the exact-match-vs-substring ack
resolution (`AcknowledgeByName_PrefixNameDoesNotFalseMatch`,
`AcknowledgeByGuid_*`) are all pinned. Three coverage gaps remain
(Worker.Tests-031/032/033), all in new alarm-fallback code paths. The two
newest files (`SyntheticAlarmGuidTests`, `LmxSubtagAlarmSourceTests`) omit an
explicit `using Xunit;` but compile via the `<Using Include="Xunit" />` global
using in the csproj, so that is not a finding.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | No issues found — failover/state-machine/ack tests assert meaningful post-conditions (mode, emitted state, target subtag address) and do not pass for the wrong reason; the prefix-name and unknown-guid negative cases pin the exact-match contract. |
| 2 | mxaccessgw conventions | No issues found — new test methods follow `Method_Scenario_Expectation`; STA-affinity respected (state machine / consumer driven synchronously through internal seams). |
| 3 | Concurrency & thread safety | No issues found — new failover/subtag suites are single-threaded and event-driven; no wall-clock floors or fixed sleeps were introduced (the `MxAccessValueCacheTests` change only deletes the old Worker.Tests-020 comment block). |
| 4 | Error handling & resilience | Issues found: Worker.Tests-032 — the `RunPrimary` `when (ex is not OutOfMemoryException)` filter (the OOM-safe catch path) and the `FailoverSettings` clamp branches are untested. |
| 5 | Security | No issues found — no secrets/credentials; ack-operator identity fields are sentinels. |
| 6 | Performance & resource management | No issues found — `IDisposable` test subjects use `using`; the `LmxSubtagAlarmSource` dispose-idempotency / unadvise-only-advised-handles teardown is regression-tested. |
| 7 | Design-document adherence | No issues found — tests mirror the alarm-fallback plan (degraded flag, synthetic GUID, subtag-ack via ack-comment, single-subscribe primary). |
| 8 | Code organization & conventions | No issues found — new suites live under `MxAccess/`; test doubles are per-file (acceptable for these narrow fakes). |
| 9 | Testing coverage | Issues found: Worker.Tests-031 (`ProbeIntervalSeconds` throttle-active branch never exercised — every test uses `probeIntervalSeconds: 0`), Worker.Tests-033 (`SubtagAlarmStateMachine` ack-while-inactive and priority-subtag branches uncovered). |
| 10 | Documentation & comments | No issues found — test XML docs match assertions; no misleading names observed. |
## 2026-05-24 re-review (commit `42b0037`)
**Re-review: no new findings.** `git diff --name-only d692232..42b0037 -- src/ZB.MOM.WW.MxGateway.Worker.Tests` returns empty — the Worker.Tests module has zero source changes since the previous review. All ten checklist categories therefore inherit "No issues found" from the `d692232` pass. The header is bumped to track the latest reviewed commit; Worker.Tests-001..030 remain closed.
@@ -533,3 +570,48 @@ findings (Worker.Tests-001 through -030) are unaffected.
**Recommendation:** Either (a) reassign `CreateCancelEnvelope` to a sequence value `>` shutdown (or pass the sequence as a parameter, matching `CreateGatewayHelloEnvelope`'s parameter style), so the wire trace reads in ascending order; (b) add an XML-doc note on the cancel test stating that the worker has no inbound monotonicity check and the test ignores envelope sequence ordering; (c) parameterise all four helper methods so each test passes its desired sequence and the literal numbers stop carrying implicit meaning. Option (c) is the cleanest because `CreateGatewayHelloEnvelope` is already parameter-driven for nonce/version.
**Resolution:** 2026-05-20 — Took option (c): parameterised `CreateGatewayHelloEnvelope`/`CreateCommandEnvelope`/`CreateCancelEnvelope`/`CreateShutdownEnvelope` with a `ulong sequence` argument (defaults 1/2/2/3 respectively, matching the typical Hello/Command/Cancel/Shutdown ordering), so the literal sequence values no longer carry implicit meaning. Updated the cancel-correlation test's wire trace to ascend (Hello=1, Cancel=2, Shutdown=3) and added a comment noting that the worker has no inbound monotonicity check — the parameter exists so multi-frame tests can pin the trace ordering explicitly when needed.
### Worker.Tests-031
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Testing coverage |
| Location | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs` (all `FailoverSettings` constructions) |
| Status | Resolved |
**Description:** Every `FailoverSettings` in `FailoverAlarmConsumerTests` is built with `probeIntervalSeconds: 0`, which deliberately *disables* the probe throttle. The throttle-active branch in `FailoverAlarmConsumer.ProbeOnce` (`src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs:211-215`) — where a probe is *skipped* because fewer than `ProbeIntervalSeconds` have elapsed since `lastProbeAtUtc` — is therefore never exercised. This is a genuine production behaviour: the failback cadence is the only thing preventing a degraded worker from hammering the broken primary with a `PollOnce` on every timer tick, and `AlarmCommandHandlerTests.Subscribe_AutoModeWithWatchList_...` wires a real non-zero `FailbackProbeIntervalSeconds = 1` into the handler, so the throttle is on the live path. A regression that inverted the comparison (probing only *after* the interval became `>=` instead of skipping while `<`), dropped the `lastProbeAtUtc` update, or removed the throttle entirely would not be caught by any test. The task brief named "ProbeIntervalSeconds enforcement" as an explicit focus area.
**Recommendation:** Add a test that constructs `FailoverSettings(threshold: 1, probeIntervalSeconds: <N>, stableProbes: 1)` with a non-zero interval, forces failover, makes the primary healthy, then calls `ProbeOnce()` twice in quick succession and asserts the second call did *not* probe (e.g. assert `primary.Polls` advanced by exactly one and `Mode` is still `Subtag`). Because the throttle reads `DateTime.UtcNow` directly, either accept a coarse same-wall-clock-instant assertion (two back-to-back calls reliably fall inside any interval ≥ 1s) or, preferably, refactor `ProbeOnce` to take an injectable clock so the throttle boundary can be pinned deterministically without wall-clock dependence (consistent with the Worker.Tests-020 manual-time-source approach).
**Resolution:** 2026-06-15 — Took the coarse same-wall-clock-instant approach (no production-code clock injection needed). Added `FailoverAlarmConsumerTests.ProbeOnce_WithNonZeroInterval_ThrottlesSecondProbeWithinInterval`: builds `FailoverSettings(threshold: 1, probeIntervalSeconds: 3600, stableProbes: 5)`, forces failover to Subtag, makes the primary healthy, then calls `ProbeOnce()` twice back-to-back. The first probe re-polls the primary (`primary.Polls == 1`); the second falls inside the 3600s interval and is throttled, so `primary.Polls` is unchanged and `Mode` stays `Subtag`. `stableProbes: 5` keeps a single clean probe from failing back, so the throttled `ProbeOnce` path stays in scope. A 1-hour interval makes the two back-to-back calls reliably fall inside the window without any timing flakiness.
### Worker.Tests-032
| Field | Value |
|---|---|
| Severity | Low |
| Category | Error handling & resilience |
| Location | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs` |
| Status | Resolved |
**Description:** Two resilience branches of `FailoverAlarmConsumer` are uncovered by the new tests. (1) `RunPrimary` catches `Exception ex when (ex is not OutOfMemoryException)` (`FailoverAlarmConsumer.cs:295`) — the OOM-safe catch path the task brief explicitly called out. No test throws `OutOfMemoryException` from the primary to verify it *propagates* (rather than being swallowed and counted toward the failover threshold like every other exception); the `FlakyPrimary` fake throws only `COMException`. A regression that broadened the filter to swallow OOM would convert a fatal allocation failure into a silent failover. (2) The `FailoverSettings` constructor clamps `threshold < 1 → 1` and `stableProbes < 1 → 1` (`FailoverSettings.cs:38-40`); no test passes a sub-1 value to confirm the clamp, so a misconfigured `ConsecutiveFailureThreshold = 0` from the gateway could change failover semantics undetected.
**Recommendation:** Add a `FlakyPrimary`-style fake (or a flag on the existing one) that throws `OutOfMemoryException` from `PollOnce`, and assert `sut.PollOnce()` rethrows it via `Assert.Throws<OutOfMemoryException>` and that no `ProviderModeChanged` fired. Add a small `FailoverSettings` fact (or `[Theory]`) asserting `new FailoverSettings(0, 0, 0).Threshold == 1` and `.StableProbes == 1` to pin the clamp.
**Resolution:** 2026-06-15 — Added a `ThrowOutOfMemoryOnPoll` flag to the existing `FlakyPrimary` fake (its `PollOnce` throws `OutOfMemoryException` when set, checked before the `COMException` branch). Regression test `FailoverAlarmConsumerTests.RunPrimary_WhenPrimaryThrowsOutOfMemory_PropagatesAndDoesNotFailOver` drives `PollOnce` through the primary, asserts `Assert.Throws<OutOfMemoryException>`, and asserts no `ProviderModeChanged` fired and `Mode` stays `Alarmmgr` — pinning that the `when (ex is not OutOfMemoryException)` filter lets OOM propagate rather than swallowing it and counting it toward the failover threshold. The clamp is pinned by `FailoverSettings_ClampsSubMinimumValues` (a `[Theory]`): `(0,0,0)→(1,0,1)`, `(-5,-5,-5)→(1,0,1)`, and a pass-through `(3,7,2)→(3,7,2)` to confirm in-range values are not altered.
### Worker.Tests-033
| Field | Value |
|---|---|
| Severity | Low |
| Category | Testing coverage |
| Location | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/SubtagAlarmStateMachineTests.cs` |
| Status | Resolved |
**Description:** `SubtagAlarmStateMachineTests` covers the core transition matrix and the acked latch well, but two branches of the new state machine are unexercised. (1) The ack-while-inactive path in `SubtagAlarmStateMachine.ApplyAcked` (`src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmStateMachine.cs:156-164`): when `.acked` flips true while the alarm is *not* active, the machine must emit nothing and must *not* set `AckedDuringEpisode` — otherwise a stale ack from a prior episode could mis-latch the next raise into a spurious `ACK_RTN`. No test drives an `.acked` change without a preceding active raise. (2) The priority-subtag path (`SubtagRole.Priority` → `state.Priority = CoerceInt(...)`, lines 76-78): `SubtagAlarmConsumerTests.Subscribe_AdvisesAllSubtagsIncludingAckComment` confirms the priority subtag is *advised*, but no test raises a priority value change and asserts it flows into the emitted/snapshot record's `Priority`, so `CoerceInt` and the priority assignment are untested in the state-machine layer.
**Recommendation:** Add (a) `AckedTrueWhileInactive_EmitsNothingAndDoesNotLatch` — apply `.acked=true` with no prior active raise, assert `Apply` returns empty, then raise active and clear and assert the clear emits `UnackRtn` (proving the stale ack did not latch); and (b) `PriorityChange_FlowsIntoEmittedRecord` — apply a priority value then an active raise and assert the emitted record's `Priority` equals the supplied value (and a `CoerceInt` string/garbage case falls back).
**Resolution:** 2026-06-15 — Added both tests to `SubtagAlarmStateMachineTests`. `AckedTrueWhileInactive_EmitsNothingAndDoesNotLatch` applies `.acked=true` with no preceding active raise (asserts `Apply` returns empty), then drives a fresh raise→clear episode and asserts the clear emits `UnackRtn` — proving the stale inactive ack did not latch `AckedDuringEpisode`. `PriorityChange_FlowsIntoEmittedRecord` (the target now includes a `PrioritySubtag`) applies an `int` priority `750` (asserts the priority change emits nothing), raises active and asserts the emitted record's `Priority == 750` (exercising `CoerceInt`'s `int` path and the priority assignment), then applies a non-numeric `"not-a-number"` priority and asserts the snapshot `Priority` is still `750` (the `CoerceInt` string fallback keeps the prior value, not zero).
+76 -2
View File
@@ -4,11 +4,38 @@
|---|---|
| Module | `src/ZB.MOM.WW.MxGateway.Worker` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
## 2026-06-15 re-review (commit `410acc9`)
Re-review of the `42b0037..410acc9` diff — the alarm-provider subtag-fallback
feature (`git diff 42b0037..410acc9 -- src/ZB.MOM.WW.MxGateway.Worker/`). New
substantive code: `SubtagAlarmConsumer`, `SubtagAlarmStateMachine`,
`FailoverAlarmConsumer`, `LmxSubtagAlarmSource`, `SyntheticAlarmGuid`,
`AlarmProviderModeChange`, `FailoverSettings`, `ISubtagAlarmSource` /
`SubtagValueChange`, plus the degraded/`source_provider` propagation in
`AlarmDispatcher` / `MxAccessAlarmEventSink` / `MxAccessEventMapper`, the
`ForcedMode`/watch-list routing and STA-COM-factory threading in
`AlarmCommandHandler` / `MxAccessStaSession`, and the `SubscribeAlarmsCommand`
re-plumb in `MxAccessCommandExecutor`. Three new findings: **Worker-026** (High),
**Worker-027** (Medium), **Worker-028** (Low). Worker-001..025 remain closed.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | No issues found. Subtag synthesis (`SubtagAlarmStateMachine` raise/ack/clear, `AckedDuringEpisode` latch, segment-boundary name derivation), exact-match ack resolution (`ResolveTargetByName` avoids the prefix false-positive), and `MapTransition`'s `Unspecified→*Alm` raise path are all sound. |
| 2 | mxaccessgw conventions | No issues found. The synthesis is worker-side and every degraded record/event carries `degraded=true` + `source_provider=SUBTAG`, satisfying the explicit opt-in non-parity exception to the "never synthesize events" rule. The gateway never instantiates COM. net48 constraint respected — `AlarmProviderModeChange`/`FailoverSettings` are plain classes with get-only ctor-assigned props (no init/positional records); no `WriteRecord`-style init usage introduced. |
| 3 | Concurrency & thread safety | Issue found: Worker-026 (an exception in the failover switch path — `SwitchToStandby`'s priming snapshot or either switch's `ProviderModeChanged` handler — escapes the state machine after `active` has already flipped, killing the STA alarm-poll loop with no mode-changed event). STA affinity itself is sound: `LmxSubtagAlarmSource` owns its own apartment-bound `LMXProxyServerClass`, all consumer calls are STA-confined via `AlarmCommandHandler`'s affinity guard, and `Dispose` UnAdvises before tearing handles down so a late pump callback cannot re-enter. |
| 4 | Error handling & resilience | Issue found: Worker-027 (`SyntheticAlarmGuid` uses `MD5.Create()`, which throws on a net48 FIPS-policy host — breaking every subtag transition stamp and snapshot, and feeding Worker-026's poll-loop-kill path). `FailoverSettings` clamps tunables to safe minimums; `LmxSubtagAlarmSource` teardown is best-effort/idempotent. |
| 5 | Security | No issues found. No secret/credential logging on the alarm path; ack comments are operator-supplied alarm metadata, not secrets. Synthetic GUID is non-cryptographic by design and not a security control. |
| 6 | Performance & resource management | No issues found. `LmxSubtagAlarmSource` releases its COM object via `FinalReleaseComObject` and tracks advised-vs-added handles so `Dispose` only UnAdvises what it advised. The standby is armed once and gated-by-active rather than churning subscribe/unsubscribe per switch. |
| 7 | Design-document adherence | No issues found. Implementation matches `docs/plans/2026-06-13-alarm-subtag-fallback-design.md` (auto-failover/failback, ack-comment-write ack, worker-side synthesis, additive proto fields). The probe re-polls the still-subscribed primary (single-subscribe constraint) as the design's "Superseded" notes describe. |
| 8 | Code organization & conventions | Issue found: Worker-028 (the dup-subtag-address guard in `SubtagAlarmStateMachine.Bind` does not cover duplicate `AlarmFullReference` entries, which silently overwrite in `targetsByReference`/`_statesByReference`). One-public-type-per-file is otherwise respected for the new files. |
| 9 | Testing coverage | No standalone finding. New unit suites exist for each major component (`SubtagAlarmConsumerTests`, `SubtagAlarmStateMachineTests`, `FailoverAlarmConsumerTests`, `LmxSubtagAlarmSourceTests`, `SyntheticAlarmGuidTests`), matching the design's test matrix. The switch-path exception fragility (Worker-026) and the dup-reference case (Worker-028) are untested edge cases noted in those findings. |
| 10 | Documentation & comments | No issues found. The new types carry accurate XML docs; the net48-constraint rationale is documented inline on `FailoverSettings`/`AlarmProviderModeChange`; the "why PollOnce only, no re-Subscribe" and probe-throttle behaviour are documented on `FailoverAlarmConsumer.ProbeOnce`. |
## 2026-05-24 re-review (commit `42b0037`)
**Re-review: no new findings.** `git diff --name-only d692232..42b0037 -- src/ZB.MOM.WW.MxGateway.Worker` returns empty — the Worker module has zero source changes since the previous review. All ten checklist categories therefore inherit "No issues found" from the `d692232` pass. The header is bumped to track the latest reviewed commit; Worker-001..025 remain closed.
@@ -464,3 +491,50 @@ _runtimeSession = _runtimeSessionFactory()
Match the pattern `AlarmCommandHandler.Subscribe` already uses for `consumerFactory()` (`AlarmCommandHandler.cs:76-77`).
**Resolution:** 2026-05-20 — `WorkerPipeSession.RunAsync` now uses `_runtimeSession = _runtimeSessionFactory() ?? throw new InvalidOperationException("Worker runtime session factory returned null.");`, matching the pattern `AlarmCommandHandler.Subscribe` uses for its `consumerFactory()`. A null factory return now produces a clear diagnostic exception at the call site instead of NRE-ing on the next dereference (and the `finally` block's `_runtimeSession?.Dispose()` silently no-oping on a half-initialized session). Regression test `WorkerPipeSessionTests.RunAsync_WhenRuntimeSessionFactoryReturnsNull_ThrowsDiagnosticException` drives `RunAsync` with `() => null!` and asserts the diagnostic `InvalidOperationException` is thrown with the expected message.
### Worker-026
| Field | Value |
|---|---|
| Severity | High |
| Category | Concurrency & thread safety |
| Location | `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs:289-338`, `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessStaSession.cs:307-320` |
| Status | Resolved |
**Description:** `FailoverAlarmConsumer.SwitchToStandby` flips `active = Active.Standby` / `mode = Subtag` first, then calls `_ = standby.SnapshotActiveAlarms();` (the priming side-effect), and only then calls `RaiseModeChanged(...)`. If `standby.SnapshotActiveAlarms()` throws, the exception escapes `SwitchToStandby`, escapes the `catch` in `RunPrimary`, and escapes `FailoverAlarmConsumer.PollOnce`/`Subscribe`. The `SubtagAlarmConsumer.SnapshotActiveAlarms` path is not exception-free: it calls `StampSynthetic` → `SyntheticAlarmGuid.ForReference` (which throws on a FIPS host — see Worker-027) and walks live state. The same exposure exists for `RaiseModeChanged` itself: the attached `AlarmCommandHandler.OnProviderModeChanged` handler runs synchronously and calls `eventQueue.Enqueue(...)`, which throws `MxAccessEventQueueOverflowException` at capacity; that also propagates out of both `SwitchToStandby` and `SwitchToPrimary`.
When this happens the consumer has **already** transitioned `active`/`mode` to Standby (or Primary) but the `ProviderModeChanged` event is never emitted — so the gateway never learns the feed went degraded. Worse, because the failover calls run on the worker's STA inside `RunAlarmPollLoopAsync`, the escaping exception lands in that loop's trailing `catch (Exception)` arm (`MxAccessStaSession.cs:307-320`), which records a single fault and **permanently stops the alarm poll loop**. The standby is then never pumped or probed again — i.e. a transient primary COM fault that should have produced a clean degraded-mode handoff instead produces a total, undetected alarm outage for the session, defeating the entire purpose of the fallback feature. There is no safe operator workaround short of restarting the session.
**Recommendation:** Make the switch atomic and exception-isolated: raise `ProviderModeChanged` (and perform the priming snapshot) inside their own `try`/`catch` so a snapshot or handler failure cannot abort the switch or unwind into the poll loop. Order the state flip so the mode-changed notification is guaranteed to fire even if priming fails (e.g. flip state, raise mode-changed in a guarded block, then attempt the priming snapshot in a separate guarded block whose failure is logged/faulted but non-fatal). Add a regression test where the standby's `SnapshotActiveAlarms` throws on the first call after failover, asserting (a) `ProviderModeChanged` still fires and (b) `PollOnce` does not rethrow.
**Resolution:** 2026-06-15 — Reordered and exception-isolated the failover switch in `FailoverAlarmConsumer`. `SwitchToStandby` now flips `active`/`mode`, then raises `ProviderModeChanged` FIRST (so the gateway always learns the feed went degraded), then primes the standby snapshot via a new `TryPrimeStandbySnapshot()` whose failure is swallowed (`catch when ex is not OutOfMemoryException`) — a priming failure can no longer abort the switch or unwind into the poll loop. `RaiseModeChanged` itself now wraps `ProviderModeChanged?.Invoke` in a `try`/`catch (when ex is not OutOfMemoryException)` so a subscriber handler exception (e.g. `AlarmCommandHandler.OnProviderModeChanged`'s `eventQueue.Enqueue` overflowing) cannot escape `SwitchToStandby`/`SwitchToPrimary` into `RunAlarmPollLoopAsync`'s trailing catch and permanently stop alarm polling. `OutOfMemoryException` is deliberately allowed to propagate. The `MxAccessStaSession` poll-loop arm is unchanged — the fix prevents the escape rather than catching it there. Regression tests in `FailoverAlarmConsumerTests`: `Failover_WhenStandbyPrimingSnapshotThrows_StillRaisesModeChangeAndDoesNotRethrow` (standby `SnapshotActiveAlarms` throws on the priming call → `ProviderModeChanged` still fires, `Mode` is Subtag, `Subscribe`/`PollOnce` do not rethrow) and `Failover_WhenModeChangedHandlerThrows_SwitchStillTakesEffectAndDoesNotRethrow` (a throwing `ProviderModeChanged` subscriber → switch still takes effect, no rethrow).
### Worker-027
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Error handling & resilience |
| Location | `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SyntheticAlarmGuid.cs:38-40` |
| Status | Resolved |
**Description:** `SyntheticAlarmGuid.ForReference` derives the deterministic alarm GUID via `using MD5 md5 = MD5.Create();`. The worker targets .NET Framework 4.8, where `MD5.Create()` returns `MD5CryptoServiceProvider`. When the host has the Windows FIPS-compliance policy enabled (`Enabled=1` under `HKLM\System\CurrentControlSet\Control\Lsa\FIPSAlgorithmPolicy`), the non-validated `MD5CryptoServiceProvider` constructor throws `InvalidOperationException` ("This implementation is not part of the Windows Platform FIPS validated cryptographic algorithms."). `SyntheticAlarmGuid.ForReference` is on the hot path of the subtag fallback: `SubtagAlarmConsumer.StampSynthetic` calls it for **every** synthesized transition and **every** snapshot record. On a FIPS host the subtag fallback therefore throws on first use; combined with Worker-026 that exception kills the STA alarm-poll loop, so the fallback is not merely degraded but completely non-functional exactly when it is needed (after the primary alarmmgr provider has failed). The comment already notes MD5 is "never for security" — the issue is availability under FIPS policy, not cryptographic strength. The regulated deployment hosts (Zimmer) are a plausible FIPS environment.
**Recommendation:** Replace `MD5.Create()` with a FIPS-agnostic non-cryptographic 128-bit hash that does not route through the crypto FIPS gate — e.g. compute the 16 GUID bytes from a stable hash that does not use `System.Security.Cryptography` (a fixed FNV-1a / xxHash-style derivation over the UTF-8 bytes), or use `SHA256` truncated to 16 bytes via the managed `SHA256Managed`/`IncrementalHash` only if confirmed FIPS-safe on net48 (it is not guaranteed — prefer the non-crypto route). The mapping only needs determinism and collision resistance for distinct references, not cryptographic properties. Add a test that exercises `ForReference` without depending on a crypto provider.
**Resolution:** 2026-06-15 — Replaced the `MD5.Create()` derivation in `SyntheticAlarmGuid.ForReference` with a pure-managed FNV-1a hash: two independent 64-bit FNV-1a passes over the UTF-8 bytes (the high pass mixes the byte index into its accumulator to decorrelate the halves) fill the low/high 64 bits of the 128-bit GUID, and the input length is folded in so the empty string is non-degenerate (never `Guid.Empty`). The `using System.Security.Cryptography;` import is gone, so no FIPS-gated `MD5CryptoServiceProvider` is ever constructed — the subtag fallback no longer throws on a FIPS-policy host. The derivation stays deterministic and distinct-per-reference. The existing `SyntheticAlarmGuidTests` (`SameReference_SameGuid`, `DifferentReference_DifferentGuid`, `Reference_ProducesNonEmptyGuid`) pin only those properties — not a specific GUID literal — so they continue to pass unchanged; no test needed a value update. Added regression tests `SyntheticAlarmGuidTests.EmptyReference_ProducesNonEmptyGuid` (length-fold guard against a degenerate all-zero result) and `ForReference_UnderFipsEnforcement_DoesNotThrowAndStaysDeterministic` (sets the managed `UseLegacyFipsThrow` AppContext switch and asserts the derivation still succeeds deterministically; a regression reintroducing a FIPS-gated provider would throw here).
### Worker-028
| Field | Value |
|---|---|
| Severity | Low |
| Category | Code organization & conventions |
| Location | `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmStateMachine.cs:43-52`, `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmConsumer.cs:70-75` |
| Status | Resolved |
**Description:** `SubtagAlarmStateMachine.Bind` throws `ArgumentException` on a duplicate subtag **item address** (the documented dup-address guard), but neither the state machine nor `SubtagAlarmConsumer` guards against a duplicate `AlarmFullReference` in the watch list. When two `AlarmSubtagTarget` entries share an `AlarmFullReference` but use different subtag addresses, `_statesByReference[target.AlarmFullReference] = state` and `targetsByReference[reference] = target` each silently overwrite the earlier entry, while the earlier target's subtag addresses are still bound to an orphaned `AlarmState`. The orphaned state is mutated by incoming value changes but is invisible to `SnapshotActive` (which iterates only the surviving `_statesByReference.Values`) and to ack resolution (which uses the surviving `targetsByReference`). The result is silently inconsistent synthesized state for that reference. This is a watch-list configuration error (the gateway resolves the watch list), so impact is limited, but the asymmetry — addresses are guarded, references are not — is surprising and silent.
**Recommendation:** Add a duplicate-`AlarmFullReference` guard symmetric with the dup-address guard: throw a descriptive `ArgumentException` from the `SubtagAlarmStateMachine` (or `SubtagAlarmConsumer`) constructor when two watch-list entries share a reference, so a misconfigured watch list fails fast at subscribe time rather than producing silently inconsistent state. Cover it with a unit test.
**Resolution:** 2026-06-15 — Added a duplicate-`AlarmFullReference` guard in the `SubtagAlarmStateMachine` constructor symmetric with the existing dup-address guard in `Bind`: before adding each target's `_statesByReference` entry it checks `ContainsKey` (the dictionary is `OrdinalIgnoreCase`, matching the consumer's `targetsByReference` lookup) and throws a descriptive `ArgumentException` ("Duplicate alarm full reference '{reference}' is bound to more than one alarm target."). Because `SubtagAlarmConsumer` constructs the state machine before populating its own `targetsByReference`, this guard fires before the consumer's silent overwrite too, covering both dictionaries from one canonical check. Regression test `SubtagAlarmStateMachineTests.DuplicateAlarmFullReference_Throws` (two targets sharing a reference but using distinct active subtags → `ArgumentException`).
+156
View File
@@ -790,3 +790,159 @@ Post-ack transition: kind=Clear …
10s cadence held throughout; full proto fields populated correctly;
ack registered server-side without errors.
## Subtag-monitoring fallback provider
When the wnwrap alarm-manager source fails, the gateway worker switches to
`SubtagAlarmConsumer` — a synthetic alarm source that advises each alarm
attribute's subtags via the existing MXAccess `AddItem`/`Advise` pipeline and
derives alarm transitions from the resulting value-change stream. This is a
non-parity, degraded-mode source; every transition and snapshot it produces
carries `degraded = true`.
### Watch-list discovery
`GatewayAlarmMonitor` resolves the subtag watch-list at subscribe time by
calling `IAlarmWatchListResolver.GetAlarmAttributesAsync`. The resolver merges:
1. Galaxy Repository SQL (`GetAlarmAttributesAsync`) — objects that have alarm
extensions in the configured area.
2. Config overrides — `IncludeAttributes` adds explicit entries;
`ExcludeAttributes` removes Repository-derived ones. The config list takes
effect even when `UseGalaxyRepository` is `false`.
The resolved list is a set of `AlarmSubtagTarget` messages sent to the worker
inside `SubscribeAlarmsCommand.watch_list`. Each target carries the composed
MXAccess item addresses for the `InAlarm`, `Acked`, `AckMsg`, and `Priority`
subtags (confirmed AVEVA `AlarmExtension` field names, verified against the live
ZB Galaxy `attribute_definition` rows). The gateway re-runs discovery on its
reconcile cadence and pushes an updated watch-list when the model changes.
Each target's canonical `AlarmFullReference` is composed as
`Galaxy!{area}.{reference}` (literal `Galaxy` provider). The `{area}` is the
alarm object's **real Galaxy area** — discovered per object via
`gobject.area_gobject_id` (`GetAlarmAttributesAsync` projects it as `area_name`)
— so the synthesized reference's group matches exactly the area the native
alarmmgr (wnwrap) emits for the same alarm (e.g. `TestMachine_001` in `TestArea`
yields `Galaxy!TestArea.TestMachine_001.TestAlarm001`). The configured
`Discovery.Area` / `DefaultArea` is **only** the fallback for explicit
`IncludeAttributes` entries, which carry no discovered area.
### Subtag advise and `LmxSubtagAlarmSource`
`LmxSubtagAlarmSource` (implements `ISubtagAlarmSource`) owns a separate
`LMXProxyServerClass` instance on the worker STA — it does not share the
session's main MXAccess object. For each watch-list target it calls
`AddItem`/`Advise` on the configured subtag addresses. When a subtag value
changes, it raises `ValueChanged` on the STA and `SubtagAlarmConsumer`
forwards it to `SubtagAlarmStateMachine`.
`PollOnce()` on the subtag consumer is a no-op — the path is event-driven
through `Advise`, not poll-driven.
### Synthesis rules
`SubtagAlarmStateMachine` tracks `(active, acked)` per watch-list entry and
emits `MxAlarmTransitionEvent` records on change:
| Subtag change | Emitted transition | Notes |
|---|---|---|
| `InAlarm` false → true | Raise (`UNACK_ALM`) | `original_raise_timestamp` = first observed active time for this episode |
| `Acked` false → true, while `InAlarm` | Acknowledge (`ACK_ALM`) | `AckedDuringEpisode` latch set |
| `InAlarm` true → false | Clear | `AckRtn` if `AckedDuringEpisode` is set, else `UnackRtn` |
| `Acked` true → false, while `InAlarm` | (none) | Latch is NOT cleared; the episode retains its acknowledged status at clear |
The `AckedDuringEpisode` latch addresses out-of-order subtag delivery:
MXAccess does not guarantee the `Acked = false` update arrives before the
`InAlarm = false` update. The latch ensures a clear always emits `ACK_RTN`
when the alarm was acknowledged at any point during the active episode.
`SnapshotActive()` returns one `MxAlarmSnapshotRecord` per currently-active
alarm. State mapping:
- `InAlarm && !Acked` → `UNACK_ALM`
- `InAlarm && Acked` → `ACK_ALM`
- `!InAlarm` → not included in the snapshot
### Synthetic GUID
The alarmmgr provider supplies a native GUID per alarm record. The subtag
provider has no native GUID. `SubtagAlarmConsumer` derives a deterministic
GUID by hashing `alarm_full_reference` (via `SyntheticAlarmGuid.ForReference`).
The same reference always produces the same GUID within a session, so
GUID-based ack routing resolves correctly. The GUID differs between alarm
references and never matches any AVEVA-internal GUID, whether before or after
a gateway restart.
### Acknowledge in subtag mode
`AlarmDispatcher` routes ack calls by active provider mode:
- **Alarm-manager mode:** `AlarmAckByName` on `wwAlarmConsumerClass` (unchanged).
- **Subtag mode:** `SubtagAlarmConsumer.AcknowledgeByName` resolves the
watch-list entry's `ack_comment_subtag` and issues a `Write(comment)` on
the STA via `LmxSubtagAlarmSource`. Writing the `AckMsg` subtag performs
the acknowledge in AVEVA (`AckMsg` is the confirmed `AlarmExtension` ack-comment
write target).
If the alarm has no writable ack-comment subtag (`AckComment` config key is
empty, or the entry's `ack_comment_subtag` field is empty), the ack call
returns a failure code that the gateway surfaces as `FailedPrecondition`.
`AcknowledgeByGuid` maps the synthetic GUID back to its reference via an
internal dictionary, then calls the same write path.
`SubtagAlarmConsumer.Subscribe` advises the ack-comment subtag alongside the
observed ones (active/acked/priority). This is required: MXAccess rejects a
write to an item that has been added but not advised with `E_INVALIDARG`
("Value does not fall within the expected range"). Advising it at subscribe
time makes it an active item so the later ack write succeeds — its value
changes carry no transition (the state machine ignores unmapped addresses).
### Live validation
The subtag path was validated against live MXAccess on the dev rig
(`DESKTOP-6JL3KKO`, Galaxy `DEV`, `TestMachine_001.TestAlarm001`):
- `…​.InAlarm` → `True` (Boolean), `…​.Acked` → `False` (Boolean),
`…​.Priority` → `500` (Int32), `…​.AckMsg` → string — confirming the field
names **and** the runtime reference shape `<Object>.<AlarmAttr>.<field>`
with **no** intermediate alarm-condition segment.
- `AcknowledgeByName` (AckMsg write) returned `0` once the ack-comment subtag
was advised — confirming the ack-by-comment-write mechanism end to end.
### Fidelity limitations
The following fields are not available or have lower quality in subtag mode:
| Field | Subtag-mode behavior |
|-------|---------------------|
| `alarm_guid` | Synthetic deterministic GUID from `alarm_full_reference`; not an AVEVA-native GUID |
| `original_raise_timestamp` | First observed `active = true` time; no AVEVA-native raise time |
| `transition_timestamp` | `OnDataChange` source timestamp from MXAccess |
| `severity` | From priority subtag if advised; 0 otherwise |
| `category` / `description` | Not populated (no subtag for these) |
| `current_value` / `limit_value` | Not populated unless corresponding subtags are in the watch-list |
| `alarm_type_name` | Not populated |
| `operator_user` / `operator_comment` | Not populated on synthesized raise/clear transitions |
| `retrigger` transition | Not synthesized (no re-alarm counter subtag is observed) |
Every transition and snapshot record carries `degraded = true` and
`source_provider = ALARM_PROVIDER_MODE_SUBTAG`. Clients that require full
fidelity must wait for failback to the alarm manager.
### Provider mode reflection
When `FailoverAlarmConsumer` switches between providers, it raises
`ProviderModeChanged`. `AlarmDispatcher` enqueues an
`OnAlarmProviderModeChangedEvent` (carried as an `MxEvent`), which the
gateway receives and reflects into:
- `AlarmFeedMessage.provider_status` emitted to every `StreamAlarms`
subscriber.
- The `/hubs/alarms` SignalR hub for the dashboard.
- Metrics: `mxgateway.alarms.provider_mode` gauge and
`mxgateway.alarms.provider_switches` counter.
On every switch `GatewayAlarmMonitor` also forces a reconcile
(`QueryActiveAlarms`) against the now-active provider so the gateway cache
reflects the post-switch state without a spurious raise/clear storm.
+52
View File
@@ -411,6 +411,58 @@ a per-channel skip-verify hook:
See [Gateway Configuration — Automatic self-signed certificate](./GatewayConfiguration.md#automatic-self-signed-certificate)
and the per-client READMEs for the as-built behavior.
## Alarm-Manager to Subtag Fallback
Decision: add a second alarm provider (subtag monitoring) that the worker
activates automatically when the native wnwrap alarm manager fails, and from
which it fails back to the alarm manager automatically when the manager
recovers.
### Worker-side synthesis
Synthesis of alarm transitions from subtag value changes happens entirely in
the worker (`SubtagAlarmConsumer` / `SubtagAlarmStateMachine`). The gateway
still forwards only events the worker emits and synthesizes nothing itself.
This satisfies the parity rule even though the subtag path is inherently
non-parity: the parity rule governs where synthesis lives, not whether
synthesis is permitted when the native source is unavailable.
### Degraded is explicit
Every subtag-mode transition carries `degraded = true` on the
`OnAlarmTransitionEvent` and `ActiveAlarmSnapshot` proto messages, and the
`AlarmFeedMessage` feed carries an `AlarmProviderStatus` payload on stream
open and on every switch. No client can mistake a subtag-mode alarm for an
authoritative alarmmgr record. Subtag mode has lower fidelity: synthetic
deterministic GUID (SHA-derived from the alarm reference), best-effort
original-raise timestamp, narrower field set. Clients that need full fidelity
must wait for failback.
### Failover trigger
The failover trigger is N consecutive wnwrap COM failures — a `COMException`
thrown by `Subscribe` or `PollOnce`, or a failure HRESULT from
`GetXmlCurrentAlarms2`. A single poll failure does not trigger a switch; the
threshold (default 3, floored at 1) guards against transient COM hiccups. The
counter resets on any clean poll so a flapping provider does not permanently
latch in subtag mode.
### Acknowledge via ack-comment write
In subtag mode, `AcknowledgeAlarm` writes the operator comment to the alarm
attribute's ack-comment subtag (`Fallback:Subtags:AckComment`). The write
performs the native ack in AVEVA. This differs from alarmmgr mode, where
`AlarmAckByName` on `wwAlarmConsumerClass` is called directly. The `AckComment`
subtag name defaults to `AckMsg`; when it is set to empty, the ack-comment write
path is disabled and ack does not work in subtag mode. The exact AVEVA subtag
names are not hard-coded — the `Subtags` config block exists precisely so names
are not guessed without validation against the live MXAccess attribute set.
### Related documentation
- [Gateway Configuration — Alarm Fallback options](./GatewayConfiguration.md#alarm-fallback-options)
- [Alarm Client Discovery — Subtag provider](./AlarmClientDiscovery.md)
- [gRPC Contract — provider_status and degraded fields](./Grpc.md)
## Later Revisit Items
These are explicit post-v1 revisit items, not open blockers:
+69
View File
@@ -230,6 +230,75 @@ behavior.
The alarm monitor is independent of client sessions: `AcknowledgeAlarm` and
`StreamAlarms` are session-less RPCs served by the monitor.
### Alarm fallback options
The `Fallback` sub-section controls how the alarm feed selects between the
native wnwrap alarm-manager provider and the subtag-monitoring fallback.
| Option | Default | Description |
|--------|---------|-------------|
| `MxGateway:Alarms:Fallback:Mode` | `Auto` | Provider selection mode. `Auto` uses the alarm manager as primary and fails over to subtag monitoring after consecutive COM failures, then fails back automatically. `ForceAlarmManager` disables failover. `ForceSubtag` forces subtag monitoring on from startup. Values are case-insensitive. |
| `MxGateway:Alarms:Fallback:ConsecutiveFailureThreshold` | `3` | Number of consecutive wnwrap COM failures (`COMException` or failure HRESULT from `Subscribe` / `GetXmlCurrentAlarms2`) before the monitor switches to subtag mode. Floored at 1. |
| `MxGateway:Alarms:Fallback:FailbackProbeIntervalSeconds` | `30` | While in subtag mode, how often (in seconds) the monitor probes the wnwrap provider to detect recovery. Floored at 1. |
| `MxGateway:Alarms:Fallback:FailbackStableProbes` | `3` | Number of consecutive clean wnwrap probes required before the monitor switches back to the alarm manager. Floored at 1. |
| `MxGateway:Alarms:Fallback:Discovery:UseGalaxyRepository` | `true` | When `true`, the monitor queries the Galaxy Repository SQL database to build the subtag watch-list for the configured area. |
| `MxGateway:Alarms:Fallback:Discovery:Area` | _(empty)_ | Galaxy area to scope the Repository query to. Falls back to `MxGateway:Alarms:DefaultArea` when empty. Ignored when `UseGalaxyRepository` is `false`. This area is **not** used to compose a Repository-derived alarm's canonical `Galaxy!{area}.{reference}`: each discovered alarm uses its object's real Galaxy area (discovered via `gobject.area_gobject_id`), so the reference's group matches what the native alarmmgr emits. `Discovery:Area` / `DefaultArea` is used as the composition area only for explicit `IncludeAttributes` entries, which carry no discovered area. |
| `MxGateway:Alarms:Fallback:Discovery:IncludeAttributes` | _(empty)_ | Explicit MXAccess attribute paths to add to the subtag watch-list, supplementing (or replacing, when `UseGalaxyRepository` is `false`) the Repository-derived list. |
| `MxGateway:Alarms:Fallback:Discovery:ExcludeAttributes` | _(empty)_ | Attribute paths to remove from the merged watch-list (case-insensitive). The exclude is applied after the Repository-derived rows and the explicit `IncludeAttributes` entries are combined, so an exclude that matches an explicit include suppresses it too — excludes win. Ignored when `UseGalaxyRepository` is `false`. |
| `MxGateway:Alarms:Fallback:Subtags:Active` | `InAlarm` | Subtag name for the in-alarm boolean. Confirmed AVEVA `AlarmExtension` field name. |
| `MxGateway:Alarms:Fallback:Subtags:Acked` | `Acked` | Subtag name for the acknowledged boolean. Confirmed AVEVA `AlarmExtension` field name. |
| `MxGateway:Alarms:Fallback:Subtags:AckComment` | `AckMsg` | Subtag name for the acknowledgement comment write target. Writing this subtag performs the acknowledge in AVEVA. Confirmed AVEVA `AlarmExtension` field name. When empty, the ack-comment write path is disabled. |
| `MxGateway:Alarms:Fallback:Subtags:Priority` | `Priority` | Subtag name for the alarm priority / severity value. Confirmed AVEVA `AlarmExtension` field name. |
Validation rules:
- `Mode` must be `Auto`, `ForceAlarmManager`, or `ForceSubtag` (case-insensitive).
- `Mode = ForceSubtag` with both `UseGalaxyRepository = false` and an empty
`IncludeAttributes` list produces a startup validation warning: the subtag
provider has no attributes to advise.
- `ConsecutiveFailureThreshold`, `FailbackProbeIntervalSeconds`, and
`FailbackStableProbes` are floored at 1 by `GatewayOptionsValidator`.
Full example with non-default fallback settings:
```json
{
"MxGateway": {
"Alarms": {
"Enabled": true,
"SubscriptionExpression": "\\\\SCADA01\\Galaxy!PlantArea",
"DefaultArea": "PlantArea",
"ReconcileIntervalSeconds": 30,
"Fallback": {
"Mode": "Auto",
"ConsecutiveFailureThreshold": 3,
"FailbackProbeIntervalSeconds": 30,
"FailbackStableProbes": 3,
"Discovery": {
"UseGalaxyRepository": true,
"Area": "",
"IncludeAttributes": [],
"ExcludeAttributes": []
},
"Subtags": {
"Active": "InAlarm",
"Acked": "Acked",
"AckComment": "AckMsg",
"Priority": "Priority"
}
}
}
}
}
```
The defaults (`InAlarm`/`Acked`/`AckMsg`/`Priority`) are the confirmed AVEVA
`AlarmExtension` primitive field names, verified by querying the live ZB Galaxy
`attribute_definition` rows. The `Subtags` block exists so names can be
overridden without a code change if a site's alarm template uses different
attribute names. See `docs/AlarmClientDiscovery.md` for the synthesis rules that
depend on these names.
## Host Endpoints and Transport Security (Kestrel)
The listening endpoints are **not** part of the `MxGateway` section. The gateway
+12 -7
View File
@@ -215,13 +215,18 @@ beyond "LDAP is up." See the "Adding a gw-specific group" section of
`glauth.md` for the provisioning step that adds `GwAdmin` and grants it to
`admin`.
The suite covers both the success path and the `DashboardAuthenticator` failure
branches: `admin` whose LDAP groups resolve to the `Admin` role succeeds and
emits the role claim; `readonly` is denied because no group in their `memberOf`
appears in `GroupToRole`; `admin` with a wrong password is rejected by the
candidate bind without leaking the password into `FailureMessage`; an unknown
username yields no candidate; and an unreachable LDAP server is absorbed into a
failed result rather than throwing.
`DashboardAuthenticator` delegates the LDAP bind and group search to the shared
`ZB.MOM.WW.Auth.Ldap` provider (`LdapAuthService`) and only maps the resulting
groups to dashboard roles via `DashboardGroupRoleMapper`; the bind/search
mechanics that decide each outcome live in that shared provider, not in
`DashboardAuthenticator`.
The suite covers both the success path and the failure outcomes: `admin` whose
LDAP groups resolve to the `Admin` role succeeds and emits the role claim;
`readonly` is denied because no group in their `memberOf` appears in
`GroupToRole`; `admin` with a wrong password fails authentication without leaking
the password into `FailureMessage`; an unknown username fails authentication; and
an unreachable LDAP server is absorbed into a failed result rather than throwing.
Run the LDAP live tests explicitly:
+67
View File
@@ -94,6 +94,73 @@ Carrying the enqueue timestamp into the worker layer is what lets queue-wait tim
`StreamAlarms` is a server-streaming, **session-less** RPC that attaches to the gateway's central alarm feed. The handler delegates to `IGatewayAlarmService.StreamAsync`. The stream opens with one `AlarmFeedMessage` carrying an `active_alarm` per currently-active alarm (the ConditionRefresh snapshot), then a single `snapshot_complete`, then a `transition` for every subsequent raise / acknowledge / clear. It is served by the always-on `GatewayAlarmMonitor`, which owns a single gateway-managed worker session and fans out to every attached client — clients no longer open a session of their own. `alarm_filter_prefix`, when set, scopes the stream to a sub-tree.
#### Provider status on the alarm feed
`AlarmFeedMessage` has a fourth `payload` case, `provider_status`, carrying
an `AlarmProviderStatus` message:
```protobuf
message AlarmProviderStatus {
AlarmProviderMode mode = 1;
bool degraded = 2; // true whenever mode == SUBTAG
string reason = 3; // human-readable switch reason
google.protobuf.Timestamp since = 4;
}
```
The gateway emits `provider_status` once when a client first subscribes
(immediately after the initial snapshot and before the first live transition)
and again on every failover or failback. A late-joining client therefore
always learns the current provider mode without waiting for the next switch.
`AlarmProviderMode` is an enum with three values:
| Value | Meaning |
|-------|---------|
| `ALARM_PROVIDER_MODE_UNSPECIFIED` (0) | Default / unset |
| `ALARM_PROVIDER_MODE_ALARMMGR` (1) | Native wnwrap alarm-manager source |
| `ALARM_PROVIDER_MODE_SUBTAG` (2) | Subtag-monitoring fallback (degraded) |
#### Degraded and source-provider fields on transitions and snapshots
`OnAlarmTransitionEvent` and `ActiveAlarmSnapshot` both carry two new fields:
- `bool degraded` (field 14) — `true` when the record came from the subtag
fallback, not the native alarmmgr.
- `AlarmProviderMode source_provider` (field 15) — which provider produced
this record (`ALARMMGR` or `SUBTAG`).
Both fields hold their proto3 default values (`false` / `UNSPECIFIED`) in
alarmmgr mode, so existing clients that do not read them continue to function
without change.
Clients that care about provenance — for example, an OPC UA server that
applies different quality flags to degraded alarms — should inspect `degraded`
before consuming the transition.
Subtag-mode records are a non-parity source. They carry synthetic GUIDs,
best-effort timestamps, and reduced field coverage. See
`docs/AlarmClientDiscovery.md` for the full fidelity table.
#### Provider-mode-changed event
The worker emits `OnAlarmProviderModeChangedEvent` (family
`MX_EVENT_FAMILY_ON_ALARM_PROVIDER_MODE_CHANGED`) on each switch between
providers:
```protobuf
message OnAlarmProviderModeChangedEvent {
AlarmProviderMode mode = 1;
string reason = 2;
int32 hresult = 3; // COM HRESULT that triggered failover; 0 on failback
google.protobuf.Timestamp at = 4;
}
```
This event arrives on the `StreamEvents` stream of the alarm monitor's
internal gateway session (not on client sessions). `GatewayAlarmMonitor`
consumes it and reflects the new mode into the `StreamAlarms` feed's
`provider_status`, the dashboard hub, and metrics. Client sessions do not
receive this event directly.
## Validation Rules
`MxAccessGrpcRequestValidator` rejects requests with `StatusCode.InvalidArgument` before any session work happens. The rules are intentionally narrow — anything that requires session state (for example, "session does not exist") is left for `ISessionManager` so the validator can stay synchronous and side-effect free.
@@ -0,0 +1,316 @@
# Alarm Subtag-Monitoring Fallback — Design
**Date:** 2026-06-13
**Status:** Superseded by implementation (merged to `main`). This is the original
brainstorming design; a few details below were refined during implementation —
see the inline **Superseded** notes. The shipped behaviour is documented in
`docs/AlarmClientDiscovery.md`, the client READMEs, and the contracts.
**Branch:** `feat/alarm-subtag-fallback`
## Problem
The gateway's central alarm feed (`GatewayAlarmMonitor` → worker
`WnWrapAlarmConsumer`) depends on the AVEVA wnwrap COM consumer
(`WNWRAPCONSUMERLib.wwAlarmConsumerClass`), which polls `GetXmlCurrentAlarms2`
on the worker STA. That provider can fail at the COM boundary (the older
`aaAlarmManagedClient` crashed on FILETIME marshaling; wnwrap can still return
failure HRESULTs or throw `COMException`). When it does, the gateway loses all
alarm visibility.
This design adds a **second alarm source** — direct monitoring of each alarm
attribute's subtags (`.active`, `.acked`, …) via the existing MXAccess
`AddItem`/`Advise` pipeline — and **fails over to it automatically when the
wnwrap provider breaks, then fails back automatically when it recovers**. The
subtag source can also be forced on by config.
## Decisions (locked during brainstorming)
| Decision | Choice |
|---|---|
| Failover model | **Auto-failover + auto-failback** (both directions, runtime) |
| Watch-list source | **Galaxy Repository SQL discovery + config override** |
| Acknowledge in subtag mode | **Write the operator comment to the alarm's ack-comment subtag** (the write performs the ack) |
| Failure signal | **N consecutive wnwrap COM failures** (Subscribe / `GetXmlCurrentAlarms2` throws or returns a failure HRESULT) |
| Degraded-state visibility | **Both** — explicit field in the gRPC contract **and** dashboard + metrics |
| Synthesis location | **Worker-side** (`Approach A`) — keeps the parity rule "the gateway forwards only events the worker emits; it never synthesizes events" |
## Core principle
Subtag monitoring is, by definition, a **non-parity, lower-fidelity** alarm
source: it synthesizes alarm transitions from raw data changes, has no native
alarm GUID, no native original-raise timestamp, and a narrower field set. Per
`CLAUDE.md`, synthesizing events is allowed only as an explicit opt-in
non-parity mode. This design satisfies that by (a) doing the synthesis **inside
the worker** (so the gateway still only forwards worker-emitted events) and
(b) marking every degraded event and the whole feed as degraded so no client
mistakes it for the authoritative alarmmgr feed.
## Architecture
```
GATEWAY (.NET 10, x64)
┌─────────────────────────────────────────────────────────────────┐
│ GatewayAlarmMonitor (BackgroundService) │
│ • resolves watch-list: Galaxy Repository SQL + config override │
│ • arms the worker with the watch-list at subscribe time │
│ • consumes AlarmProviderModeChanged → reflects mode into feed, │
│ /hubs/alarms dashboard hub, and metrics │
│ • forces a cache reconcile (QueryActiveAlarms) on every switch │
└───────────────────────────────┬───────────────────────────────────┘
│ IPC (WorkerEnvelope frames)
│ · SubscribeAlarms{ watch_list, failover cfg }
│ · AlarmProviderModeChanged{ mode, reason, hresult }
│ · OnAlarmTransitionEvent (degraded flag set in subtag mode)
WORKER (.NET FW 4.8, x86, STA)
┌─────────────────────────────────────────────────────────────────┐
│ AlarmDispatcher → FailoverAlarmConsumer : IMxAccessAlarmConsumer │
│ ├─ primary : WnWrapAlarmConsumer (wnwrap COM poll, unchanged) │
│ └─ standby : SubtagAlarmConsumer (AddItem/Advise on subtags) │
│ │
│ FailoverAlarmConsumer owns the state machine: │
│ PrimaryActive ──(N consecutive wnwrap COM failures)──▶ Degraded │
│ Degraded ──(M consecutive clean wnwrap probe polls)──▶ Primary │
│ on each switch: snapshot the now-active provider, hand off │
└─────────────────────────────────────────────────────────────────┘
```
The failover state machine lives **worker-local** so the switch is instant — no
IPC round-trip at the moment alarmmgr dies. The gateway *arms* the standby
consumer up front (passes the watch-list at subscribe time) so it is ready
before it is ever needed.
## Components
### Worker (`src/ZB.MOM.WW.MxGateway.Worker/MxAccess/`)
**`SubtagAlarmConsumer : IMxAccessAlarmConsumer` (new)** — the standby provider.
- On `Subscribe`, instead of wnwrap registration it `AddItem`/`Advise`s the
configured subtags for each watch-list entry on the existing STA (reuses the
worker's item-subscription machinery). Per attribute it advises at minimum
`.active` and `.acked`; optionally `.priority`/severity, `.descr`, value/limit
if present.
- Converts each `OnDataChange` into the same `MxAlarmTransitionEvent` the wnwrap
consumer emits, via the synthesis rules below, and raises
`AlarmTransitionEmitted`. Marks each as **degraded**.
- `SnapshotActiveAlarms()` returns the currently-active set computed from
last-known subtag values.
- `AcknowledgeByName(...)` resolves the watch-list entry's ack-comment subtag and
issues a `Write(comment)` on the STA. `AcknowledgeByGuid(...)` maps the
synthetic GUID (see below) back to a reference, then does the same. If the
attribute exposes no writable ack-comment subtag, returns a failure code that
the gateway surfaces as `FailedPrecondition`.
- `PollOnce()` is a no-op (subtag mode is event-driven via Advise).
**`FailoverAlarmConsumer : IMxAccessAlarmConsumer` (new)** — composite + state
machine. Owns the wnwrap consumer (primary) and the subtag consumer (standby),
forwards `AlarmTransitionEmitted` from whichever child is active, and raises a
new `ProviderModeChanged` event on every switch.
- **Failure counting:** wraps `Subscribe`/`PollOnce` on the primary; a thrown
`COMException` or a failure HRESULT increments a consecutive-failure counter,
reset to zero on any clean poll.
- **Failover** (`PrimaryActive → Degraded`): at `ConsecutiveFailureThreshold`
(default 3), ensures the standby is subscribed (it was armed at startup), sets
active = standby, snapshots the standby's active set for hand-off, and emits
`ProviderModeChanged(SUBTAG, reason, hresult)`.
- **Failback probe** (`Degraded → PrimaryActive`): while degraded, every
`FailbackProbeIntervalSeconds` (default 30) it re-attempts wnwrap
`Subscribe`+`PollOnce` on the STA. After `FailbackStableProbes` (default 3)
consecutive clean polls it switches active = primary, returns the standby to
standby, and emits `ProviderModeChanged(ALARMMGR, "recovered")`.
- **Hand-off:** on every switch it takes `SnapshotActiveAlarms()` from the
now-active provider so the gateway can reconcile and avoid spurious
raise/clear storms.
**`AlarmDispatcher` / `MxAccessAlarmEventSink` / `AlarmCommandHandler`
(changed, minimal)** — `AlarmDispatcher` holds a `FailoverAlarmConsumer` instead
of a bare `WnWrapAlarmConsumer`; it subscribes to `ProviderModeChanged` and
enqueues a mode-changed worker event. The ack path routes by active mode (native
wnwrap ack in alarmmgr mode; ack-comment write in subtag mode), but that routing
is entirely inside the consumer — the dispatcher just calls
`AcknowledgeByName`/`AcknowledgeByGuid`.
### Gateway (`src/ZB.MOM.WW.MxGateway.Server/`)
**Galaxy Repository discovery (new query)** — alongside the existing GR SQL
browse RPCs, a query "attributes that have alarms configured, with their
ack-comment subtag and area", scoped to the configured area. Merged with the
config override (explicit includes/excludes). Produces the watch-list of
`AlarmSubtagTarget`s.
**`GatewayAlarmMonitor` (changed)** — resolves the watch-list at subscribe time
and passes it to the worker; consumes `AlarmProviderModeChanged` and reflects
the current provider mode into (a) the `AlarmFeedMessage` provider-status,
(b) the `/hubs/alarms` dashboard hub, and (c) metrics; forces a reconcile
(`QueryActiveAlarms`) on every switch. Re-runs discovery on its existing
reconcile cadence and pushes an updated watch-list when the model changes.
**`AlarmsOptions` (extended)** — new `Fallback` sub-section (below).
### Contract (`src/ZB.MOM.WW.MxGateway.Contracts/Protos/`)
**`mxaccess_gateway.proto`:**
- `enum AlarmProviderMode { ALARM_PROVIDER_MODE_UNSPECIFIED = 0; ALARMMGR = 1; SUBTAG = 2; }`
- New `AlarmFeedMessage` oneof case `AlarmProviderStatus provider_status`,
carrying `{ AlarmProviderMode mode; bool degraded; string reason;
google.protobuf.Timestamp since; }`. Emitted on stream open and on every
change so a late-joining client immediately learns the mode.
- Add `bool degraded` + `AlarmProviderMode source_provider` to
`OnAlarmTransitionEvent` **and** `ActiveAlarmSnapshot`, so per-item provenance
is visible even mid-stream. All additions are new field numbers — backward
compatible; existing clients ignore them and keep seeing alarms.
**`mxaccess_worker.proto`:**
> **Superseded:** these additions shipped in `mxaccess_gateway.proto`, not
> `mxaccess_worker.proto` — the worker imports the gateway proto and the alarm
> commands/events live there (`AlarmSubtagTarget`,
> `OnAlarmProviderModeChangedEvent`, the extended subscribe command).
- Extend the alarm-subscribe command with: `AlarmProviderMode forced_mode`
(`UNSPECIFIED` = auto), `int32 consecutive_failure_threshold`,
`int32 failback_probe_interval_seconds`, `int32 failback_stable_probes`, and
`repeated AlarmSubtagTarget watch_list`, where `AlarmSubtagTarget =
{ string alarm_full_reference; string source_object_reference;
string active_subtag; string acked_subtag; string ack_comment_subtag;
string priority_subtag; }`.
- New worker→gateway event `AlarmProviderModeChanged { AlarmProviderMode mode;
string reason; int32 hresult; google.protobuf.Timestamp at; }`.
> Generated code under `Generated/` and `clients/*/generated*/` is rebuilt from
> these `.proto` files — never hand-edited. Every generated client touched by
> the contract is rebuilt per the source-update workflow.
## Data flow
### Subtag synthesis rules
`SubtagAlarmConsumer` keeps last-known `(active, acked)` per watch-list entry and
emits transitions on change:
| Subtag change | Emitted transition | Notes |
|---|---|---|
| `active` false → true | `RAISE` (state `UNACK_ALM`) | `original_raise_timestamp` = first-observed active time |
| `acked` false → true while `active` | `ACKNOWLEDGE` | `operator_user`/`operator_comment` from ack-comment subtag if advised |
| `active` true → false | `CLEAR` | maps to `AckRtn` if acked at clear, else `UnackRtn` |
| `active` stays true, re-alarm | `RETRIGGER` | **only** if a re-alarm counter subtag exists; otherwise not synthesized (documented limitation) |
Snapshot state mapping for `ActiveAlarmSnapshot.current_state`:
`active && !acked → ACTIVE`, `active && acked → ACTIVE_ACKED`,
`!active → INACTIVE`.
Field degradation in subtag mode:
- `alarm_full_reference` — from the watch-list entry (stable, drives ack-by-ref).
- Synthetic, deterministic GUID derived by hashing `alarm_full_reference` so
GUID-based ack still resolves; flagged `degraded = true`.
- `severity` — from the priority subtag if advised, else 0.
- `original_raise_timestamp` — first-observed active time (best effort).
- `transition_timestamp` — the `OnDataChange` timestamp.
- `category`/`description`/`current_value`/`limit_value` — populated only if the
corresponding subtag is advised; otherwise empty.
### Acknowledge
`AcknowledgeAlarm`/`AcknowledgeAlarmByName` are unchanged at the RPC surface.
`AlarmDispatcher` routes by active provider mode:
- **alarmmgr mode:** native wnwrap `AlarmAckByName`/`AlarmAckByGUID` (unchanged).
- **subtag mode:** resolve the target's `ack_comment_subtag`, `Write` the
operator comment via the existing worker write path on the STA. No writable
ack-comment subtag → `FailedPrecondition`.
### Provider-mode reflection
Worker `AlarmProviderModeChanged` → `GatewayAlarmMonitor` → (a) emit/refresh
`AlarmFeedMessage.provider_status` to every `StreamAlarms` subscriber, (b) push
to `/hubs/alarms`, (c) update metrics, (d) force a reconcile.
## Error handling
- **Both providers down** (subtag advise also failing): the monitor stays
faulted and keeps retrying both; acknowledge returns `Unavailable`. No silent
data loss — the feed reports degraded with reason.
- **Empty watch-list in subtag mode** (GR SQL unavailable, no config override):
log + metric `alarm_fallback_watchlist_empty`; the feed reports degraded +
empty; the gateway keeps re-running discovery on its reconcile cadence and
pushes an updated watch-list when one becomes available.
- **Switch hand-off:** every switch snapshots the now-active provider and
reconciles against the gateway cache to avoid a raise/clear storm.
- **STA affinity:** all subtag advise/write and wnwrap probe calls run on the
worker STA (reuse the existing affinity guard) to satisfy
`ThreadingModel=Apartment`.
### Metrics
- `mxgateway_alarm_provider_mode` (gauge: 1 = alarmmgr, 2 = subtag)
- `mxgateway_alarm_provider_switch_total{from,to,reason}` (counter)
- `mxgateway_alarm_fallback_watchlist_size` (gauge)
> **Superseded:** the shipped meter names are `mxgateway.alarms.provider_mode`
> (gauge) and `mxgateway.alarms.provider_switches{from,to,reason}` (counter,
> `reason` bounded to `failover`/`failback`/`unknown`). The watch-list-size /
> watch-list-empty gauges were not implemented; an empty watch-list is surfaced
> via a warning log and the feed's degraded `ProviderStatus` instead.
## Configuration
```jsonc
"MxGateway": {
"Alarms": {
"Enabled": true,
"SubscriptionExpression": "\\\\DESKTOP-6JL3KKO\\Galaxy!DEV",
"DefaultArea": "DEV",
"ReconcileIntervalSeconds": 30,
"Fallback": {
"Mode": "Auto", // Auto | ForceAlarmManager | ForceSubtag
"ConsecutiveFailureThreshold": 3,
"FailbackProbeIntervalSeconds": 30,
"FailbackStableProbes": 3,
"Discovery": {
"UseGalaxyRepository": true,
"Area": "", // defaults to Alarms.DefaultArea
"IncludeAttributes": [], // explicit additions
"ExcludeAttributes": []
},
"Subtags": {
"Active": "active",
"Acked": "acked",
"AckComment": "", // verified against MXAccess analysis
"Priority": "priority"
}
}
}
}
```
`GatewayOptionsValidator` additions: `Mode = ForceSubtag` with empty discovery
result and no explicit `IncludeAttributes` → startup validation warning;
threshold/interval/probe values floored at sane minimums.
## Open item to confirm during implementation
The exact AVEVA subtag names (`.active`, `.acked`, the ack-comment attribute,
priority) must be confirmed against the MXAccess analysis project
(`C:\Users\dohertj2\Desktop\mxaccess`, `docs/MXAccess-Public-API.md`) and the
live Galaxy before wiring `SubtagAlarmConsumer`. The config `Subtags` block
exists precisely so the resolved names are not hard-coded.
## Testing
| Layer | Tests |
|---|---|
| Worker unit (`MxGateway.Worker.Tests`, x86) | `SubtagAlarmConsumer` synthesis — feed `OnDataChange` sequences, assert raise/ack/clear transitions, snapshot states, degraded flag, synthetic-GUID stability, ack-comment write routing |
| Worker unit | `FailoverAlarmConsumer` state machine — fake wnwrap throwing after K polls: assert switch at threshold, failback after stable probes, `ProviderModeChanged` emitted, no duplicate transitions across switch (hand-off reconcile) |
| Gateway unit (`MxGateway.Tests`, fake worker) | discovery + config-override merge; `GatewayAlarmMonitor` reflects mode into feed + hub; metrics increment on switch |
| Contract | proto round-trip for new fields; existing alarm tests unchanged (alarmmgr-mode regression — parity preserved) |
| Live (opt-in, `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1`) | real subtag advise + ack-comment write against a live alarm; GR SQL discovery query against the `ZB` DB (gated like existing GR tests) |
## Docs to update in the same change
`gateway.md` (alarm provider section), `docs/DesignDecisions.md` (record the
fallback decision), `docs/GatewayConfiguration.md` (the `Fallback` block),
`docs/AlarmClientDiscovery.md` (subtag provider + synthesis rules),
`docs/Grpc.md` (the new `provider_status` / `degraded` fields), and any client
READMEs whose generated alarm types gain fields.
@@ -0,0 +1,860 @@
# Alarm Subtag-Monitoring Fallback — Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans (or subagent-driven-development) to implement this plan task-by-task.
**Goal:** Add a second alarm source — direct MXAccess subtag monitoring — that the gateway auto-fails-over to when the wnwrap alarmmgr provider breaks, auto-fails-back from (to alarmmgr) when the provider recovers, and that can be forced on by config.
**Architecture:** Worker-side synthesis (parity rule preserved). A new `SubtagAlarmConsumer` (own `LMXProxyServerClass`, `AddItem`/`Advise` on alarm subtags) and a `FailoverAlarmConsumer` composite (state machine over the wnwrap primary + subtag standby) both implement the existing `IMxAccessAlarmConsumer` seam. The gateway resolves the subtag watch-list (Galaxy Repository SQL + config override), arms the worker at subscribe time, and reflects the live provider mode into the gRPC alarm feed, the dashboard hub, and metrics.
**Tech Stack:** .NET 10 (gateway, x64) + .NET Framework 4.8 (worker, x86, STA), protobuf/gRPC, `Microsoft.Data.SqlClient` (Galaxy Repository), SignalR (dashboard), `System.Diagnostics.Metrics`, xUnit (plain `Assert`, no FluentAssertions).
**Design source:** `docs/plans/2026-06-13-alarm-subtag-fallback-design.md`
**Branch:** `feat/alarm-subtag-fallback` (already created)
---
## Conventions for every task
- **TDD:** write the failing test, run it red, implement, run it green, commit.
- **xUnit, plain `Assert.*`**, naming `Subject_Condition_Expected`. Worker fakes are sealed private nested classes that raise events.
- **Build/test commands:**
- Contracts regen: `dotnet build src/ZB.MOM.WW.MxGateway.Contracts/ZB.MOM.WW.MxGateway.Contracts.csproj`
- Gateway: `dotnet build src/ZB.MOM.WW.MxGateway.Server` ; `dotnet test src/ZB.MOM.WW.MxGateway.Tests/ZB.MOM.WW.MxGateway.Tests.csproj`
- Worker (x86): `dotnet build src/ZB.MOM.WW.MxGateway.Worker/ZB.MOM.WW.MxGateway.Worker.csproj -p:Platform=x86` ; `dotnet test src/ZB.MOM.WW.MxGateway.Worker.Tests/ZB.MOM.WW.MxGateway.Worker.Tests.csproj -p:Platform=x86`
- Single test: append `--filter FullyQualifiedName~<ClassOrMethod>`
- **Build is strict:** `TreatWarningsAsErrors=true`, nullable enabled. Add XML doc comments on public members (the repo runs a doc checker).
- **Generated code** under `Generated/` is never hand-edited — rebuild the contracts project to regenerate.
- **Namespaces:** worker MxAccess types live in `ZB.MOM.WW.MxGateway.Worker.MxAccess`; proto C# types in `ZB.MOM.WW.MxGateway.Contracts.Proto`.
---
## Phase 0 — Contracts
### Task 1: Worker proto — subtag watch-list, failover config, provider-mode enum
**Classification:** high-risk
**Estimated implement time:** ~4 min
**Parallelizable with:** none (Task 2 imports these types)
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto` (real `SubscribeAlarmsCommand` at ~line 324; `MxCommand` references it at 123-125)
> **CORRECTION (execution):** The alarm command messages and `MxCommand` live in **`mxaccess_gateway.proto`**, not the worker proto. `mxaccess_worker.proto` *imports* the gateway proto (`WorkerCommand.command` is `mxaccess_gateway.v1.MxCommand`), so the gateway proto is the base and the worker proto needs **no** change. `AlarmProviderMode` and the new types are added to the gateway proto and are visible to worker code as `mxaccess_gateway.v1` types. Tasks 1 and 2 are executed as a single combined edit on this one file.
**Step 1: Add the enum and messages.** In `mxaccess_gateway.proto`, extend the existing `SubscribeAlarmsCommand` message (line 324) and add the new types after it:
```protobuf
// Provider selection / current provider for the alarm feed. Defined in
// mxaccess_gateway.proto next to SubscribeAlarmsCommand, which references it;
// mxaccess_worker.proto imports the gateway proto and reuses the same enum.
enum AlarmProviderMode {
ALARM_PROVIDER_MODE_UNSPECIFIED = 0; // auto: alarmmgr primary, subtag fallback
ALARM_PROVIDER_MODE_ALARMMGR = 1;
ALARM_PROVIDER_MODE_SUBTAG = 2;
}
message SubscribeAlarmsCommand {
string subscription_expression = 1; // existing field — keep
// UNSPECIFIED = auto-failover/failback. ALARMMGR/SUBTAG force one provider.
AlarmProviderMode forced_mode = 2;
// Subtag watch-list resolved by the gateway (GR SQL + config). Empty in pure
// alarmmgr mode; in subtag mode it bounds what the consumer can observe.
repeated AlarmSubtagTarget watch_list = 3;
AlarmFailoverConfig failover = 4;
}
// One alarm attribute the subtag consumer advises. Addresses are full MXAccess
// item references the worker passes straight to AddItem.
message AlarmSubtagTarget {
string alarm_full_reference = 1; // e.g. "Galaxy!Area.Tank01.Level.HiHi"
string source_object_reference = 2; // e.g. "Tank01"
string active_subtag = 3; // item address of the in-alarm boolean
string acked_subtag = 4; // item address of the acknowledged boolean
string ack_comment_subtag = 5; // writable ack-comment attribute (ack write target)
string priority_subtag = 6; // optional severity source; empty if absent
}
message AlarmFailoverConfig {
int32 consecutive_failure_threshold = 1; // wnwrap COM failures before switching (>=1)
int32 failback_probe_interval_seconds = 2; // probe cadence while degraded (>=1)
int32 failback_stable_probes = 3; // clean probes before switching back (>=1)
}
```
`UnsubscribeAlarmsCommand` and `AcknowledgeAlarmCommand` are unchanged.
**Step 2: Regenerate & verify it compiles.**
Run: `dotnet build src/ZB.MOM.WW.MxGateway.Contracts/ZB.MOM.WW.MxGateway.Contracts.csproj`
Expected: build succeeds; generated `AlarmProviderMode`, `AlarmSubtagTarget`, `AlarmFailoverConfig` types appear.
**Step 3: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto
git commit -m "contracts(worker): subtag watch-list + failover config + AlarmProviderMode"
```
---
### Task 2: Gateway proto — provider status on the feed, degraded provenance, mode-changed event
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** none (depends on Task 1; Task 3 tests both)
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto` (`OnAlarmTransitionEvent` ~719-771, `ActiveAlarmSnapshot` ~783-803, `AlarmFeedMessage` ~860-870, `MxEvent` family enum + body oneof, `MxEventFamily` enum)
**Step 1: Add degraded provenance to the two alarm payloads.** Append to `OnAlarmTransitionEvent` (next free field 14):
```protobuf
// True when this transition came from the subtag-monitoring fallback rather
// than the native alarmmgr provider — i.e. it was synthesized from data
// changes and carries reduced fidelity (synthetic GUID, no native raise time).
bool degraded = 14;
// Which provider produced this transition.
AlarmProviderMode source_provider = 15;
```
Append the identical two fields to `ActiveAlarmSnapshot` (next free field 14):
```protobuf
bool degraded = 14;
AlarmProviderMode source_provider = 15;
```
**Step 2: Add provider status to the feed oneof.** Add a new oneof case to `AlarmFeedMessage` (next free field 4) and a new message:
```protobuf
message AlarmFeedMessage {
oneof payload {
ActiveAlarmSnapshot active_alarm = 1;
bool snapshot_complete = 2;
OnAlarmTransitionEvent transition = 3;
// Provider-mode status. Emitted once on stream open and again on every
// failover/failback so late joiners learn the current mode immediately.
AlarmProviderStatus provider_status = 4;
}
}
message AlarmProviderStatus {
AlarmProviderMode mode = 1;
bool degraded = 2; // true whenever mode == SUBTAG
string reason = 3; // human-readable switch reason
google.protobuf.Timestamp since = 4;
}
```
**Step 3: Add the worker→gateway mode-changed event to `MxEvent`.** Find the `MxEventFamily` enum and the `MxEvent` body oneof. Add a family member and a body message + oneof case (use the next free family value and the next free `MxEvent` body field number — check the file):
```protobuf
// in MxEventFamily enum:
MX_EVENT_FAMILY_ON_ALARM_PROVIDER_MODE_CHANGED = <next>;
// new message near OnAlarmTransitionEvent:
message OnAlarmProviderModeChangedEvent {
AlarmProviderMode mode = 1;
string reason = 2;
int32 hresult = 3; // COM HRESULT that triggered failover; 0 on failback
google.protobuf.Timestamp at = 4;
}
// in MxEvent body oneof:
OnAlarmProviderModeChangedEvent on_alarm_provider_mode_changed = <next>;
```
`AlarmProviderMode` is defined in `mxaccess_gateway.proto` itself (see the Task 1 correction: `mxaccess_worker.proto` imports the gateway proto, not the other way round), so no additional import is needed; reference the enum unqualified (or via its package), as the existing references do.
**Step 4: Regenerate & verify.**
Run: `dotnet build src/ZB.MOM.WW.MxGateway.Contracts/ZB.MOM.WW.MxGateway.Contracts.csproj`
Expected: build succeeds.
**Step 5: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto
git commit -m "contracts(gateway): AlarmProviderStatus feed case, degraded provenance, mode-changed event"
```
---
### Task 3: Proto round-trip tests for the new alarm fields
**Classification:** small
**Estimated implement time:** ~3 min
**Parallelizable with:** none (depends on Tasks 1-2)
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs`
**Step 1: Add tests** mirroring the existing `Event_RoundTripsOnAlarmTransitionWithFullPayload` style:
```csharp
[Fact]
public void Feed_RoundTripsProviderStatus()
{
var since = Timestamp.FromDateTime(new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc));
var original = new AlarmFeedMessage
{
ProviderStatus = new AlarmProviderStatus
{
Mode = AlarmProviderMode.Subtag,
Degraded = true,
Reason = "wnwrap poll failed 3x (HRESULT 0x80004005)",
Since = since,
},
};
var parsed = AlarmFeedMessage.Parser.ParseFrom(original.ToByteArray());
Assert.Equal(original, parsed);
Assert.Equal(AlarmFeedMessage.PayloadOneofCase.ProviderStatus, parsed.PayloadCase);
Assert.True(parsed.ProviderStatus.Degraded);
Assert.Equal(AlarmProviderMode.Subtag, parsed.ProviderStatus.Mode);
}
[Fact]
public void Transition_RoundTripsDegradedProvenance()
{
var t = new OnAlarmTransitionEvent
{
AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
TransitionKind = AlarmTransitionKind.Raise,
Degraded = true,
SourceProvider = AlarmProviderMode.Subtag,
};
var parsed = OnAlarmTransitionEvent.Parser.ParseFrom(t.ToByteArray());
Assert.True(parsed.Degraded);
Assert.Equal(AlarmProviderMode.Subtag, parsed.SourceProvider);
}
```
**Step 2: Run red→green.**
Run: `dotnet test src/ZB.MOM.WW.MxGateway.Tests/ZB.MOM.WW.MxGateway.Tests.csproj --filter FullyQualifiedName~ProtobufContractRoundTripTests`
Expected: PASS.
**Step 3: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs
git commit -m "test(contracts): round-trip provider status + degraded provenance"
```
---
## Phase 1 — Worker: subtag consumer + failover
### Task 4: Subtag value-source abstraction + synthesis state holder
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (Task 5 builds on it)
A testable seam so synthesis logic is unit-tested without COM. The COM wiring lands in Task 6.
**Files:**
- Create: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/ISubtagAlarmSource.cs`
- Create: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmStateMachine.cs`
- Test: `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/SubtagAlarmStateMachineTests.cs`
**Step 1: Define the source abstraction.** `ISubtagAlarmSource` advises subtag addresses and raises a normalized value-change callback on the STA:
```csharp
namespace ZB.MOM.WW.MxGateway.Worker.MxAccess;
/// <summary>A change in one advised subtag value, normalized off the COM boundary.</summary>
public sealed class SubtagValueChange
{
/// <summary>The full item address that changed (matches an AlarmSubtagTarget subtag).</summary>
public string ItemAddress { get; init; } = string.Empty;
/// <summary>The new value (boolean for .active/.acked, numeric for priority).</summary>
public object? Value { get; init; }
/// <summary>The change timestamp in UTC.</summary>
public DateTime TimestampUtc { get; init; }
}
/// <summary>
/// Advises a set of MXAccess subtag addresses and surfaces value changes.
/// The production implementation (Task 6) owns its own LMXProxyServerClass;
/// tests substitute a fake that pushes <see cref="SubtagValueChange"/>s.
/// </summary>
public interface ISubtagAlarmSource : IDisposable
{
/// <summary>Raised on the STA when an advised subtag's value changes.</summary>
event EventHandler<SubtagValueChange>? ValueChanged;
/// <summary>Advises every subtag in the supplied addresses; idempotent per address.</summary>
void Advise(IReadOnlyCollection<string> itemAddresses);
/// <summary>Writes a value to an item address (used for the ack-comment write).</summary>
void Write(string itemAddress, object? value);
}
```
**Step 2: Write the state-machine tests first.** `SubtagAlarmStateMachine` maps `(active, acked)` changes per target to `MxAlarmTransitionEvent`s. Test the four core transitions:
```csharp
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
public sealed class SubtagAlarmStateMachineTests
{
private static AlarmSubtagTarget Target() => new()
{
AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
SourceObjectReference = "Tank01",
ActiveSubtag = "Tank01.Level.HiHi.active",
AckedSubtag = "Tank01.Level.HiHi.acked",
AckCommentSubtag = "Tank01.Level.HiHi.ackmsg",
};
[Fact]
public void ActiveFalseToTrue_EmitsRaise_FlaggedDegraded()
{
var sm = new SubtagAlarmStateMachine(new[] { Target() });
var ts = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
var events = sm.Apply("Tank01.Level.HiHi.active", true, ts);
var e = Assert.Single(events);
Assert.Equal(MxAlarmStateKind.UnackAlm, e.Record.State);
Assert.Equal(MxAlarmStateKind.Unspecified, e.PreviousState);
Assert.Equal("Tank01.Level.HiHi", e.Record.TagName); // reference minus provider/area
}
[Fact]
public void AckedTrueWhileActive_EmitsAckTransition()
{
var sm = new SubtagAlarmStateMachine(new[] { Target() });
var ts = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
sm.Apply("Tank01.Level.HiHi.active", true, ts);
var events = sm.Apply("Tank01.Level.HiHi.acked", true, ts.AddSeconds(5));
var e = Assert.Single(events);
Assert.Equal(MxAlarmStateKind.AckAlm, e.Record.State);
Assert.Equal(MxAlarmStateKind.UnackAlm, e.PreviousState);
}
[Fact]
public void ActiveTrueToFalse_WhileUnacked_EmitsUnackRtn()
{
var sm = new SubtagAlarmStateMachine(new[] { Target() });
var ts = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
sm.Apply("Tank01.Level.HiHi.active", true, ts);
var events = sm.Apply("Tank01.Level.HiHi.active", false, ts.AddSeconds(10));
var e = Assert.Single(events);
Assert.Equal(MxAlarmStateKind.UnackRtn, e.Record.State);
}
[Fact]
public void Snapshot_ReflectsActiveAndAckedState()
{
var sm = new SubtagAlarmStateMachine(new[] { Target() });
var ts = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);
sm.Apply("Tank01.Level.HiHi.active", true, ts);
sm.Apply("Tank01.Level.HiHi.acked", true, ts);
var snap = Assert.Single(sm.SnapshotActive());
Assert.Equal(MxAlarmStateKind.AckAlm, snap.State);
}
}
```
Run: `dotnet test ...Worker.Tests... -p:Platform=x86 --filter FullyQualifiedName~SubtagAlarmStateMachineTests` → FAIL (type missing).
**Step 3: Implement `SubtagAlarmStateMachine`.** Build an address→target index (active/acked/priority/comment addresses), hold per-reference `(bool active, bool acked, DateTime firstRaiseUtc, int priority)`, and emit on change:
- active `false→true` ⇒ `UnackAlm`, set `firstRaiseUtc`, `PreviousState` from prior state.
- acked `false→true` while active ⇒ `AckAlm`.
- active `true→false` ⇒ `AckRtn` if currently acked else `UnackRtn`; then reset acked.
- priority change ⇒ update stored priority, no transition.
- `TagName` = `alarm_full_reference` with any `Provider!Area.` prefix stripped (match `WnWrapAlarmConsumer`'s reference shape so `GatewayAlarmMonitor` keys align). Set `ProviderName`, `Group`, `Priority`, `AlarmComment` from the target/last values. Mark a `Degraded`/source flag (carried via a new field — see Task 5 wiring).
- `SnapshotActive()` returns `MxAlarmSnapshotRecord` for references whose active is true.
**Step 4: Run green.** Expected: PASS.
**Step 5: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Worker/MxAccess/ISubtagAlarmSource.cs \
src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmStateMachine.cs \
src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/SubtagAlarmStateMachineTests.cs
git commit -m "worker(alarms): subtag value-source seam + synthesis state machine"
```
---
### Task 5: `SubtagAlarmConsumer` over the source seam (no COM yet)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (depends on Task 4)
**Files:**
- Create: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmConsumer.cs`
- Test: `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/SubtagAlarmConsumerTests.cs`
**Step 1: Test with a fake `ISubtagAlarmSource`.** Drive value changes through the source, assert `AlarmTransitionEmitted` fires with synthesized records and that ack writes the comment to the ack-comment subtag:
```csharp
public sealed class SubtagAlarmConsumerTests
{
private sealed class FakeSource : ISubtagAlarmSource
{
public event EventHandler<SubtagValueChange>? ValueChanged;
public List<string> Advised { get; } = new();
public (string Address, object? Value)? LastWrite { get; private set; }
public void Advise(IReadOnlyCollection<string> a) => Advised.AddRange(a);
public void Write(string a, object? v) => LastWrite = (a, v);
public void Raise(string addr, object? val, DateTime ts) =>
ValueChanged?.Invoke(this, new SubtagValueChange { ItemAddress = addr, Value = val, TimestampUtc = ts });
public void Dispose() { }
}
private static AlarmSubtagTarget Target() => new()
{
AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
ActiveSubtag = "Tank01.Level.HiHi.active",
AckedSubtag = "Tank01.Level.HiHi.acked",
AckCommentSubtag = "Tank01.Level.HiHi.ackmsg",
};
[Fact]
public void Subscribe_AdvisesAllSubtags()
{
var src = new FakeSource();
using var c = new SubtagAlarmConsumer(src, new[] { Target() });
c.Subscribe("ignored-in-subtag-mode");
Assert.Contains("Tank01.Level.HiHi.active", src.Advised);
Assert.Contains("Tank01.Level.HiHi.acked", src.Advised);
}
[Fact]
public void ValueChange_RaisesSynthesizedTransition()
{
var src = new FakeSource();
using var c = new SubtagAlarmConsumer(src, new[] { Target() });
c.Subscribe("x");
MxAlarmTransitionEvent? seen = null;
c.AlarmTransitionEmitted += (_, e) => seen = e;
src.Raise("Tank01.Level.HiHi.active", true, new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc));
Assert.NotNull(seen);
Assert.Equal(MxAlarmStateKind.UnackAlm, seen!.Record.State);
}
[Fact]
public void AcknowledgeByName_WritesCommentToAckCommentSubtag()
{
var src = new FakeSource();
using var c = new SubtagAlarmConsumer(src, new[] { Target() });
c.Subscribe("x");
int rc = c.AcknowledgeByName("Tank01.Level.HiHi", "Galaxy", "Area",
"ack from HMI", "op1", "node", "dom", "Op One");
Assert.Equal(0, rc);
Assert.Equal(("Tank01.Level.HiHi.ackmsg", (object?)"ack from HMI"), src.LastWrite);
}
}
```
**Step 2: Implement `SubtagAlarmConsumer : IMxAccessAlarmConsumer`.**
- Constructor `(ISubtagAlarmSource source, IReadOnlyList<AlarmSubtagTarget> watchList)`; build a `SubtagAlarmStateMachine`; index `alarm_full_reference`→target for ack routing.
- `Subscribe(_)`: call `source.Advise(<all subtag addresses>)`; subscribe to `source.ValueChanged`, feed each into the state machine, and re-raise each produced `MxAlarmTransitionEvent` via `AlarmTransitionEmitted` (mark degraded).
- `AcknowledgeByName(alarmName, …, comment, …)`: resolve the target by reference; if no `AckCommentSubtag`, return a non-zero failure code; else `source.Write(target.AckCommentSubtag, comment)` and return 0.
- `AcknowledgeByGuid(guid, …)`: map the synthetic GUID (deterministic hash of reference — see Task 8 helper, or a local copy) back to a reference, then delegate to the name path; unknown GUID ⇒ non-zero.
- `SnapshotActiveAlarms()`: from the state machine.
- `PollOnce()`: no-op.
- `Dispose()`: unsubscribe + dispose source.
**Step 3: Run green.** `dotnet test ...Worker.Tests... -p:Platform=x86 --filter FullyQualifiedName~SubtagAlarmConsumerTests`.
**Step 4: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SubtagAlarmConsumer.cs \
src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/SubtagAlarmConsumerTests.cs
git commit -m "worker(alarms): SubtagAlarmConsumer synthesizing transitions over the source seam"
```
---
### Task 6: COM-backed `LmxSubtagAlarmSource` (own LMXProxyServerClass)
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** none
The only piece that touches live COM. Like `WnWrapAlarmConsumer`, it owns its own MXAccess server object so the subtag source is self-contained and isolated from the session's item pipeline. Logic stays thin (advise/write/marshal); real verification is the live smoke test in Task 17.
**Files:**
- Create: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/LmxSubtagAlarmSource.cs`
- Test: `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/LmxSubtagAlarmSourceTests.cs` (constructor/guard tests only; COM path is live-gated)
**Step 1: Implement `LmxSubtagAlarmSource : ISubtagAlarmSource`.**
- Own an `LMXProxyServerClass` (reuse the worker's `IMxAccessServer`/`MxAccessComServer` wrapper + `IMxAccessComObjectFactory` so it is fakeable; constructor takes the factory).
- `Advise(addresses)`: `RegisterServer` (topic) once; per address `AddItem` → `itemHandle`, `Advise`, and record `itemHandle→address`. Subscribe to the proxy's `OnDataChange`; in the handler, look up the address by `phItemHandle`, normalize `pvItemValue` (VARIANT→bool/double) and `pftItemTimeStamp`→UTC, and raise `ValueChanged`. All calls run on the STA (the worker STA pumps messages, so `OnDataChange` delivers).
- `Write(address, value)`: resolve/create the item handle, `server.Write(serverHandle, itemHandle, value, userId: 0)`.
- `Dispose()`: `UnAdvise`/`RemoveItem`/`Unregister`/release COM.
**Step 2: Tests** — only the non-COM guards (null factory throws; `Write` before `Advise` resolves a handle or throws a clear error). Mark the COM round-trip `[LiveMxAccessFact]` and `Skip` per the `AlarmsLiveSmokeTests` precedent.
**Step 3: Build x86 + run unit tests.**
`dotnet build src/ZB.MOM.WW.MxGateway.Worker/ZB.MOM.WW.MxGateway.Worker.csproj -p:Platform=x86`
`dotnet test ...Worker.Tests... -p:Platform=x86 --filter FullyQualifiedName~LmxSubtagAlarmSourceTests`
**Step 4: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Worker/MxAccess/LmxSubtagAlarmSource.cs \
src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/LmxSubtagAlarmSourceTests.cs
git commit -m "worker(alarms): COM-backed LmxSubtagAlarmSource advising alarm subtags"
```
---
### Task 7: `FailoverAlarmConsumer` state machine
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** none (depends on Task 5)
**Files:**
- Create: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs`
- Create: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/AlarmProviderModeChange.cs` (small EventArgs)
- Test: `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs`
**Step 1: Test the switch/failback with a fake primary that throws.**
```csharp
public sealed class FailoverAlarmConsumerTests
{
private sealed class FlakyPrimary : IMxAccessAlarmConsumer
{
public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;
public int PollsUntilHeal = int.MaxValue; // becomes healthy after N polls while degraded
public bool ThrowOnPoll = true;
private int _polls;
public void Subscribe(string s) { if (ThrowOnPoll) throw new COMException("boom", unchecked((int)0x80004005)); }
public void PollOnce()
{
_polls++;
if (ThrowOnPoll && _polls < PollsUntilHeal) throw new COMException("boom", unchecked((int)0x80004005));
}
public int AcknowledgeByGuid(Guid g, string c, string a, string b, string d, string e) => 0;
public int AcknowledgeByName(string n, string p, string gr, string c, string a, string b, string d, string e) => 0;
public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();
public void Dispose() { }
}
private sealed class StubStandby : IMxAccessAlarmConsumer { /* records Subscribe, no-op rest */ }
[Fact]
public void Primary_FailsThresholdTimes_SwitchesToSubtagAndEmitsModeChange()
{
var primary = new FlakyPrimary();
var standby = new StubStandby();
using var c = new FailoverAlarmConsumer(primary, standby,
new FailoverSettings(threshold: 3, probeIntervalSeconds: 30, stableProbes: 3));
AlarmProviderModeChange? change = null;
c.ProviderModeChanged += (_, e) => change = e;
c.Subscribe("\\\\host\\Galaxy!Area"); // primary.Subscribe throws -> counts as failure 1
c.PollOnce(); // failure 2
c.PollOnce(); // failure 3 -> switch
Assert.NotNull(change);
Assert.Equal(AlarmProviderMode.Subtag, change!.Mode);
}
[Fact]
public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
{
var primary = new FlakyPrimary { PollsUntilHeal = 0 }; // will heal once we stop throwing
var standby = new StubStandby();
using var c = new FailoverAlarmConsumer(primary, standby,
new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 2));
var modes = new List<AlarmProviderMode>();
c.ProviderModeChanged += (_, e) => modes.Add(e.Mode);
c.Subscribe("x"); // failure -> switch to subtag
primary.ThrowOnPoll = false;
c.ProbeOnce(); // clean probe 1
c.ProbeOnce(); // clean probe 2 -> failback
Assert.Equal(AlarmProviderMode.Subtag, modes[0]);
Assert.Equal(AlarmProviderMode.Alarmmgr, modes[^1]);
}
}
```
**Step 2: Implement.**
- `record FailoverSettings(int threshold, int probeIntervalSeconds, int stableProbes)`; `AlarmProviderModeChange : EventArgs { AlarmProviderMode Mode; string Reason; int HResult; DateTime AtUtc; }`.
- Constructor `(IMxAccessAlarmConsumer primary, IMxAccessAlarmConsumer standby, FailoverSettings settings)`; forced-mode variants handled in Task 9 wiring (forced ⇒ skip the other consumer).
- Forward `AlarmTransitionEmitted` from the **active** child only (swap the subscription on switch).
- Wrap `Subscribe`/`PollOnce` on the primary: on `COMException` (or a failure HRESULT) while `PrimaryActive`, increment a counter; at `threshold`, ensure standby `Subscribe`d, set active=standby, snapshot standby for hand-off, raise `ProviderModeChanged(Subtag, reason, hresult, now)`. Reset counter on any clean primary poll.
- `ProbeOnce()` (driven by the poll loop while degraded, gated by `probeIntervalSeconds`): try primary `Subscribe`+`PollOnce`; count consecutive clean probes; at `stableProbes`, set active=primary, return standby to standby, raise `ProviderModeChanged(Alarmmgr, "recovered", 0, now)`.
- `Acknowledge*` / `SnapshotActiveAlarms` delegate to the **active** child.
- `PollOnce()` drives the active child's poll, and—while degraded—also drives the failback probe cadence.
**Step 3: Run green** (x86 filter `FailoverAlarmConsumerTests`).
**Step 4: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Worker/MxAccess/FailoverAlarmConsumer.cs \
src/ZB.MOM.WW.MxGateway.Worker/MxAccess/AlarmProviderModeChange.cs \
src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/FailoverAlarmConsumerTests.cs
git commit -m "worker(alarms): FailoverAlarmConsumer auto-failover/failback state machine"
```
---
### Task 8: Synthetic-GUID helper + degraded flag on the event sink path
**Classification:** standard
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 9
Carry `degraded` + `source_provider` from the worker synthesis into the emitted `OnAlarmTransitionEvent`.
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs` (add `bool Degraded`)
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessAlarmEventSink.cs` (`EnqueueTransition` carries degraded)
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessEventMapper.cs` (`CreateOnAlarmTransition` sets `Degraded`/`SourceProvider`)
- Create: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SyntheticAlarmGuid.cs`
- Test: add cases to `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/AlarmDispatcherTests.cs` and a new `SyntheticAlarmGuidTests.cs`
**Step 1: `SyntheticAlarmGuid.ForReference(string reference)`** — deterministic GUID from a stable hash of the UTF-8 reference (e.g. the first 16 bytes of a SHA-256 digest → `new Guid(bytes)`; avoid MD5, which can throw on hosts that enforce FIPS-only cryptography), so subtag-mode acks resolve by GUID. Test determinism + difference:
```csharp
[Fact] public void SameReference_SameGuid() =>
Assert.Equal(SyntheticAlarmGuid.ForReference("A.B.C"), SyntheticAlarmGuid.ForReference("A.B.C"));
[Fact] public void DifferentReference_DifferentGuid() =>
Assert.NotEqual(SyntheticAlarmGuid.ForReference("A.B.C"), SyntheticAlarmGuid.ForReference("A.B.D"));
```
**Step 2: Thread `degraded`** through `MxAlarmSnapshotRecord.Degraded`, `EnqueueTransition(... bool degraded)`, and `CreateOnAlarmTransition(... bool degraded, AlarmProviderMode sourceProvider)`. Default `degraded=false`, `sourceProvider=Alarmmgr` so the wnwrap path is unchanged (regression: existing `AlarmDispatcherTests` still pass with `Degraded=false`).
**Step 3: Tests** — extend `AlarmDispatcherTests` with a subtag-style transition asserting `body.Degraded == true` and `SourceProvider == Subtag`.
**Step 4: Build x86 + run** worker tests for `AlarmDispatcherTests`, `SyntheticAlarmGuidTests`.
**Step 5: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs \
src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessAlarmEventSink.cs \
src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessEventMapper.cs \
src/ZB.MOM.WW.MxGateway.Worker/MxAccess/SyntheticAlarmGuid.cs \
src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/
git commit -m "worker(alarms): synthetic GUID + degraded provenance on emitted transitions"
```
---
### Task 9: Wire watch-list + failover config through `AlarmCommandHandler`; emit mode-changed event
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** none (depends on Tasks 5, 7, 8)
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/AlarmCommandHandler.cs`
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/IAlarmCommandHandler.cs`
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessCommandExecutor.cs` (`ExecuteSubscribeAlarms`, ~lines 588-616)
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessStaSession.cs` (consumer factory wiring; mode-change → event queue)
- Test: `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs` (extend or create)
**Step 1: Carry the subscribe payload.** Change the alarm subscribe entry point from `Subscribe(string subscription)` to `Subscribe(SubscribeAlarmsCommand command)` (the command now has `ForcedMode`, `WatchList`, `Failover`). In `AlarmCommandHandler.Subscribe`:
- Build the active provider per `ForcedMode`:
- `ALARMMGR` → `WnWrapAlarmConsumer` only.
- `SUBTAG` → `SubtagAlarmConsumer(new LmxSubtagAlarmSource(factory), watchList)` only.
- `UNSPECIFIED` → `FailoverAlarmConsumer(primary: wnwrap, standby: subtag, settings-from-Failover)`.
- Use the existing `consumerFactory` seam but widen it to `Func<SubscribeAlarmsCommand, IMxAccessAlarmConsumer>` so tests inject fakes and production builds the failover composite. Subscribe to `FailoverAlarmConsumer.ProviderModeChanged` and enqueue an `OnAlarmProviderModeChangedEvent` MxEvent via the event queue (new mapper method `CreateOnAlarmProviderModeChanged`).
**Step 2: Executor + STA wiring.** `ExecuteSubscribeAlarms` passes the full `SubscribeAlarmsCommand` (not just the expression). In `MxAccessStaSession`, the `alarmCommandHandlerFactory` must give the handler access to the `IMxAccessComObjectFactory` so the subtag source can create its own proxy server on the STA; keep the `EnsureOnAlarmConsumerThread` affinity guard on every path.
**Step 3: Test** — fake consumer factory; assert that a `SUBTAG` forced command builds the subtag consumer and advises; that an auto command building a fake failover composite, when it raises `ProviderModeChanged`, enqueues an `OnAlarmProviderModeChangedEvent` on the queue.
**Step 4: Build x86 + worker tests.**
**Step 5: Commit.**
```bash
git add src/ZB.MOM.WW.MxGateway.Worker/MxAccess/ src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/
git commit -m "worker(alarms): route watch-list/failover config; emit provider-mode-changed event"
```
---
## Phase 2 — Gateway: discovery, options, monitor, metrics, dashboard
### Task 10: `AlarmsOptions.Fallback` + validation
**Classification:** standard
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 11, Task 13
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Configuration/AlarmsOptions.cs`
- Create: `src/ZB.MOM.WW.MxGateway.Server/Configuration/AlarmFallbackOptions.cs`
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Configuration/GatewayOptionsValidator.cs` (`ValidateAlarms`, ~lines 234-258)
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs` (extend)
**Step 1:** Add `AlarmFallbackOptions Fallback { get; init; } = new();` to `AlarmsOptions`. `AlarmFallbackOptions`: `string Mode = "Auto"` (`Auto|ForceAlarmManager|ForceSubtag`), `int ConsecutiveFailureThreshold = 3`, `int FailbackProbeIntervalSeconds = 30`, `int FailbackStableProbes = 3`, a `Discovery` sub-object (`bool UseGalaxyRepository = true`, `string Area = ""`, `string[] IncludeAttributes = []`, `string[] ExcludeAttributes = []`), and a `Subtags` sub-object (`Active="active"`, `Acked="acked"`, `AckComment=""`, `Priority="priority"`).
**Step 2:** In `ValidateAlarms`, when `Enabled` and `Mode == "ForceSubtag"` and `Discovery.UseGalaxyRepository == false` and `IncludeAttributes` empty ⇒ add a validation error ("ForceSubtag requires Galaxy Repository discovery or an explicit IncludeAttributes list"). Floor the three numeric values at 1. Validate `Mode` is one of the three literals.
**Step 3-5:** Test the new validation cases (red→green), build the server, commit.
---
### Task 11: Galaxy Repository "alarm attributes" discovery query
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 10, Task 13
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Galaxy/GalaxyRepository.cs` (add `GetAlarmAttributesAsync` + SQL constant, following `GetAttributesAsync` ~lines 86-115 and `AttributesSql` ~line 176)
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Galaxy/IGalaxyRepository.cs`
- Create: `src/ZB.MOM.WW.MxGateway.Server/Galaxy/GalaxyAlarmAttributeRow.cs`
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Galaxy/` (projection unit test; live SQL gated)
**Step 1:** `GalaxyAlarmAttributeRow { string FullTagReference; string SourceObjectReference; string AckCommentSubtag; }` (and any priority subtag). `GetAlarmAttributesAsync` reuses the existing `is_alarm` detection (the `AlarmExtension` primitive join already in `AttributesSql`) filtered to `is_alarm = 1`, projecting the alarm reference + its ack-comment attribute. Follow the exact `SqlConnection`/`SqlCommand`/`SqlDataReader` pattern from `GetAttributesAsync`.
**Step 2:** Unit-test the row→`AlarmSubtagTarget` mapping (a pure mapper function); gate any live-DB test like the existing Galaxy live tests (or `Skip` with a note, matching `AlarmsLiveSmokeTests`).
**Step 3-5:** red→green, build server, commit.
---
### Task 12: Watch-list resolver (GR SQL + config override → `AlarmSubtagTarget[]`)
**Classification:** standard
**Estimated implement time:** ~4 min
**Parallelizable with:** none (depends on Tasks 10, 11)
**Files:**
- Create: `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs`
- Create: `src/ZB.MOM.WW.MxGateway.Server/Alarms/IAlarmWatchListResolver.cs`
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs`
**Step 1: Test the merge** with a fake `IGalaxyRepository`:
- discovery rows + `IncludeAttributes` are unioned; `ExcludeAttributes` removed; each becomes an `AlarmSubtagTarget` with `.active`/`.acked`/`.ackmsg` addresses composed from the configured `Subtags` names (`<reference>.<Active>`, etc.); empty config subtag names fall back to defaults; GR unavailable + no includes ⇒ empty list + a logged warning flag.
**Step 2: Implement** `ResolveAsync(AlarmsOptions, CancellationToken) → IReadOnlyList<AlarmSubtagTarget>`.
**Step 3-5:** red→green, build, commit.
---
### Task 13: Gateway metrics — provider-mode gauge + switch counter
**Classification:** small
**Estimated implement time:** ~3 min
**Parallelizable with:** Task 10, Task 11
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Metrics/GatewayMetrics.cs` (ctor ~lines 55-79; add counter + observable gauge following the existing pattern)
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Metrics/GatewayMetricsTests.cs` (if present; else assert via a `MeterListener`)
**Step 1:** Add `mxgateway.alarms.provider_switches` counter (tagged `from`,`to`,`reason`) and `mxgateway.alarms.provider_mode` observable gauge (1=alarmmgr, 2=subtag), plus `AlarmProviderSwitched(int from, int to, string reason)` and a private `GetAlarmProviderMode()` (lock on `_syncRoot` like the others).
**Step 2-4:** test, build, commit.
---
### Task 14: `GatewayAlarmMonitor` — arm watch-list, reflect provider mode, reconcile on switch
**Classification:** high-risk
**Estimated implement time:** ~5 min
**Parallelizable with:** none (depends on Tasks 9, 12, 13)
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs` (ctor ~41-49; `SubscribeAlarmsAsync` ~210-233; event-drain loop; `StreamAsync` ~386-434)
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs` (new, using `FakeWorkerHarness`)
**Step 1:** Inject `IAlarmWatchListResolver` and `GatewayMetrics`. In `SubscribeAlarmsAsync`, resolve the watch-list and build the `SubscribeAlarmsCommand` with `ForcedMode` (from `Fallback.Mode`), `WatchList`, and `Failover` populated from options — instead of the bare `{ SubscriptionExpression }`.
**Step 2:** In the worker-event drain path, handle `OnAlarmProviderModeChangedEvent`: update a `_providerStatus` field (mode/degraded/reason/since), `Broadcast(new AlarmFeedMessage { ProviderStatus = … })` to every subscriber, call `metrics.AlarmProviderSwitched(...)`, and force a `ReconcileAsync` so the cache re-seeds from the now-active provider (avoids raise/clear storms).
**Step 3:** In `StreamAsync`, emit the current `provider_status` as the **first** message (before the snapshot) so a late joiner immediately knows the mode.
**Step 4: Test** — stand up the monitor with `FakeWorkerHarness`; emit an `OnAlarmProviderModeChangedEvent(Subtag)`; assert a `StreamAsync` subscriber receives a `ProviderStatus{ Mode=Subtag, Degraded=true }` and that the switch counter incremented. Also assert a transition emitted in subtag mode flows through with `Degraded=true`.
**Step 5:** build server, run the new test, commit.
---
### Task 15: Dashboard — push provider status to `/hubs/alarms` + UI indicator
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (depends on Task 14)
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Dashboard/Hubs/AlarmsHubPublisher.cs` (forward `ProviderStatus` messages — they already flow through `StreamAsync`, so confirm the existing `SendAsync(AlarmMessage, message)` carries them; add a dedicated `"ProviderModeChanged"` client method if the dashboard needs a distinct channel)
- Modify: the alarms dashboard page/component (Bootstrap-only badge: green "alarmmgr" / amber "degraded — subtag") — find under `src/ZB.MOM.WW.MxGateway.Server/Dashboard/`
- Test: `src/ZB.MOM.WW.MxGateway.Tests/` dashboard model test (e.g. a `DashboardAlarmProviderStatus.FromFeed` mapper, mirroring `DashboardActiveAlarm.FromSnapshot`)
**Constraint:** Bootstrap CSS/JS only — no MudBlazor/Radzen/FluentUI.
**Steps:** TDD the model mapper, wire the publisher + badge, build, commit.
---
## Phase 3 — Integration, docs, live smoke
### Task 16: End-to-end fake-worker failover test
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 18
**Files:**
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmFailoverEndToEndTests.cs`
Drive the full gateway path with `FakeWorkerHarness`: subscribe (assert the `SubscribeAlarmsCommand` carries a watch-list), emit a wnwrap-style transition (assert `Degraded=false`), emit `OnAlarmProviderModeChangedEvent(Subtag)`, emit a synthesized transition (assert `Degraded=true`, `SourceProvider=Subtag`), then `OnAlarmProviderModeChangedEvent(Alarmmgr)` and assert the feed reports recovery. Build, run, commit.
---
### Task 17: Live subtag smoke test (opt-in)
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 18
**Files:**
- Test: `src/ZB.MOM.WW.MxGateway.IntegrationTests/...AlarmSubtagLiveSmokeTests.cs` (or the worker live suite)
A `[LiveMxAccessFact]`, `Skip`-by-default test (per `AlarmsLiveSmokeTests` precedent) that, against a live Galaxy + alarm flip script: advises the real `.active`/`.acked` subtags via `LmxSubtagAlarmSource`, asserts a synthesized raise/clear, and performs an ack via the ack-comment write. Document the exact subtag names discovered (resolves the design's open item). Commit.
---
### Task 18: Documentation
**Classification:** trivial
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 16, Task 17
**Files:**
- Modify: `gateway.md` (alarm provider section: dual provider + auto-failover/failback)
- Modify: `docs/DesignDecisions.md` (record the fallback decision + parity rationale)
- Modify: `docs/GatewayConfiguration.md` (the `MxGateway:Alarms:Fallback` block)
- Modify: `docs/AlarmClientDiscovery.md` (subtag provider, synthesis rules, ack-comment write)
- Modify: `docs/Grpc.md` (new `provider_status` feed case + `degraded`/`source_provider` fields)
Follow `StyleGuide.md` (PascalCase filenames, present tense, explain *why*). No code; commit.
---
## Execution order & parallelism summary
- **Serial spine:** 1 → 2 → 3 → 4 → 5 → 6 → 7 → 8/9 → 10/11 → 12 → 13 → 14 → 15 → 16 → 17/18.
- **Parallelizable clusters:** {8, 9 partially}, {10, 11, 13}, {16, 17, 18}.
- **High-risk tasks** (full review chain): 1, 2, 6, 7, 9, 14. **Standard:** 4, 5, 8, 10, 11, 12, 15, 16. **Small/trivial:** 3, 13, 17, 18.
## Risk notes for the executor
- **Field-number collisions:** Task 2 must read the live `MxEvent`/`MxEventFamily` numbers before adding — the agent map gave alarm-payload maxima but not `MxEvent`'s. Verify before editing.
- **STA discipline:** every COM call in `LmxSubtagAlarmSource` and every consumer swap runs on the worker STA; keep the `EnsureOnAlarmConsumerThread` guard. The worker STA already pumps Windows messages, which is required for the subtag `OnDataChange` to deliver.
- **Parity regression:** alarmmgr-mode output must be byte-for-byte unchanged. Existing `AlarmDispatcherTests` and `ProtobufContractRoundTripTests` are the guardrail — they must stay green with `Degraded=false` defaults.
- **Subtag names unverified:** the design leaves exact AVEVA subtag names (`.active`, `.acked`, ack-comment) to confirm against `C:\Users\dohertj2\Desktop\mxaccess` + a live Galaxy (Task 17). The config `Subtags` block exists so names are not hard-coded.
@@ -0,0 +1,147 @@
{
"planPath": "docs/plans/2026-06-13-alarm-subtag-fallback.md",
"tasks": [
{
"id": 54,
"subject": "Task 1: Worker proto \u2014 watch-list, failover config, AlarmProviderMode",
"status": "completed"
},
{
"id": 55,
"subject": "Task 2: Gateway proto \u2014 provider status, degraded provenance, mode-changed event",
"status": "completed",
"blockedBy": [
54
]
},
{
"id": 56,
"subject": "Task 3: Proto round-trip tests for new alarm fields",
"status": "completed",
"blockedBy": [
54,
55
]
},
{
"id": 57,
"subject": "Task 4: Subtag value-source abstraction + synthesis state machine",
"status": "completed",
"blockedBy": [
54
]
},
{
"id": 58,
"subject": "Task 5: SubtagAlarmConsumer over the source seam",
"status": "completed",
"blockedBy": [
57
]
},
{
"id": 59,
"subject": "Task 6: COM-backed LmxSubtagAlarmSource",
"status": "completed",
"blockedBy": [
57
]
},
{
"id": 60,
"subject": "Task 7: FailoverAlarmConsumer state machine",
"status": "completed",
"blockedBy": [
58
]
},
{
"id": 61,
"subject": "Task 8: Synthetic GUID + degraded flag on event sink path",
"status": "completed",
"blockedBy": [
55
]
},
{
"id": 62,
"subject": "Task 9: Wire watch-list/failover through AlarmCommandHandler; emit mode-changed",
"status": "completed",
"blockedBy": [
58,
60,
61
]
},
{
"id": 63,
"subject": "Task 10: AlarmsOptions.Fallback + validation",
"status": "completed"
},
{
"id": 64,
"subject": "Task 11: Galaxy Repository alarm-attributes discovery query",
"status": "completed"
},
{
"id": 65,
"subject": "Task 12: Watch-list resolver (GR SQL + config override)",
"status": "completed",
"blockedBy": [
54,
63,
64
]
},
{
"id": 66,
"subject": "Task 13: Metrics \u2014 provider-mode gauge + switch counter",
"status": "completed"
},
{
"id": 67,
"subject": "Task 14: GatewayAlarmMonitor \u2014 arm watch-list, reflect mode, reconcile on switch",
"status": "completed",
"blockedBy": [
55,
62,
65,
66
]
},
{
"id": 68,
"subject": "Task 15: Dashboard \u2014 push provider status + UI badge",
"status": "completed",
"blockedBy": [
67
]
},
{
"id": 69,
"subject": "Task 16: End-to-end fake-worker failover test",
"status": "completed",
"blockedBy": [
67
]
},
{
"id": 70,
"subject": "Task 17: Live subtag smoke test (opt-in)",
"status": "completed",
"blockedBy": [
59,
62
]
},
{
"id": 71,
"subject": "Task 18: Documentation",
"status": "completed",
"blockedBy": [
67
]
}
],
"lastUpdated": "2026-06-13T13:30:00Z"
}
+270
View File
@@ -0,0 +1,270 @@
# Deferred Follow-ups Implementation Plan
**Date:** 2026-06-14
**Status:** Plan only — NOT yet executed. Saved for review.
**Context:** After the alarm-subtag-fallback cleanup (merged `5976770`) and its redeploy to
windev (10.100.0.48), five items remain deferred. This plan handles all five. They are
independent — execute in any order, or cherry-pick. Items D1–D2 are code (branch off `main`);
D3 is a dev-rig validation; D4–D5 are ops on the deployed hosts (no code).
Source of the deferred list: the post-deploy review on 2026-06-14. See also memory
`project_deploy_mechanics`, `project_wonder_deployment`, `project_alarm_subtag_fallback`,
`project_rig_alarms_object_driven`.
---
## D1 — Surface `AlarmProviderSwitchCount` on the dashboard metric list
**Classification:** small · **Est:** ~10 min · **Where:** Server (net10), build+test on macOS
**Why deferred:** the dashboard reads provider *state* from the feed's `ProviderStatus` badge,
not the metrics snapshot, so the snapshot field (added in B1) and the OTEL counter were enough.
This makes the cumulative switch count visible in the dashboard's metric table too.
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardSnapshotService.cs` (`CreateMetricSummaries`, ~lines 178–198)
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Dashboard/DashboardSnapshotServiceTests.cs` (or the existing snapshot test file)
**Steps:**
1. In `CreateMetricSummaries`, add to the `metrics` list (near the other counters):
`new("mxgateway.alarms.provider_switches", snapshot.AlarmProviderSwitchCount),`
A current-mode gauge row might also seem useful, but
the snapshot does not carry the mode int (only the feed does), so DO NOT invent one —
only add the switch count, which the snapshot already exposes.
2. Add/extend a unit test asserting the summary list contains a
`mxgateway.alarms.provider_switches` row equal to the snapshot's `AlarmProviderSwitchCount`
after calling `metrics.AlarmProviderSwitched(...)`.
3. Verify: `dotnet build src/ZB.MOM.WW.MxGateway.Server` (macOS) and
`dotnet test src/ZB.MOM.WW.MxGateway.Tests --filter FullyQualifiedName~DashboardSnapshot`.
4. Commit.
**Rollback:** revert the one-line list addition.
---
## D2 — Reproduce and fix the `AmbiguousMatchException` on `GET /`
**Classification:** standard · **Est:** ~30–45 min (repro + fix) · **Where:** Server (net10)
**Why deferred:** observed only in *old* (06/05, Development) logs:
`AmbiguousMatchException: The request matched multiple endpoints. Matches: / (/) / (/)`.
Unauthenticated `GET /` safely 302s to `/login`; the ambiguity would only throw for an
*authenticated* request that reaches routing. Pre-existing and unrelated to the alarm work,
but it would surface as a dashboard-home 500. Confirm it still reproduces before fixing.
**Root-cause candidates (the two endpoints both matching `/`):**
- `DashboardHome.razor` (`@page "/"`) mapped by `MapRazorComponents<App>()`
(`Dashboard/DashboardEndpointRouteBuilderExtensions.cs:92`).
- `MapStaticAssets(...)` (`GatewayApplication.cs:190`) — in .NET 8–10 the static-assets
endpoint can register a fingerprinted root that collides with the home page.
**Step 0 — Reproduce (do this first; if it does NOT repro, downgrade to a documentation note):**
1. Log into the dashboard as `multi-role`/`password` (Administrator) at
`http://10.100.0.48:5130/login`.
2. Navigate to `http://10.100.0.48:5130/` (the "Dashboard" home).
3. If it renders the Overview page → not a live bug on this build; record that and stop
(the dev error-page throw was a Development-only artifact). If it 500s with
`AmbiguousMatchException` → proceed to fix.
**Fix (only if it reproduces):**
- Files: `src/ZB.MOM.WW.MxGateway.Server/GatewayApplication.cs` (endpoint mapping order/region),
`src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardEndpointRouteBuilderExtensions.cs`,
`src/ZB.MOM.WW.MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor`.
- Candidate fixes, in order of preference:
1. Inspect the actual two matched endpoint display names at runtime (the exception lists
them) — temporarily enable detailed errors or read the stderr `Matches:` block — to learn
exactly which two collide.
2. If `MapStaticAssets` owns the second `/`: it must run, but confirm it isn't double-mapping
the home route; if it is, move `MapStaticAssets` ordering or scope it so it doesn't claim
the literal `/` content path. (Do NOT remove `MapStaticAssets` — Blazor needs it.)
3. If two component pages both declare `@page "/"`: give the non-home one a distinct route.
4. Last resort: give `DashboardHome` an explicit route (e.g. `@page "/overview"`) and add a
trivial `MapGet("/", () => Results.Redirect("/overview"))` with `.RequireAuthorization(...)`.
- Add a test that exercises the root route once resolved (a WebApplicationFactory-based
integration test asserting authenticated `GET /` returns 200, if the dashboard test harness
supports auth; otherwise a routing-uniqueness assertion).
- Verify: `dotnet build src/ZB.MOM.WW.MxGateway.Server`; re-run the authenticated `GET /` repro.
**Rollback:** revert the routing change; the home page returns to its prior (ambiguous) state.
**Note:** this fix ships only via a Server redeploy (D-deploy applies, see D5/windev redeploy
procedure in `project_deploy_mechanics`). Decide whether to bundle it with the next redeploy.
---
## D3 — Validate the failover path end-to-end on the dev rig (ForceSubtag)
**Classification:** standard (validation, no production code) · **Est:** ~30–45 min · **Where:** dev rig / windev
**Why deferred:** `provider_mode 1` (healthy alarmmgr) is verified in production, but the actual
alarmmgr→subtag **failover**, the **degraded badge**, the synthesized subtag alarms, and the
`provider_switches{from,to,reason}` counter (with the new bounded `failover`/`failback`/`unknown`
tag) have never fired live — a healthy system never switches, and the rig's alarms can't be made
to fail COM on demand. Use the explicit `ForceSubtag` config mode to exercise the degraded path
deterministically without an alarmmgr fault.
**Approach:** run a *temporary* gateway instance (NOT the production service) in `ForceSubtag`
mode against the dev Galaxy, drive it with the operator/IDE alarm toggle, and confirm the
degraded surface. Do this on windev in a throwaway run so production stays on `Auto`.
**Steps:**
1. On windev, from a build worktree at `main`, run the Server locally (or a second instance on
alt ports) with `MxGateway:Alarms:Fallback:Mode=ForceSubtag` (env
`MxGateway__Alarms__Fallback__Mode=ForceSubtag`), pointed at the dev Galaxy
(`\\DESKTOP-6JL3KKO\Galaxy!DEV`). Use distinct Kestrel ports to avoid clashing with the
production service on 5120/5130.
2. Subscribe an alarm client (or open the dashboard alarms page for that instance) and confirm:
- The provider badge shows **"Subtag monitoring (degraded)"** (amber `bg-warning`).
- `curl .../metrics` shows `mxgateway_alarms_provider_mode 2`.
- Active alarms appear with `degraded=true` and a synthetic (MD5-derived) GUID, with the
reference shape `Galaxy!<realArea>.<object>.<attr>` (e.g. `Galaxy!TestArea.TestMachine_001.TestAlarm001`).
3. Drive a transition: have the operator/IDE toggle a `TestMachine_NNN.TestAlarmNNN` true→false
(external MXAccess writes are ignored — see `project_rig_alarms_object_driven`); confirm a
synthesized Raise then Clear, and an `AckMsg` write via AcknowledgeByName returns 0.
4. (Optional, to exercise the switch counter) run in `Auto` and induce a primary fault if a safe
way exists; otherwise document that the counter is unit-tested only and `ForceSubtag` covers
the degraded surface. Note: `ForceSubtag` may not increment `provider_switches` (no runtime
switch) — that counter's live exercise remains the one gap; record it explicitly rather than
claiming coverage.
5. Tear down the temporary instance. Production service is untouched (stays `Auto`).
6. Record results in `project_alarm_subtag_fallback` memory (degraded badge + synthesized
subtag alarms now live-validated; switch-counter still unit-test-only if not exercised).
**Rollback:** none — temporary instance only; nothing in production changes.
---
## D4 — Prune stale deploy backups on windev
**Classification:** trivial (ops) · **Est:** ~10 min · **Where:** windev (10.100.0.48), no code
**Why deferred:** backups accumulate on every deploy; harmless but cluttering
`C:\publish\mxaccessgw\`.
**Current backups observed (2026-06-14):**
- `Server.bak-20260526T143341`, `Server.bak-20260529T090320`, `Server.bak529.20260604T122616`,
`Server.bak-theme030-20260605-053056`, `Server.bak-theme031-20260605-083410`,
`Server.bak-20260614-prefallback` (today's), `Worker.bak-20260614-prefallback` (today's).
- Inside `Server\`: `ZB.MOM.WW.MxGateway.Server.dll.bak-20260604-loginfix`,
`appsettings.json.bak-20260604-{glauth35,ldapkeys}`.
**Steps:**
1. **Keep** `Server.bak-20260614-prefallback` and `Worker.bak-20260614-prefallback` — the
immediate rollback for the current deploy. Keep until the new build has soaked (e.g. a few
days / one business cycle).
2. Delete the clearly-superseded older backups: the `Server.bak-2026052*`, `Server.bak529.*`,
and `Server.bak-theme03*` dirs. Confirm each is a Server dir (not something live) before
`rmdir /s /q`.
3. Optionally remove the in-`Server` `.bak-*` sidecar files (`*.dll.bak-*`,
`appsettings.json.bak-*`) — but FIRST confirm the live `appsettings.json` is correct (it is,
post-deploy) so the `.bak` ldap/glauth copies aren't the only record of a needed value.
4. Verify the service is unaffected: `Get-Service MxAccessGw` Running, `/health/ready` 200.
**Rollback:** none needed (deletions only; the current-deploy backup is retained). If unsure
about any single dir, skip it.
---
## D5 — Redeploy the wonder host to bring it to parity
**Classification:** high-risk (separate production-ish box, fiddly access, divergent build) · **Est:** ~1–2 h · **Where:** wonder-app-vd03.zmr.zimmer.com (10.220.157.247)
**Why deferred:** wonder still runs the pre-feature build — it lacks the entire alarm-fallback
feature and the cleanup. Bring it to `5976770` if parity is wanted. This box is materially
different from windev (read `project_wonder_deployment` fully before starting).
**Key differences from windev (do NOT reuse the windev recipe blindly):**
- Access: no usable `ssh host "cmd"` exec. Port 22 OpenSSH (PowerShell) was observed CLOSED
2026-05-26; the reliable path is **servecli on port 2222** (cmd.exe PTY + SFTP only) using the
base64-`-EncodedCommand` pattern in `project_wonder_deployment` / `~/Desktop/servecli/instructions.md`.
Re-test port 22 first; if up, prefer it.
- Build shape: wonder runs a **self-contained single-file win-x64** Server (~118 MB), renamed to
`MxGateway.Server.exe` for the NSSM path `E:\ApiInstall\MxGateway\Server\MxGateway.Server.exe`.
So publish must be `-r win-x64 --self-contained -p:PublishSingleFile=true` (NOT the
framework-dependent windev recipe), then rename the exe to `MxGateway.Server.exe`.
- Static assets quirk: ship BOTH `ZB.MOM.WW.MxGateway.Server.staticwebassets.endpoints.json` and
`MxGateway.Server.staticwebassets.endpoints.json` next to the exe (the ZB-named one is used).
- Config: wonder's `appsettings.json` carries HTTP-switched Kestrel (`http://0.0.0.0:5130`),
`Dashboard.GroupToRole` = `{ SCADA-Admins/Designers/Deploy-All → Admin }`,
`Dashboard.RequireHttpsCookie:false`, LDAP base `dc=scadalink,dc=local`. **Preserve wonder's
appsettings.json** exactly like windev (do not ship the repo default). Note: wonder's
GroupToRole values are `"Admin"` — VERIFY the current build's validator accepts `"Admin"` (the
windev crash showed it requires `'Administrator'`/`'Viewer'`). **If the validator rejects
`"Admin"`, wonder's appsettings GroupToRole values must be updated to `Administrator` BEFORE/with
the deploy or the new build will crash-loop.** This is the single biggest risk — resolve it first.
- Worker: wonder's x86 worker is the original; the alarm-fallback adds new subscribe fields. To
use subtag fallback on wonder, the **x86 worker must also be redeployed** (publish x86, rename
if needed). If only the Server is updated, confirm worker IPC protobuf compat (contracts changed
for alarms — a stale worker may not understand the extended SubscribeAlarms). Safer to deploy
both Server and Worker together.
**Steps (high level — expand against the runbook before executing):**
1. Confirm parity is actually wanted on wonder, and whether the dashboard (disabled there) and
the alarm monitor are even in scope for that box. If alarms/dashboard are off on wonder, this
may reduce to "Server binary parity only."
2. Resolve the **GroupToRole value compatibility** question (Admin vs Administrator) — inspect
the current build's `GatewayOptionsValidator` and decide whether to patch wonder's appsettings.
3. Build self-contained Server (`-r win-x64 --self-contained -p:PublishSingleFile=true`) and x86
Worker from `main` (`5976770`).
4. Transfer via SFTP (servecli) to a staging dir on `E:\ApiInstall\MxGateway\`.
5. Stop `MxAccessGw`; back up `Server` → `Server.bak.<ts>` and `Worker` → `Worker.bak.<ts>`;
swap in the new build; **restore wonder's `appsettings.json`** (+ patched GroupToRole if
needed); ensure both staticwebassets manifests are present; rename Server exe to
`MxGateway.Server.exe`.
6. Start `MxAccessGw` (no dependents). Verify stderr line count does not grow (no crash loop),
`curl -k https://...:5130/health/live` (or HTTP per current config) Healthy, gRPC 5120 up.
7. Verify the new metric is present if the metrics endpoint is exposed there.
**Rollback (documented in memory):** stop service,
`Move-Item Server Server.failed; Move-Item Server.bak.<ts> Server`, restore appsettings from the
backup, start.
---
## Suggested order
1. **D2 repro** (5 min — just log in and hit `/`): decides whether D2 is a real fix or a no-op.
2. **D1** (small code) + **D2 fix if needed** — bundle into one Server branch; they ship together
on the next Server redeploy.
3. **D4** (prune windev backups) — quick ops, independent.
4. **D3** (ForceSubtag validation) — exercises the degraded surface; do before/after the D1+D2
redeploy so you also confirm the new Server build is healthy.
5. **D5** (wonder) — largest and riskiest; do last, only if parity is wanted, and resolve the
GroupToRole-value compatibility question first.
## Execution note
This plan is intentionally NOT executed. When ready, execute on a branch off `main`
(`feat/deferred-followups` or per-item branches) — do not commit to `main` directly. D1/D2 need a
Server redeploy to take effect; D4/D5 are host operations; D3 touches nothing permanent.
---
## D2 — Resolution (2026-06-14)
Static source determination on build `5976770` (runtime repro out of scope). What was checked:
- **Razor `@page "/"` count:** exactly ONE — `Dashboard/Components/Pages/DashboardHome.razor`.
All other pages declare distinct routes (`/login`, `/sessions`, `/galaxy`, `/browse`,
`/apikeys`, `/workers`, `/events`, `/alarms`, `/settings`, `/sessions/{SessionId}`).
- **No root index.html:** `src/ZB.MOM.WW.MxGateway.Server/wwwroot/` contains only `css/` and
`lib/` subdirectories; no `index.html` anywhere under `wwwroot/`.
- **No `UseDefaultFiles` / no `MapFallback`:** neither appears anywhere in the Server project.
- **`MapStaticAssets` mapped once** (`GatewayApplication.cs:190`) and `UseStaticFiles()` once
(`:41`); the static-assets manifest serves fingerprinted CSS/JS assets, not a literal `/`.
- **No `MapGet("/")`:** the dashboard endpoint builder
(`Dashboard/DashboardEndpointRouteBuilderExtensions.cs`) maps only `/auth/login`, `/logout`,
`/denied`, `/hubs/{snapshot,alarms,events}`, `/hubs/token`, then `MapRazorComponents<App>()`.
None use pattern `/`.
- **`MapZbHealth` / `MapZbMetrics`** come from the external `ZB.MOM.WW.Health` shared library
(not in this repo) and map health/metrics paths, not `/`.
**Root cause of the 2026-06-05 log:** `code-reviews/Server/findings.md` (re-review at `42b0037`,
2026-05-24) records that commit `de7639a` **removed the legacy `MapGet("/", ...)` redirect that
was colliding with the Blazor `@page "/"` (a real 500)**. That legacy registration was the source
of the `AmbiguousMatchException`. It is gone on the current build, so the second `/` endpoint no
longer exists.
**Conclusion:** No duplicate `/` endpoint on build `5976770`. The AmbiguousMatchException is not
reproducible from source — it was a stale Development-only artifact from before `de7639a` reached
the deployed instance. **No source change made** (no-op).
**Residual:** A 100% confirmation still requires an authenticated runtime `GET /` against a
deployed instance (the only path that exercises routing past the unauthenticated 302-to-`/login`).
Recommend a spot-check of authenticated `GET /` after the next Server redeploy; if it returns 200
(not 500), this item can be fully closed.
@@ -0,0 +1,116 @@
# ForceSubtag Mode Fix Implementation Plan
> Fixes the two defects surfaced by the D3 live validation (2026-06-15): forced-subtag
> doesn't actually run subtag (#1), and the gateway never reflects a forced provider mode
> into the gauge/feed (#2).
**Goal:** Make `MxGateway:Alarms:Fallback:Mode=ForceSubtag` actually serve degraded subtag
alarms AND have the gateway advertise `provider_mode=2` / degraded badge.
**Evidence:** Live ForceSubtag run returned alarmmgr-sourced active alarms (May raise
timestamps, `degraded=false`) and `provider_mode` stuck at 1, despite ForceSubtag binding
(proven by invalid-value crash) and the deployed worker containing the ForcedMode routing
(`3f5e5fc` → `5976770`, worker dated 2026-06-14).
---
## Defect #2 (CONFIRMED code defect) — gateway never reflects forced mode
**Root cause:** `GatewayAlarmMonitor.RunMonitorAsync` hard-baselines `_providerMode=Alarmmgr`
and sets the gauge to 1, ignoring `_options.Fallback.Mode`. `_providerMode` only advances on a
worker `OnAlarmProviderModeChanged` event, which is raised ONLY by `FailoverAlarmConsumer`
(Auto mode). Forced-subtag builds `SubtagAlarmConsumer` directly → no event → gauge/feed stay
Alarmmgr forever.
### Task 2: Seed provider mode from configured forced mode (gateway, net10)
**Classification:** small · **Parallelizable with:** none (precedes the diagnostic build)
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Server/Alarms/GatewayAlarmMonitor.cs` (`RunMonitorAsync`, ~lines 160-172; add a permanent observability log in `SubscribeAlarmsAsync` ~line 257)
- Test: `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs`
**Change:** in `RunMonitorAsync`, compute `initialMode = MapForcedMode(_options.Fallback.Mode)`
mapped as `Subtag→Subtag`, `Alarmmgr→Alarmmgr`, `Unspecified→Alarmmgr` (Auto starts on the
alarmmgr primary). Set `_providerMode/_providerDegraded(=Subtag)/_providerReason/_providerSince`
and `_metrics.SetAlarmProviderMode(ModeToInt(initialMode))` — using the existing no-switch gauge
seam so `provider_switches` does NOT increment. Add a log in `SubscribeAlarmsAsync`:
`"Alarm subscribe forcedMode={ForcedMode} (configMode={ConfigMode}) watchList={Count}"`.
**Tests (fake-worker, no MXAccess):** with `Fallback:Mode=ForceSubtag` assert (a) first
`StreamAlarms` message is `ProviderStatus{Mode=Subtag,Degraded=true}`; (b) gauge==2; (c)
`provider_switches`==0. Add `ForceAlarmManager`→gauge 1 and `Auto`→gauge 1 baseline cases.
**Verify:** `dotnet build src/ZB.MOM.WW.MxGateway.Server`; `dotnet test src/ZB.MOM.WW.MxGateway.Tests --filter FullyQualifiedName~GatewayAlarmMonitorProviderMode`.
---
## Defect #1 (runtime bug — needs diagnosis) — ForceSubtag runs alarmmgr
The HEAD source path (gateway `MapForcedMode` → `SubscribeAlarmsCommand.ForcedMode=Subtag` →
IPC → worker `AlarmCommandHandler.BuildConsumer` → `SubtagAlarmConsumer`) is statically correct,
yet the runtime ran alarmmgr. A runtime diagnostic must locate where `forcedMode` becomes
`Unspecified`.
### Task 1: Add worker BuildConsumer observability log (worker, net48 x86)
**Classification:** small (net48 — no init-only props) · **Parallelizable with:** Task 2
**Files:**
- Modify: `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/AlarmCommandHandler.cs` (`BuildConsumer`, ~line 206)
**Change:** log at entry of `BuildConsumer`:
`"BuildConsumer forcedMode={ForcedMode} watchList={Count}"`. This is permanent observability.
**Verify (windev only):** `dotnet build src/ZB.MOM.WW.MxGateway.Worker -p:Platform=x86`.
### Task 3: Diagnostic run — capture the real forcedMode
**Classification:** standard (ops/diagnostic on windev) · **Parallelizable with:** none
**Steps:** rebuild worker (x86) + server from the branch, stand up the D3-style temp ForceSubtag
instance (alt ports 5122/5132, isolated `.D3` worker name, WMI-detached, Http2, Development env
for file logs), trigger the always-on monitor, and read the two new log lines:
- Gateway `Alarm subscribe forcedMode=...` — what the gateway SENDS.
- Worker `BuildConsumer forcedMode=...` — what the worker RECEIVES.
Decision matrix:
- Gateway logs `Subtag`, worker logs `Unspecified` → IPC/serialization drops the enum → fix the
send/translation path (likely a worker-proto vs gateway-proto `SubscribeAlarmsCommand` mismatch
in the named-pipe envelope).
- Worker logs `Subtag` but alarmmgr data appears → bug in `BuildStandby`/`SubtagAlarmConsumer`/
`AlarmDispatcher` snapshot path.
- Gateway logs `Unspecified` despite config ForceSubtag → gateway config/options read.
### Task 4: Fix #1 per Task 3 diagnosis
**Classification:** standard/high-risk (depends on where the defect is) · the exact change is
determined by Task 3. Add a regression test at the identified layer (worker unit test for
BuildConsumer→SubtagAlarmConsumer, or a contract/IPC round-trip test if the enum is dropped).
---
## Final: build, test, redeploy, re-validate
- Build gateway (macOS) + worker (windev x86); run gateway + worker test suites.
- Redeploy windev Server (and Worker if Task 4 changed it) per `project_deploy_mechanics`,
preserving appsettings.
- Re-validate live with the temp ForceSubtag instance: active alarms `degraded=true` /
`source_provider=SUBTAG` with recent timestamps, `provider_mode 2`. Tear down temp instance;
production untouched.
## Execution note
Branch off `main`. #2 is the clean confirmed fix; #1 is diagnose-then-fix. Net48 worker
constraints apply (no init-only props/positional records). Do NOT increment `provider_switches`
on an initial forced-mode set.
---
## Resolution (2026-06-15)
**#1 was NOT a bug — it was a grpcurl proto-mismatch artifact.** End-to-end instrumentation
proved: the gateway sends `forcedMode=Subtag`, the worker's `BuildConsumer` builds the
`SubtagAlarmConsumer`, and the worker `QueryActive` + gateway `ApplyReconcile`/`StreamAlarms`
all carry `degraded=true` / `source_provider=SUBTAG`. The original "degraded=false" observation
came from running grpcurl against the host checkout's proto (branch `feat/lazy-browse-children`),
which predates the feature and lacks the `degraded`/`source_provider` fields — grpcurl silently
dropped them. With the matching proto, every active alarm shows `degraded=true`. No code change.
Tasks 1/3/4 (worker diagnostics + #1 fix) were dropped; the temporary diagnostics were reverted.
**#2 was a real defect — fixed.** Gateway now seeds `_providerMode`/gauge/feed from the configured
forced mode (`fix: gateway reflects configured forced provider mode`). Verified live:
`provider_mode=2`, first `ProviderStatus` = `Mode=Subtag, degraded=true,
reason="Forced subtag mode (configuration)"`. Auto mode unchanged → windev production (Auto)
unaffected; no redeploy required. Gateway tests: 163 passed.
+61
View File
@@ -143,6 +143,67 @@ session if the worker faults. Gated by `MxGateway:Alarms:Enabled` — see
`docs/DesignDecisions.md` for why this reverses the v1 single-subscriber rule
for the alarm subsystem.
### Alarm providers and failover
The alarm feed has two providers, both implemented worker-side:
- **Alarm manager (primary):** `WnWrapAlarmConsumer` polls
`wwAlarmConsumerClass.GetXmlCurrentAlarms2` on the worker STA. This is the
authoritative native source.
- **Subtag monitoring (standby):** `SubtagAlarmConsumer` advises each alarm
attribute's subtags (`.active`, `.acked`, optionally `.priority`) via the
existing `AddItem`/`Advise` pipeline through `LmxSubtagAlarmSource` and
synthesizes alarm transitions with `SubtagAlarmStateMachine`. This is a
non-parity, lower-fidelity source — synthetic GUIDs, no native raise
timestamps, narrower fields.
`FailoverAlarmConsumer` wraps both and owns the state machine:
- **Auto-failover:** after `ConsecutiveFailureThreshold` (default 3)
consecutive wnwrap COM failures — `Subscribe` or `PollOnce` throws or
returns a failure HRESULT — it activates the standby. The standby is armed
(subscribed and advising) from the start so its state is warm at the moment
of switch.
- **Auto-failback:** while degraded, every `FailbackProbeIntervalSeconds`
(default 30) it re-probes the still-subscribed primary. After
`FailbackStableProbes` (default 3) consecutive clean polls it switches back
to the alarm manager.
- **On every switch:** the consumer snapshots the now-active provider and
emits `OnAlarmProviderModeChangedEvent` so the gateway can reconcile its
cache without a raise/clear storm.
Synthesis is worker-side. This preserves the parity rule — the gateway
forwards only events the worker emits and never synthesizes transitions
itself. The synthesis rules are documented in
`docs/AlarmClientDiscovery.md`.
**Acknowledge in subtag mode:** the ack-by-name path writes the operator
comment to the alarm attribute's ack-comment subtag. The write performs the
ack. If the attribute has no writable ack-comment subtag configured, the RPC
returns `FailedPrecondition`. In alarm-manager mode, `AlarmAckByName` is
used as before.
**Degraded state visibility:** every subtag-mode transition carries
`degraded = true` and `source_provider = ALARM_PROVIDER_MODE_SUBTAG` on the
`OnAlarmTransitionEvent` and `ActiveAlarmSnapshot` proto fields. The
`AlarmFeedMessage` feed emits an `AlarmProviderStatus` message (the
`provider_status` oneof case) on stream open and on every switch. The
dashboard shows a Bootstrap badge: green ("Alarm Manager") when healthy, amber
("Subtag monitoring (degraded)") on an unexpected failover, and cyan ("Subtag
monitoring (forced)") when subtag mode is the configured `Fallback:Mode=ForceSubtag`
— the latter distinguished by the well-known `AlarmProviderStatus.reason`
(`AlarmProviderReasons.ForcedSubtag`) so an intentional configuration is not shown
as a fault. Metrics: `mxgateway.alarms.provider_mode` gauge (1 = alarmmgr,
2 = subtag) and `mxgateway.alarms.provider_switches` counter.
Forced modes are available via `MxGateway:Alarms:Fallback:Mode`:
`ForceAlarmManager` disables failover; `ForceSubtag` forces the standby
on from startup; `Auto` (default) enables failover and failback. Watch-list
discovery for the subtag provider uses Galaxy Repository SQL with config
overrides. See `docs/GatewayConfiguration.md` for the full `Fallback` option
block and `docs/AlarmClientDiscovery.md` for synthesis rules and fidelity
limitations.
Dashboard authentication is LDAP-backed (distinct from the API-key model on
the gRPC API). `/login` accepts username and password in a form body, binds
against `MxGateway:Ldap`, maps the user's LDAP groups to `Admin` or `Viewer`
+8 -4
View File
@@ -39,10 +39,14 @@ if ($Version -notmatch '^v\d+\.\d+\.\d+(-[A-Za-z0-9.-]+)?$') {
$tag = "clients/go/$Version"
Write-Host "Creating Go-module tag: $tag" -ForegroundColor Cyan
# Verify we're on a clean checkout — refuse to tag with uncommitted changes.
$status = (git status --porcelain) -join "`n"
if ($status -and -not ($status -match '^\?\?')) {
throw "Working tree has tracked changes. Commit or stash before tagging."
# Verify we're on a clean checkout — refuse to tag with uncommitted tracked
# changes. Test each porcelain line individually: any entry that is not an
# untracked file (`??`) is a tracked change, regardless of how the entries
# happen to sort. Joining and anchoring `^\?\?` against the whole blob would
# only inspect the first line and miss tracked changes behind an untracked one.
$dirty = (git status --porcelain) | Where-Object { $_ -and ($_ -notmatch '^\?\?') }
if ($dirty) {
throw "Working tree has tracked changes. Commit or stash before tagging.`n$($dirty -join "`n")"
}
# Verify the tag doesn't already exist.
File diff suppressed because it is too large Load Diff
@@ -315,6 +315,21 @@ message SubscribeBulkCommand {
repeated string tag_addresses = 2;
}
// Provider selection / current provider for the alarm feed. The zero value
// has two distinct meanings depending on the use site:
// - As SubscribeAlarmsCommand.forced_mode, UNSPECIFIED means auto: alarmmgr
// primary with subtag fallback.
// - As a provenance value (OnAlarmTransitionEvent.source_provider,
// ActiveAlarmSnapshot.source_provider, OnAlarmProviderModeChangedEvent.mode,
// AlarmProviderStatus.mode), the worker always emits ALARMMGR or SUBTAG and
// never UNSPECIFIED; clients should treat an UNSPECIFIED provenance value as
// "unknown / not yet determined".
enum AlarmProviderMode {
ALARM_PROVIDER_MODE_UNSPECIFIED = 0; // auto (as forced_mode) / unknown (as provenance)
ALARM_PROVIDER_MODE_ALARMMGR = 1; // native alarm-manager provider (primary)
ALARM_PROVIDER_MODE_SUBTAG = 2; // subtag-monitoring fallback (degraded fidelity)
}
// Subscribe the worker's alarm consumer to an AVEVA alarm provider.
// Subscription expression follows the canonical
// `\\<machine>\Galaxy!<area>` format (literal "Galaxy" provider). The
@@ -323,6 +338,12 @@ message SubscribeBulkCommand {
// SubscribeAlarms to reconfigure).
message SubscribeAlarmsCommand {
string subscription_expression = 1;
// UNSPECIFIED = auto-failover/failback. ALARMMGR/SUBTAG force one provider.
AlarmProviderMode forced_mode = 2;
// Subtag watch-list resolved by the gateway (GR SQL + config). Empty in pure
// alarmmgr mode; in subtag mode it bounds what the consumer can observe.
repeated AlarmSubtagTarget watch_list = 3;
// Failover/failback thresholds and probe cadence; see AlarmFailoverConfig.
AlarmFailoverConfig failover = 4;
}
// Tear down the worker's alarm consumer. No-op if no subscription is
@@ -330,6 +351,23 @@ message SubscribeAlarmsCommand {
message UnsubscribeAlarmsCommand {
}
// One alarm attribute the subtag fallback consumer advises. Addresses are full
// MXAccess item references the worker passes straight to AddItem.
message AlarmSubtagTarget {
string alarm_full_reference = 1; // e.g. "Galaxy!Area.Tank01.Level.HiHi"
string source_object_reference = 2; // e.g. "Tank01"
string active_subtag = 3; // item address of the in-alarm boolean
string acked_subtag = 4; // item address of the acknowledged boolean
string ack_comment_subtag = 5; // writable ack-comment attribute (ack write target)
string priority_subtag = 6; // optional severity source; empty if absent
}
// Tuning knobs for switching between the alarmmgr primary and the subtag
// fallback. Carried on SubscribeAlarmsCommand.failover; every value is
// expected to be >= 1.
message AlarmFailoverConfig {
int32 consecutive_failure_threshold = 1; // wnwrap COM failures before switching (>=1)
int32 failback_probe_interval_seconds = 2; // probe cadence while degraded (>=1)
int32 failback_stable_probes = 3; // clean probes before switching back (>=1)
}
// Acknowledge a single alarm by its GUID. Operator identity fields are
// recorded atomically with the ack transition in the alarm-history log.
// The reply's hresult / native_status surfaces AVEVA's
@@ -684,6 +722,7 @@ message MxEvent {
OperationCompleteEvent operation_complete = 22;
OnBufferedDataChangeEvent on_buffered_data_change = 23;
OnAlarmTransitionEvent on_alarm_transition = 24;
OnAlarmProviderModeChangedEvent on_alarm_provider_mode_changed = 25;
}
}
@@ -694,6 +733,7 @@ enum MxEventFamily {
MX_EVENT_FAMILY_OPERATION_COMPLETE = 3;
MX_EVENT_FAMILY_ON_BUFFERED_DATA_CHANGE = 4;
MX_EVENT_FAMILY_ON_ALARM_TRANSITION = 5;
MX_EVENT_FAMILY_ON_ALARM_PROVIDER_MODE_CHANGED = 6;
}
message OnDataChangeEvent {
@@ -768,6 +808,20 @@ message OnAlarmTransitionEvent {
// Limit/threshold value that triggered the transition for limit alarms.
// Optional; populated for AnalogLimitAlarm-family transitions.
MxValue limit_value = 13;
// True when this transition came from the subtag-monitoring fallback rather
// than the native alarmmgr provider: it is synthesized from data changes and
// has reduced fidelity (synthetic GUID, no native raise time).
bool degraded = 14;
// Which provider produced this transition.
AlarmProviderMode source_provider = 15;
}
// Emitted when the alarm feed switches provider (failover to the subtag
// fallback, or failback to alarmmgr) so the receiver can update its provider
// status and reconcile its active-alarm cache against the new provider.
message OnAlarmProviderModeChangedEvent {
AlarmProviderMode mode = 1; // provider that is active after the switch
string reason = 2; // human-readable switch reason
int32 hresult = 3; // COM HRESULT that triggered failover; 0 on failback
google.protobuf.Timestamp at = 4; // presumably the time of the switch -- confirm against the emitter
}
enum AlarmTransitionKind {
@@ -800,6 +854,15 @@ message ActiveAlarmSnapshot {
string operator_comment = 11;
MxValue current_value = 12;
MxValue limit_value = 13;
// True when this snapshot came from the subtag-monitoring fallback rather
// than the native alarmmgr provider: it is synthesized from data changes and
// has reduced fidelity (synthetic GUID, no native raise time). Mirrors
// OnAlarmTransitionEvent.degraded.
bool degraded = 14;
// Which provider produced this snapshot. Mirrors
// OnAlarmTransitionEvent.source_provider; always ALARMMGR or SUBTAG on the
// wire (never UNSPECIFIED).
AlarmProviderMode source_provider = 15;
}
enum AlarmConditionState {
@@ -866,9 +929,19 @@ message AlarmFeedMessage {
bool snapshot_complete = 2;
// A live alarm state change (raise / acknowledge / clear).
OnAlarmTransitionEvent transition = 3;
// Provider-mode status. Emitted once on stream open and again on every
// failover/failback so late joiners learn the current mode immediately.
AlarmProviderStatus provider_status = 4;
}
}
// Current provider status of the alarm feed. Sent as the
// AlarmFeedMessage.provider_status oneof case on stream open and again on
// every failover/failback.
message AlarmProviderStatus {
AlarmProviderMode mode = 1; // provider currently feeding the stream
bool degraded = 2; // true whenever mode == SUBTAG
string reason = 3; // human-readable switch reason
google.protobuf.Timestamp since = 4; // when the current mode took effect
}
message MxStatusProxy {
// Mirrors the `success` member of the MXAccess MXSTATUS_PROXY struct
// (a 16-bit signed value in the COM struct, widened to int32 on the
@@ -7,7 +7,7 @@
<PropertyGroup>
<IsPackable>true</IsPackable>
<PackageId>ZB.MOM.WW.MxGateway.Contracts</PackageId>
<Version>0.1.0</Version>
<Version>0.1.1</Version>
<Authors>Joseph Doherty</Authors>
<Company>ZB MOM WW</Company>
<Copyright>Copyright (c) ZB MOM WW. All rights reserved.</Copyright>
@@ -1,4 +1,5 @@
using System.Security.Claims;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.Auth.Abstractions.Ldap;
@@ -137,26 +138,35 @@ public sealed class DashboardLdapLiveTests
}
/// <summary>
/// Builds the shared library <see cref="LibraryLdapOptions"/> from the gateway's
/// default LDAP settings so the live tests exercise the same seeded directory the
/// gateway connects to (localhost:3893, plaintext, with AllowInsecure for dev).
/// Builds the shared library <see cref="LibraryLdapOptions"/> by binding the real
/// <c>MxGateway:Ldap</c> configuration section the same way production does in
/// <c>AddZbLdapAuth(configuration, "MxGateway:Ldap")</c>, rather than hand-copying the
/// gateway shadow <c>LdapOptions</c> defaults field by field (IntegrationTests-028).
/// Binding the section directly onto the shared type means the live tests exercise the
/// exact option-binding path production uses, pick up every shared field (including
/// <see cref="LibraryLdapOptions.ConnectionTimeoutMs"/>, which governs the
/// unreachable-server test's timing) at whatever value the operator configured, and
/// cannot silently drop a field added to the shared type. The gateway's
/// <c>appsettings.json</c> seeds the dev directory connection (localhost:3893,
/// plaintext, AllowInsecure).
/// </summary>
private static LibraryLdapOptions LibraryOptions()
{
ZB.MOM.WW.MxGateway.Server.Configuration.LdapOptions gateway = new();
return new LibraryLdapOptions
{
Enabled = gateway.Enabled,
Server = gateway.Server,
Port = gateway.Port,
Transport = gateway.Transport,
AllowInsecure = gateway.AllowInsecure,
SearchBase = gateway.SearchBase,
ServiceAccountDn = gateway.ServiceAccountDn,
ServiceAccountPassword = gateway.ServiceAccountPassword,
UserNameAttribute = gateway.UserNameAttribute,
DisplayNameAttribute = gateway.DisplayNameAttribute,
GroupAttribute = gateway.GroupAttribute,
};
string repositoryRoot = IntegrationTestEnvironment.ResolveRepositoryRoot(AppContext.BaseDirectory);
string appSettingsPath = Path.Combine(
repositoryRoot,
"src",
"ZB.MOM.WW.MxGateway.Server",
"appsettings.json");
IConfiguration configuration = new ConfigurationBuilder()
.AddJsonFile(appSettingsPath, optional: false)
.Build();
// Same section production binds in AddZbLdapAuth(configuration, "MxGateway:Ldap").
// Get<T> returns null only when the section is absent; appsettings.json always
// carries it, so fall back to shared defaults defensively rather than throw.
return configuration.GetSection("MxGateway:Ldap").Get<LibraryLdapOptions>()
?? new LibraryLdapOptions();
}
}
@@ -0,0 +1,35 @@
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Alarms;
using ZB.MOM.WW.MxGateway.Server.Configuration;
namespace ZB.MOM.WW.MxGateway.IntegrationTests.TestSupport;
/// <summary>
/// No-op <see cref="IAlarmWatchListResolver"/> for integration tests that
/// construct a <see cref="GatewayAlarmMonitor"/> only to satisfy the
/// <see cref="ZB.MOM.WW.MxGateway.Server.Grpc.MxAccessGatewayService"/>
/// constructor. The live MXAccess smoke tests never exercise the alarm
/// monitor, so the resolver always yields an empty watch-list rather than
/// touching the Galaxy Repository the production
/// <see cref="AlarmWatchListResolver"/> requires. Singleton because the type
/// holds no state — every consumer can share <see cref="Instance"/>.
/// </summary>
public sealed class EmptyAlarmWatchListResolver : IAlarmWatchListResolver
{
/// <summary>Shared no-op instance.</summary>
public static readonly EmptyAlarmWatchListResolver Instance = new();
private static readonly IReadOnlyList<AlarmSubtagTarget> Empty = [];
private EmptyAlarmWatchListResolver()
{
}
/// <inheritdoc />
public Task<IReadOnlyList<AlarmSubtagTarget>> ResolveAsync(
AlarmsOptions options,
CancellationToken cancellationToken = default)
{
return Task.FromResult(Empty);
}
}
@@ -1097,6 +1097,8 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output)
_loggerFactory.CreateLogger<MxAccessGatewayService>(),
new ZB.MOM.WW.MxGateway.Server.Alarms.GatewayAlarmMonitor(
sessionManager,
EmptyAlarmWatchListResolver.Instance,
_metrics,
options,
_loggerFactory.CreateLogger<ZB.MOM.WW.MxGateway.Server.Alarms.GatewayAlarmMonitor>()));
}
@@ -12,6 +12,22 @@
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.4" />
</ItemGroup>
<!--
DashboardLdapLiveTests directly references the shared LDAP provider's public
types (LdapAuthService / LdapOptions), so declare direct dependencies rather
than rely on the transitive flow through the Server ProjectReference
(IntegrationTests-027). Versions match the Server's pinned 0.1.2. The
configuration packages back the production-fidelity MxGateway:Ldap binding the
live-test fixture uses in place of the old field-by-field hand-copy
(IntegrationTests-028).
-->
<ItemGroup>
<PackageReference Include="ZB.MOM.WW.Auth.Abstractions" Version="0.1.2" />
<PackageReference Include="ZB.MOM.WW.Auth.Ldap" Version="0.1.2" />
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="10.0.7" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="10.0.7" />
</ItemGroup>
<ItemGroup>
<Using Include="Xunit" />
</ItemGroup>
@@ -0,0 +1,18 @@
namespace ZB.MOM.WW.MxGateway.Server.Alarms;
/// <summary>
/// Well-known <c>reason</c> strings carried on the alarm feed's
/// <c>AlarmProviderStatus</c> message. Shared between the producer
/// (<see cref="GatewayAlarmMonitor" />) and consumers (e.g. the dashboard
/// provider badge) so the two cannot drift on a magic string. Consumers are
/// expected to compare against these constants rather than re-typing the text.
/// </summary>
public static class AlarmProviderReasons
{
/// <summary>
/// Reason set when the monitor starts in subtag mode because
/// <c>MxGateway:Alarms:Fallback:Mode</c> is <c>ForceSubtag</c> — a
/// deliberate configuration, not a runtime failover. Lets the dashboard
/// distinguish a forced subtag mode from an unexpected degraded failover.
/// The text is emitted on the wire as-is, so changing it changes what
/// feed clients observe.
/// </summary>
public const string ForcedSubtag = "Forced subtag mode (configuration)";
}
@@ -0,0 +1,180 @@
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Galaxy;
namespace ZB.MOM.WW.MxGateway.Server.Alarms;
/// <summary>
/// Default <see cref="IAlarmWatchListResolver"/>. Merges Galaxy Repository
/// alarm-attribute discovery with the configured include/exclude overrides
/// and composes the per-attribute subtag item addresses from the configured
/// subtag names.
/// </summary>
// NOTE: The exact subtag names and the canonical AlarmFullReference shape
// ("Galaxy!{area}.{reference}") are validated against a live Galaxy in the
// Task 17 live smoke test. The config Subtags block exists precisely so these
// names are not hard-coded here. The {area} is the alarm object's REAL Galaxy
// area discovered via gobject.area_gobject_id (the alarm group the native
// alarmmgr emits), giving exact reference parity with wnwrap. The configured
// Discovery.Area/DefaultArea is only the fallback for explicit IncludeAttributes
// entries, which carry no discovered area.
public sealed class AlarmWatchListResolver : IAlarmWatchListResolver
{
private const string ProviderLiteral = "Galaxy";
private const string DefaultActiveSubtag = "InAlarm";
private const string DefaultAckedSubtag = "Acked";
private readonly IGalaxyRepository _repository;
private readonly ILogger<AlarmWatchListResolver> _logger;
/// <summary>Initializes the watch-list resolver.</summary>
/// <param name="repository">Galaxy Repository used for alarm-attribute discovery.</param>
/// <param name="logger">Diagnostic logger.</param>
public AlarmWatchListResolver(
IGalaxyRepository repository,
ILogger<AlarmWatchListResolver> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
public async Task<IReadOnlyList<AlarmSubtagTarget>> ResolveAsync(
AlarmsOptions options,
CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(options);
AlarmDiscoveryOptions discovery = options.Fallback.Discovery;
// Config fallback area used only for explicit IncludeAttributes entries (which
// carry no discovered area): discovery area, else the default area (may be empty).
string configFallbackArea = string.IsNullOrEmpty(discovery.Area) ? options.DefaultArea : discovery.Area;
// 1. Build the ordered, de-duplicated attribute reference set.
// Each entry carries the reference, the source-object reference, and the
// per-entry area used to compose the canonical reference. GR rows contribute
// the object's real Galaxy area; config includes contribute the config
// fallback area (Discovery.Area else DefaultArea).
List<(string Reference, string SourceObject, string Area)> ordered = [];
HashSet<string> seen = new(StringComparer.OrdinalIgnoreCase);
if (discovery.UseGalaxyRepository)
{
List<GalaxyAlarmAttributeRow> rows;
try
{
rows = await _repository.GetAlarmAttributesAsync(cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (ex is not OperationCanceledException)
{
// Discovery being unavailable must not crash the resolver: log and
// continue with an empty discovery set. The caller decides what to
// do with the (possibly config-only) result. Cancellation is the one
// exception — an OperationCanceledException propagates per the
// IAlarmWatchListResolver contract so the caller unwinds promptly.
_logger.LogWarning(
ex,
"Galaxy Repository alarm-attribute discovery failed; continuing with configuration-only watch-list.");
rows = [];
}
foreach (GalaxyAlarmAttributeRow row in rows)
{
if (string.IsNullOrEmpty(row.FullTagReference) || !seen.Add(row.FullTagReference))
{
continue;
}
ordered.Add((row.FullTagReference, row.SourceObjectReference, row.Area));
}
}
foreach (string include in discovery.IncludeAttributes)
{
if (string.IsNullOrEmpty(include) || !seen.Add(include))
{
continue;
}
ordered.Add((include, DeriveSourceObject(include), configFallbackArea));
}
// Remove excluded references (case-insensitive), but only when GR discovery
// is active. ExcludeAttributes is documented as "Ignored when
// UseGalaxyRepository is false" (AlarmDiscoveryOptions.ExcludeAttributes).
// Whitespace-only entries are skipped, consistent with the include guard above.
if (discovery.UseGalaxyRepository)
{
HashSet<string> excluded = new(
discovery.ExcludeAttributes.Where(e => !string.IsNullOrWhiteSpace(e)),
StringComparer.OrdinalIgnoreCase);
if (excluded.Count > 0)
{
ordered.RemoveAll(e => excluded.Contains(e.Reference));
}
}
// 2. Resolve subtag names with safe fallbacks.
string active = string.IsNullOrEmpty(options.Fallback.Subtags.Active)
? DefaultActiveSubtag
: options.Fallback.Subtags.Active;
string acked = string.IsNullOrEmpty(options.Fallback.Subtags.Acked)
? DefaultAckedSubtag
: options.Fallback.Subtags.Acked;
string priority = options.Fallback.Subtags.Priority;
string ackComment = options.Fallback.Subtags.AckComment;
// 3. Compose one target per reference, using the PER-ENTRY area: the GR row's
// real Galaxy area (matching the alarmmgr group), or the config fallback for
// explicit includes.
List<AlarmSubtagTarget> targets = new(ordered.Count);
foreach ((string reference, string sourceObject, string area) in ordered)
{
targets.Add(new AlarmSubtagTarget
{
AlarmFullReference = ComposeFullReference(area, reference),
SourceObjectReference = sourceObject,
ActiveSubtag = $"{reference}.{active}",
AckedSubtag = $"{reference}.{acked}",
PrioritySubtag = string.IsNullOrEmpty(priority) ? string.Empty : $"{reference}.{priority}",
AckCommentSubtag = string.IsNullOrEmpty(ackComment) ? string.Empty : $"{reference}.{ackComment}",
});
}
// 4. Report the resolved count; warn when subtag mode was expected to cover
// something (GR enabled, or explicit includes were configured) but resolved
// to nothing. Only emit the Debug line when there is at least one target,
// to avoid a confusing "0 target(s)" noise line.
if (targets.Count == 0 && (discovery.UseGalaxyRepository || discovery.IncludeAttributes.Length > 0))
{
_logger.LogWarning(
"Alarm subtag watch-list resolved to zero targets; subtag-polling fallback will cover no alarms.");
}
else if (targets.Count > 0)
{
_logger.LogDebug("Resolved alarm subtag watch-list with {TargetCount} target(s).", targets.Count);
}
return targets;
}
/// <summary>
/// Derives the source-object reference for a configuration entry: the
/// substring before the first '.', or the whole string when there is no dot.
/// </summary>
private static string DeriveSourceObject(string reference)
{
int dot = reference.IndexOf('.', StringComparison.Ordinal);
return dot < 0 ? reference : reference[..dot];
}
/// <summary>
/// Composes the canonical alarm full reference: <c>Galaxy!{area}.{reference}</c>
/// when an area is set, otherwise <c>Galaxy!{reference}</c>.
/// </summary>
private static string ComposeFullReference(string area, string reference) =>
string.IsNullOrEmpty(area)
? $"{ProviderLiteral}!{reference}"
: $"{ProviderLiteral}!{area}.{reference}";
}
@@ -13,6 +13,7 @@ public static class AlarmsServiceCollectionExtensions
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddGatewayAlarms(this IServiceCollection services)
{
services.AddSingleton<IAlarmWatchListResolver, AlarmWatchListResolver>();
services.AddSingleton<GatewayAlarmMonitor>();
services.AddSingleton<IGatewayAlarmService>(provider => provider.GetRequiredService<GatewayAlarmMonitor>());
services.AddHostedService(provider => provider.GetRequiredService<GatewayAlarmMonitor>());
@@ -1,7 +1,9 @@
using System.Threading.Channels;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using ZB.MOM.WW.MxGateway.Server.Sessions;
namespace ZB.MOM.WW.MxGateway.Server.Alarms;
@@ -23,6 +25,8 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
private static readonly TimeSpan StartupGrace = TimeSpan.FromSeconds(2);
private readonly ISessionManager _sessionManager;
private readonly IAlarmWatchListResolver _watchListResolver;
private readonly GatewayMetrics _metrics;
private readonly AlarmsOptions _options;
private readonly ILogger<GatewayAlarmMonitor> _logger;
@@ -30,20 +34,34 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
private readonly Dictionary<string, ActiveAlarmSnapshot> _alarms = new(StringComparer.Ordinal);
private readonly List<Subscriber> _subscribers = [];
// Current provider status (mode + degraded + reason + since), guarded by _sync.
// Initialized to the alarm-manager, not-degraded baseline so a late joiner sees
// a sensible status even before any OnAlarmProviderModeChanged event arrives.
private AlarmProviderMode _providerMode = AlarmProviderMode.Alarmmgr;
private bool _providerDegraded;
private string _providerReason = string.Empty;
private DateTimeOffset _providerSince = DateTimeOffset.UtcNow;
private volatile GatewayAlarmMonitorState _state = GatewayAlarmMonitorState.Disabled;
private volatile string? _lastError;
private GatewaySession? _session;
/// <summary>Creates the gateway alarm monitor from its collaborators.</summary>
/// <param name="sessionManager">Session manager used for the monitor's gateway session.</param>
/// <param name="watchListResolver">Resolves the watch-list sent to the subtag fallback.</param>
/// <param name="metrics">Sink for the alarm provider gauge and switch counter.</param>
/// <param name="options">Gateway options; only the <c>Alarms</c> section is retained.</param>
/// <param name="logger">Diagnostic logger.</param>
/// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
public GatewayAlarmMonitor(
    ISessionManager sessionManager,
    IAlarmWatchListResolver watchListResolver,
    GatewayMetrics metrics,
    IOptions<GatewayOptions> options,
    ILogger<GatewayAlarmMonitor> logger)
{
    // Arguments are validated in declaration order.
    ArgumentNullException.ThrowIfNull(sessionManager);
    ArgumentNullException.ThrowIfNull(watchListResolver);
    ArgumentNullException.ThrowIfNull(metrics);
    ArgumentNullException.ThrowIfNull(options);

    _sessionManager = sessionManager;
    _watchListResolver = watchListResolver;
    _metrics = metrics;

    // options.Value is read before the logger is checked.
    _options = options.Value.Alarms;

    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;
}
@@ -139,6 +157,49 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
private async Task RunMonitorAsync(string subscription, CancellationToken stoppingToken)
{
_state = GatewayAlarmMonitorState.Starting;
// Derive the lifecycle baseline from the configured forced mode so a
// ForceSubtag / ForceAlarmManager start advertises the correct mode even
// though no OnAlarmProviderModeChanged event is raised in those modes
// (only Auto/failover produces that event). ForceSubtag starts degraded.
AlarmProviderMode initialMode;
bool initialDegraded;
string initialReason;
switch (MapForcedMode(_options.Fallback.Mode))
{
case AlarmProviderMode.Subtag:
initialMode = AlarmProviderMode.Subtag;
initialDegraded = true;
initialReason = AlarmProviderReasons.ForcedSubtag;
break;
case AlarmProviderMode.Alarmmgr:
initialMode = AlarmProviderMode.Alarmmgr;
initialDegraded = false;
initialReason = string.Empty;
break;
default:
// Unspecified (Auto): the failover consumer starts on the
// alarm-manager primary and only degrades to subtag on failure.
initialMode = AlarmProviderMode.Alarmmgr;
initialDegraded = false;
initialReason = string.Empty;
break;
}
lock (_sync)
{
// Re-baseline the provider status for this lifecycle so a restarted
// monitor advertises the configured mode until told otherwise.
_providerMode = initialMode;
_providerDegraded = initialDegraded;
_providerReason = initialReason;
_providerSince = DateTimeOffset.UtcNow;
}
// Align the observable gauge with the lifecycle baseline without recording
// a switch — the gauge was 0 (unknown) from construction until now.
_metrics.SetAlarmProviderMode(ModeToInt(initialMode));
GatewaySession session = await _sessionManager.OpenSessionAsync(
new SessionOpenRequest(BackendName, MonitorClientName, Guid.NewGuid().ToString("N"), CommandTimeout: null),
MonitorClientName,
@@ -173,6 +234,15 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
{
ApplyTransition(mxEvent.OnAlarmTransition);
}
else if (mxEvent is { BodyCase: MxEvent.BodyOneofCase.OnAlarmProviderModeChanged }
&& mxEvent.OnAlarmProviderModeChanged is not null)
{
await ApplyProviderModeChangeAsync(
session.SessionId,
mxEvent.OnAlarmProviderModeChanged,
linked.Token)
.ConfigureAwait(false);
}
}
}
finally
@@ -209,6 +279,33 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
private async Task SubscribeAlarmsAsync(string sessionId, string subscription, CancellationToken cancellationToken)
{
IReadOnlyList<AlarmSubtagTarget> watchList = await _watchListResolver
.ResolveAsync(_options, cancellationToken)
.ConfigureAwait(false);
AlarmProviderMode forcedMode = MapForcedMode(_options.Fallback.Mode);
_logger.LogInformation(
"Alarm subscribe: forcedMode={ForcedMode} configMode={ConfigMode} watchList={WatchListCount}.",
forcedMode, _options.Fallback.Mode, watchList.Count);
// When the forced mode is Unspecified (the "Auto" case) and the resolved
// watch-list is empty — the common alarmmgr-only deployment — the command
// is identical-in-effect to the historical SubscribeAlarms (wnwrap only):
// the worker builds the wnwrap consumer and no subtag watch-list.
SubscribeAlarmsCommand command = new()
{
SubscriptionExpression = subscription,
ForcedMode = forcedMode,
Failover = new AlarmFailoverConfig
{
ConsecutiveFailureThreshold = _options.Fallback.ConsecutiveFailureThreshold,
FailbackProbeIntervalSeconds = _options.Fallback.FailbackProbeIntervalSeconds,
FailbackStableProbes = _options.Fallback.FailbackStableProbes,
},
};
command.WatchList.AddRange(watchList);
WorkerCommandReply reply = await _sessionManager.InvokeAsync(
sessionId,
new WorkerCommand
@@ -216,7 +313,7 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
Command = new MxCommand
{
Kind = MxCommandKind.SubscribeAlarms,
SubscribeAlarms = new SubscribeAlarmsCommand { SubscriptionExpression = subscription },
SubscribeAlarms = command,
},
},
cancellationToken)
@@ -310,6 +407,104 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
}
}
// Reacts to a provider-mode-change event from the worker. In order: stores the
// new provider status and pushes it to every subscriber (the status is global,
// not tied to any one alarm), records the switch on the metrics sink, logs it,
// and finally reconciles the cache so the active-alarm set matches what the new
// provider reports. A reconcile failure other than cancellation is logged and
// swallowed; the existing cache is kept.
private async Task ApplyProviderModeChangeAsync(
    string sessionId,
    OnAlarmProviderModeChangedEvent change,
    CancellationToken cancellationToken)
{
    AlarmProviderMode newMode = change.Mode;
    string switchText = change.Reason ?? string.Empty;

    AlarmProviderStatus pushed;
    int previousModeValue;
    lock (_sync)
    {
        // Capture the outgoing mode for the metric before overwriting it.
        previousModeValue = ModeToInt(_providerMode);

        _providerMode = newMode;
        _providerDegraded = newMode == AlarmProviderMode.Subtag;
        _providerReason = switchText;
        _providerSince = DateTimeOffset.UtcNow;

        pushed = BuildProviderStatus();
        BroadcastToAll(new AlarmFeedMessage { ProviderStatus = pushed });
    }

    // Moving to subtag is a failover, moving to alarmmgr a failback; anything
    // else is recorded as unknown.
    AlarmProviderSwitchReason cause;
    switch (newMode)
    {
        case AlarmProviderMode.Subtag:
            cause = AlarmProviderSwitchReason.Failover;
            break;
        case AlarmProviderMode.Alarmmgr:
            cause = AlarmProviderSwitchReason.Failback;
            break;
        default:
            cause = AlarmProviderSwitchReason.Unknown;
            break;
    }

    _metrics.AlarmProviderSwitched(previousModeValue, ModeToInt(newMode), cause);

    _logger.LogInformation(
        "Alarm provider mode changed to {Mode} (degraded={Degraded}): {Reason}",
        newMode,
        pushed.Degraded,
        switchText);

    try
    {
        // Deliberately awaited after releasing _sync: ReconcileAsync takes _sync
        // itself, so holding the lock across this await would deadlock. The cost
        // is that subscribers receive the ProviderStatus push shortly before the
        // reconcile re-seeds the cache, which is an accepted brief inconsistency.
        await ReconcileAsync(sessionId, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception failure) when (failure is not OperationCanceledException)
    {
        // Cancellation is excluded by the filter and propagates to the caller.
        _logger.LogDebug(
            failure,
            "Reconcile after alarm provider mode change failed; keeping the current cache.");
    }
}
// Snapshots the current provider state (mode, degraded flag, reason, since)
// as an AlarmProviderStatus message. Must be called with _sync held.
private AlarmProviderStatus BuildProviderStatus() => new()
{
    Mode = _providerMode,
    Degraded = _providerDegraded,
    Reason = _providerReason,
    Since = Timestamp.FromDateTimeOffset(_providerSince),
};
// Translates the configured fallback mode string into the forced provider mode
// sent to the worker. Matching is case-insensitive. Only the two force values
// map to a concrete mode; everything else (including the default "Auto" and
// null) yields Unspecified, i.e. "let the worker decide".
private static AlarmProviderMode MapForcedMode(string? mode)
{
    bool Matches(string candidate) =>
        string.Equals(mode, candidate, StringComparison.OrdinalIgnoreCase);

    return Matches("ForceAlarmManager")
        ? AlarmProviderMode.Alarmmgr
        : Matches("ForceSubtag")
            ? AlarmProviderMode.Subtag
            : AlarmProviderMode.Unspecified;
}
// Converts the provider-mode enum to the integer encoding the metric expects:
// alarmmgr = 1, subtag = 2, anything else (unknown/unspecified) = 0.
private static int ModeToInt(AlarmProviderMode mode)
{
    switch (mode)
    {
        case AlarmProviderMode.Alarmmgr:
            return 1;
        case AlarmProviderMode.Subtag:
            return 2;
        default:
            return 0;
    }
}
// Replaces the cache with the worker's authoritative snapshot, broadcasting
// a synthetic transition for any alarm the live stream missed.
private void ApplyReconcile(IEnumerable<ActiveAlarmSnapshot> snapshots)
@@ -374,6 +569,23 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
}
}
// Delivers a feed message to every subscriber, ignoring each subscriber's
// alarm-filter prefix. Used for provider-status messages, which are global
// rather than tied to one alarm reference. Must be called with _sync held.
private void BroadcastToAll(AlarmFeedMessage message)
{
    // Walk backwards so RemoveAt does not shift entries that are still to be visited.
    for (int position = _subscribers.Count; position-- > 0;)
    {
        var writer = _subscribers[position].Channel.Writer;
        if (writer.TryWrite(message))
        {
            continue;
        }

        // The write was rejected: fail the subscriber's channel and drop it.
        writer.TryComplete(new InvalidOperationException(
            "Alarm feed subscriber fell behind and was dropped; reconnect to re-snapshot."));
        _subscribers.RemoveAt(position);
    }
}
private void ClearCache()
{
lock (_sync)
@@ -398,11 +610,14 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
Subscriber subscriber = new(channel, prefix);
ActiveAlarmSnapshot[] snapshot;
AlarmProviderStatus providerStatus;
lock (_sync)
{
// Register before snapshotting under the same lock so no transition
// can slip between the snapshot and the live stream.
// Register before snapshotting under the same lock so neither a
// transition nor a provider-mode change can slip between the snapshot
// and the live stream.
_subscribers.Add(subscriber);
providerStatus = BuildProviderStatus();
snapshot = _alarms.Values
.Where(alarm => prefix.Length == 0
|| alarm.AlarmFullReference.StartsWith(prefix, StringComparison.Ordinal))
@@ -412,6 +627,10 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
try
{
// Emit the current provider status first so a late joiner immediately
// learns the mode (and whether the feed is degraded) before any alarms.
yield return new AlarmFeedMessage { ProviderStatus = providerStatus };
foreach (ActiveAlarmSnapshot alarm in snapshot)
{
yield return new AlarmFeedMessage { ActiveAlarm = alarm };
@@ -624,6 +843,8 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
Description = transition.Description,
OperatorUser = transition.OperatorUser,
OperatorComment = transition.OperatorComment,
Degraded = transition.Degraded,
SourceProvider = transition.SourceProvider,
};
if (transition.OriginalRaiseTimestamp is not null)
{
@@ -660,6 +881,8 @@ public sealed class GatewayAlarmMonitor : BackgroundService, IGatewayAlarmServic
Description = snapshot.Description,
OperatorUser = snapshot.OperatorUser,
OperatorComment = snapshot.OperatorComment,
Degraded = snapshot.Degraded,
SourceProvider = snapshot.SourceProvider,
};
if (snapshot.OriginalRaiseTimestamp is not null)
{
@@ -0,0 +1,30 @@
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Configuration;
namespace ZB.MOM.WW.MxGateway.Server.Alarms;
/// <summary>
/// Resolves the subtag watch-list the gateway sends to the worker when the
/// central alarm monitor operates in subtag-polling fallback mode. Merges
/// Galaxy Repository alarm-attribute discovery with the configured
/// include/exclude overrides and composes the per-attribute subtag item
/// addresses from the configured subtag names.
/// </summary>
public interface IAlarmWatchListResolver
{
/// <summary>
/// Builds the subtag watch-list for the supplied alarm configuration.
/// </summary>
/// <param name="options">Alarm configuration carrying discovery and subtag-name settings.</param>
/// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
/// <returns>
/// The resolved <see cref="AlarmSubtagTarget"/> watch-list, possibly empty.
/// Discovery being unavailable never throws — it yields an empty (or
/// config-only) list and the caller decides what to do with it. Cancellation
/// is the one exception: a triggered <paramref name="cancellationToken"/>
/// still propagates an <see cref="OperationCanceledException"/>.
/// </returns>
/// <exception cref="OperationCanceledException">
/// <paramref name="cancellationToken"/> was triggered while the watch-list was being resolved.
/// </exception>
Task<IReadOnlyList<AlarmSubtagTarget>> ResolveAsync(
AlarmsOptions options,
CancellationToken cancellationToken = default);
}
@@ -0,0 +1,134 @@
namespace ZB.MOM.WW.MxGateway.Server.Configuration;
/// <summary>
/// Controls how the central alarm monitor selects between the MXAccess
/// alarm-manager subscription and the subtag-polling fallback, and
/// governs the failure-detection thresholds used when switching.
/// </summary>
/// <remarks>
/// Every property is init-only with a default, so omitting the section from
/// configuration yields <c>Auto</c> mode with the default thresholds.
/// </remarks>
public sealed class AlarmFallbackOptions
{
/// <summary>
/// Selects the operating mode for the alarm-manager ↔ subtag fallback
/// mechanism. Accepted values (case-insensitive):
/// <list type="bullet">
/// <item><c>Auto</c> — use the alarm manager; switch to subtag polling
/// automatically when <see cref="ConsecutiveFailureThreshold"/> failures
/// are detected, and probe for failback.</item>
/// <item><c>ForceAlarmManager</c> — always use the alarm manager;
/// never fall back.</item>
/// <item><c>ForceSubtag</c> — always use subtag polling;
/// never try the alarm manager.</item>
/// </list>
/// Default is <c>Auto</c>.
/// </summary>
public string Mode { get; init; } = "Auto";
/// <summary>
/// Number of consecutive alarm-manager failures before the monitor
/// switches to subtag-polling fallback. Must be at least 1. Default 3.
/// </summary>
public int ConsecutiveFailureThreshold { get; init; } = 3;
/// <summary>
/// How often (in seconds) the monitor sends a probe to the alarm manager
/// while operating in subtag-polling fallback mode, to detect recovery.
/// Must be at least 1. Default 30.
/// </summary>
public int FailbackProbeIntervalSeconds { get; init; } = 30;
/// <summary>
/// Number of consecutive successful probes required before the monitor
/// considers the alarm manager recovered and switches back. Must be at
/// least 1. Default 3.
/// </summary>
public int FailbackStableProbes { get; init; } = 3;
/// <summary>
/// Controls how the monitor discovers the set of objects to poll when
/// operating in subtag-polling fallback mode. Defaults to a new
/// <see cref="AlarmDiscoveryOptions"/> instance with its own defaults.
/// </summary>
public AlarmDiscoveryOptions Discovery { get; init; } = new();
/// <summary>
/// Configures the subtag names the monitor reads when polling alarm state
/// in subtag-fallback mode. Defaults to a new
/// <see cref="AlarmSubtagNameOptions"/> instance with its own defaults.
/// </summary>
public AlarmSubtagNameOptions Subtags { get; init; } = new();
}
/// <summary>
/// Describes where the subtag-polling watch-list comes from: a Galaxy Repository
/// query, an explicit attribute list, or both. When
/// <c>MxGateway:Alarms:Fallback:Mode</c> is <c>ForceSubtag</c>, at least one
/// source must be available — either <see cref="UseGalaxyRepository"/> is
/// <c>true</c> or <see cref="IncludeAttributes"/> is non-empty.
/// </summary>
public sealed class AlarmDiscoveryOptions
{
    /// <summary>
    /// Whether the monitor queries the Galaxy Repository SQL database to
    /// enumerate alarm objects for the configured area. Default <c>true</c>.
    /// </summary>
    public bool UseGalaxyRepository { get; init; } = true;

    /// <summary>
    /// Galaxy area the Repository query is scoped to. An empty value makes the
    /// monitor fall back to <see cref="AlarmsOptions.DefaultArea"/>. Ignored when
    /// <see cref="UseGalaxyRepository"/> is <c>false</c>. Default empty.
    /// </summary>
    public string Area { get; init; } = string.Empty;

    /// <summary>
    /// Explicit MXAccess attribute paths to poll. They supplement the
    /// Repository-derived list, or replace it entirely when
    /// <see cref="UseGalaxyRepository"/> is <c>false</c>. Default empty.
    /// </summary>
    public string[] IncludeAttributes { get; init; } = [];

    /// <summary>
    /// Attribute paths removed from the merged poll list (case-insensitive).
    /// The exclude is applied after the Repository-derived rows and the explicit
    /// <see cref="IncludeAttributes"/> entries have been combined, so an exclude
    /// that matches an explicit include suppresses it as well — excludes win.
    /// Ignored when <see cref="UseGalaxyRepository"/> is <c>false</c>.
    /// Default empty.
    /// </summary>
    public string[] ExcludeAttributes { get; init; } = [];
}
/// <summary>
/// Configures the subtag names read by the alarm monitor when it is operating
/// in subtag-polling fallback mode. Names are matched against MXAccess item
/// handles; validation against the live MXAccess attribute list occurs at
/// runtime, not at startup.
/// Defaults are the confirmed AVEVA <c>AlarmExtension</c> primitive field names,
/// verified against the live ZB Galaxy <c>attribute_definition</c> rows.
/// </summary>
/// <remarks>
/// Every property is init-only; a value that is not overridden keeps its
/// default field name.
/// </remarks>
public sealed class AlarmSubtagNameOptions
{
/// <summary>
/// Subtag name for the in-alarm boolean. Confirmed AVEVA <c>AlarmExtension</c>
/// field name. Default <c>InAlarm</c>.
/// </summary>
public string Active { get; init; } = "InAlarm";
/// <summary>
/// Subtag name for the acknowledged boolean. Confirmed AVEVA <c>AlarmExtension</c>
/// field name. Default <c>Acked</c>.
/// </summary>
public string Acked { get; init; } = "Acked";
/// <summary>
/// Subtag name for the acknowledgement comment write target. Writing this subtag
/// performs the acknowledge in AVEVA. Confirmed AVEVA <c>AlarmExtension</c>
/// field name. When empty the ack-comment write path is disabled.
/// Default <c>AckMsg</c>.
/// </summary>
public string AckComment { get; init; } = "AckMsg";
/// <summary>
/// Subtag name for the alarm priority / severity. Confirmed AVEVA
/// <c>AlarmExtension</c> field name. Default <c>Priority</c>.
/// </summary>
public string Priority { get; init; } = "Priority";
}
@@ -45,4 +45,12 @@ public sealed class AlarmsOptions
/// the monitor floors it at 5 seconds.
/// </summary>
public int ReconcileIntervalSeconds { get; init; } = 30;
/// <summary>
/// Configuration for the alarm-manager ↔ subtag fallback mechanism:
/// operating mode, failure-detection thresholds, discovery, and subtag
/// names. Defaults (Mode = "Auto") preserve behaviour when the section is
/// omitted from configuration.
/// </summary>
public AlarmFallbackOptions Fallback { get; init; } = new();
}
@@ -231,6 +231,8 @@ public sealed class GatewayOptionsValidator : OptionsValidatorBase<GatewayOption
builder);
}
private static readonly string[] ValidAlarmFallbackModes = ["Auto", "ForceAlarmManager", "ForceSubtag"];
private static void ValidateAlarms(AlarmsOptions options, ValidationBuilder builder)
{
if (!options.Enabled)
@@ -255,6 +257,46 @@ public sealed class GatewayOptionsValidator : OptionsValidatorBase<GatewayOption
builder.Add(
@"MxGateway:Alarms:SubscriptionExpression must start with '\\' (canonical \\<host>\Galaxy!<area> shape).");
}
ValidateAlarmFallback(options.Fallback, builder);
}
// Validates the MxGateway:Alarms:Fallback section: the mode must be a recognised
// value, ForceSubtag must have at least one watch-list source, and every numeric
// threshold must be at least 1. Each violation is appended to the builder.
private static void ValidateAlarmFallback(AlarmFallbackOptions fallback, ValidationBuilder builder)
{
    string configuredMode = fallback.Mode;

    // Case-insensitive comparison against the configured mode.
    bool IsConfiguredMode(string candidate) =>
        string.Equals(candidate, configuredMode, StringComparison.OrdinalIgnoreCase);

    if (!Array.Exists(ValidAlarmFallbackModes, IsConfiguredMode))
    {
        builder.Add(
            $"MxGateway:Alarms:Fallback:Mode must be one of: {string.Join(", ", ValidAlarmFallbackModes)} (was '{configuredMode}').");
    }
    else if (IsConfiguredMode("ForceSubtag")
        && !fallback.Discovery.UseGalaxyRepository
        && fallback.Discovery.IncludeAttributes.Length == 0)
    {
        // Forced subtag polling with neither Galaxy Repository discovery nor
        // explicit includes would leave the watch-list with no source.
        builder.Add(
            "MxGateway:Alarms:Fallback ForceSubtag requires Galaxy Repository discovery or a non-empty Discovery:IncludeAttributes list.");
    }

    // Numeric floors: each threshold must be at least 1.
    AddIfNotPositive(
        fallback.ConsecutiveFailureThreshold,
        "MxGateway:Alarms:Fallback:ConsecutiveFailureThreshold must be greater than zero.",
        builder);
    AddIfNotPositive(
        fallback.FailbackProbeIntervalSeconds,
        "MxGateway:Alarms:Fallback:FailbackProbeIntervalSeconds must be greater than zero.",
        builder);
    AddIfNotPositive(
        fallback.FailbackStableProbes,
        "MxGateway:Alarms:Fallback:FailbackStableProbes must be greater than zero.",
        builder);
}
private const int MinimumCertValidityYears = 1;
@@ -1,7 +1,10 @@
@page "/alarms"
@implements IAsyncDisposable
@using Microsoft.AspNetCore.SignalR.Client
@using ZB.MOM.WW.MxGateway.Server.Dashboard.Hubs
@inject IDashboardLiveDataService LiveData
@inject IOptions<GatewayOptions> GatewayOptions
@inject DashboardHubConnectionFactory HubFactory
<PageTitle>Dashboard Alarms</PageTitle>
@@ -10,6 +13,12 @@
<h1>Alarms</h1>
<div class="text-secondary">@HeaderLine()</div>
</div>
<div class="d-flex align-items-center gap-2">
<span class="badge @_providerStatus.BadgeCssClass"
title="@ProviderStatusTitle()">
@_providerStatus.Label
</span>
</div>
</div>
@if (!GatewayOptions.Value.Alarms.Enabled)
@@ -163,10 +172,44 @@
private readonly CancellationTokenSource _cts = new();
private Task? _pollTask;
private DashboardAlarmProviderStatus _providerStatus = DashboardAlarmProviderStatus.Healthy;
private HubConnection? _alarmsHub;
/// <inheritdoc />
protected override void OnInitialized()
{
// Start the polling loop and keep its task so it can be awaited later.
_pollTask = PollLoopAsync();
// Deliberately not awaited: the hub only feeds the best-effort provider-status
// badge, so attaching it runs in the background.
_ = AttachAlarmsHubAsync();
}
// Tooltip text for the provider badge: the provider's reason while the feed is
// degraded and a non-blank reason is available, otherwise null.
private string? ProviderStatusTitle()
{
    if (!_providerStatus.IsDegraded)
    {
        return null;
    }

    if (string.IsNullOrWhiteSpace(_providerStatus.Reason))
    {
        return null;
    }

    return _providerStatus.Reason;
}
// Connects to the alarms hub and keeps the provider-status badge in sync with
// the provider-status messages it delivers. The badge is best-effort: failures
// never surface to the page, and the badge stays at the healthy default until a
// provider-status message arrives.
//
// A failed initial StartAsync is retried with a capped backoff until the
// component is disposed, because SignalR's automatic reconnect does not cover a
// connection that has never started. Without the retry, a hub that is
// unavailable when the page loads leaves the badge at "healthy" indefinitely.
private async Task AttachAlarmsHubAsync()
{
    // Capture the token once so a later cancel/dispose of _cts cannot make
    // reading _cts.Token throw inside this background task.
    CancellationToken stopToken = _cts.Token;

    HubConnection hub = HubFactory.Create("/hubs/alarms");
    _alarmsHub = hub;

    hub.On<AlarmFeedMessage>(AlarmsHub.AlarmMessage, async message =>
    {
        if (message.PayloadCase != AlarmFeedMessage.PayloadOneofCase.ProviderStatus)
        {
            return;
        }

        DashboardAlarmProviderStatus projected = DashboardAlarmProviderStatus.FromFeed(message);

        // Mutate component state on the renderer's synchronization context,
        // together with the re-render request, rather than on the SignalR
        // callback thread.
        await InvokeAsync(() =>
        {
            _providerStatus = projected;
            StateHasChanged();
        }).ConfigureAwait(false);
    });

    TimeSpan retryDelay = TimeSpan.FromSeconds(2);
    TimeSpan maxRetryDelay = TimeSpan.FromSeconds(30);

    while (!stopToken.IsCancellationRequested)
    {
        try
        {
            await hub.StartAsync(stopToken).ConfigureAwait(false);
            return;
        }
        catch (OperationCanceledException) when (stopToken.IsCancellationRequested)
        {
            // The component is being disposed; stop trying.
            return;
        }
        catch
        {
            // Best-effort: swallow the start failure and retry after a delay.
        }

        try
        {
            await Task.Delay(retryDelay, stopToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            return;
        }

        // Double the delay up to the cap so a permanently unreachable hub is
        // not hammered.
        retryDelay = retryDelay + retryDelay < maxRetryDelay ? retryDelay + retryDelay : maxRetryDelay;
    }
}
private string HeaderLine()
@@ -268,6 +311,19 @@
public async ValueTask DisposeAsync()
{
await _cts.CancelAsync();
if (_alarmsHub is not null)
{
try
{
await _alarmsHub.DisposeAsync();
}
catch
{
// Disposal-time errors are best-effort.
}
}
if (_pollTask is not null)
{
try
@@ -0,0 +1,101 @@
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Alarms;
namespace ZB.MOM.WW.MxGateway.Server.Dashboard;
/// <summary>
/// Dashboard projection of an <see cref="AlarmProviderStatus" /> message carried
/// on the alarm feed. Flattens the provider mode, degraded flag, reason and
/// timestamp into ready-to-render Bootstrap badge fields for the Alarms page.
/// </summary>
public sealed record DashboardAlarmProviderStatus(
    AlarmProviderMode Mode,
    bool IsDegraded,
    string Label,
    string BadgeCssClass,
    string Reason,
    DateTimeOffset? SinceUtc)
{
    /// <summary>Badge label shown when the alarm-manager provider is healthy.</summary>
    public const string AlarmManagerLabel = "Alarm Manager";

    /// <summary>Badge label shown when the feed has fallen back to subtag monitoring.</summary>
    public const string DegradedLabel = "Subtag monitoring (degraded)";

    /// <summary>
    /// Badge label shown when subtag monitoring was deliberately configured
    /// (<c>Fallback:Mode=ForceSubtag</c>) rather than entered through an
    /// unexpected failover — a stable, intended state, not a fault.
    /// </summary>
    public const string ForcedSubtagLabel = "Subtag monitoring (forced)";

    private const string HealthyBadge = "bg-success";
    private const string DegradedBadge = "bg-warning text-dark";

    // Cyan/info badge, visually distinct from the amber failover badge: forced
    // subtag is an intentional configuration, not an alarm-manager fault.
    private const string ForcedSubtagBadge = "bg-info text-dark";

    /// <summary>
    /// Status assumed until the first provider-status message arrives:
    /// healthy alarm-manager mode with no reason and no timestamp.
    /// </summary>
    public static DashboardAlarmProviderStatus Healthy { get; } = new(
        AlarmProviderMode.Alarmmgr,
        false,
        AlarmManagerLabel,
        HealthyBadge,
        string.Empty,
        null);

    /// <summary>Projects an alarm-feed provider-status payload into a dashboard badge model.</summary>
    /// <param name="status">The provider-status payload from an <see cref="AlarmFeedMessage" />.</param>
    /// <returns>The projected dashboard status.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="status" /> is <c>null</c>.</exception>
    public static DashboardAlarmProviderStatus FromProviderStatus(AlarmProviderStatus status)
    {
        ArgumentNullException.ThrowIfNull(status);

        string reasonText = status.Reason ?? string.Empty;
        bool inSubtagMode = status.Mode == AlarmProviderMode.Subtag;

        // The explicit degraded flag and the SUBTAG mode are treated as
        // equivalent, so either one being set on its own still reads as degraded.
        bool isDegraded = status.Degraded || inSubtagMode;

        // A configured ForceSubtag start carries the well-known forced reason. It
        // is a deliberate mode, not a failover, so it gets its own label and badge
        // instead of the "(degraded)" one.
        bool isForced = isDegraded
            && inSubtagMode
            && string.Equals(reasonText, AlarmProviderReasons.ForcedSubtag, StringComparison.Ordinal);

        string labelText;
        string badgeClass;
        if (!isDegraded)
        {
            labelText = AlarmManagerLabel;
            badgeClass = HealthyBadge;
        }
        else if (isForced)
        {
            labelText = ForcedSubtagLabel;
            badgeClass = ForcedSubtagBadge;
        }
        else
        {
            labelText = DegradedLabel;
            badgeClass = DegradedBadge;
        }

        return new DashboardAlarmProviderStatus(
            status.Mode,
            isDegraded,
            labelText,
            badgeClass,
            reasonText,
            status.Since?.ToDateTimeOffset());
    }

    /// <summary>Projects an alarm-feed message into a dashboard badge model.</summary>
    /// <param name="message">An alarm-feed message whose payload is a provider status.</param>
    /// <returns>The projected dashboard status.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="message" /> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">The message does not carry a provider-status payload.</exception>
    public static DashboardAlarmProviderStatus FromFeed(AlarmFeedMessage message)
    {
        ArgumentNullException.ThrowIfNull(message);

        if (message.PayloadCase == AlarmFeedMessage.PayloadOneofCase.ProviderStatus)
        {
            return FromProviderStatus(message.ProviderStatus);
        }

        throw new ArgumentException(
            "Alarm-feed message does not carry a provider-status payload.",
            nameof(message));
    }
}
@@ -195,6 +195,7 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
new("mxgateway.workers.exited", snapshot.WorkerExits),
new("mxgateway.heartbeats.failed", snapshot.HeartbeatFailures),
new("mxgateway.grpc.streams.disconnected", snapshot.StreamDisconnects),
new("mxgateway.alarms.provider_switches", snapshot.AlarmProviderSwitchCount),
];
metrics.AddRange(snapshot.CommandFailuresByMethod
@@ -0,0 +1,48 @@
namespace ZB.MOM.WW.MxGateway.Server.Galaxy;
/// <summary>
/// One alarm-bearing attribute discovered by
/// <see cref="GalaxyRepository.GetAlarmAttributesAsync"/>: an attribute whose owning
/// object configures an <c>AlarmExtension</c> primitive (the same
/// <c>is_alarm</c> detection used by <see cref="GalaxyRepository.GetAttributesAsync"/>).
/// Used to build the subtag-fallback watch-list.
/// </summary>
public sealed class GalaxyAlarmAttributeRow
{
/// <summary>
/// Gets the alarm-bearing attribute reference (e.g. <c>Tank01.Level.HiHi</c>),
/// matching the <c>full_tag_reference</c> projection of
/// <see cref="GalaxyRepository.GetAttributesAsync"/>.
/// </summary>
public string FullTagReference { get; init; } = string.Empty;
/// <summary>
/// Gets the owning object reference (e.g. <c>Tank01</c>). This is the Galaxy
/// <c>tag_name</c> — the segment that precedes the first attribute dot in
/// <see cref="FullTagReference"/>.
/// </summary>
public string SourceObjectReference { get; init; } = string.Empty;
/// <summary>
/// Gets the owning object's Galaxy area (e.g. <c>TestArea</c>) — the alarm group.
/// <para>
/// Resolved via <c>gobject.area_gobject_id</c> in <c>AlarmAttributesSql</c>. The
/// watch-list resolver composes the canonical <c>Galaxy!{area}.{reference}</c> from
/// this so the synthesized reference's group matches the native alarmmgr (wnwrap)
/// for reference parity. May be <see cref="string.Empty"/> when the object has no
/// area; the resolver then falls back to the configured area.
/// </para>
/// </summary>
public string Area { get; init; } = string.Empty;
/// <summary>
/// Gets the writable ack-comment attribute address.
/// <para>
/// The Galaxy Repository schema does not expose an ack-comment subtag address
/// directly, so rows produced by <see cref="GalaxyRepository.MapAlarmRow"/> always
/// carry <see cref="string.Empty"/> here. The watch-list resolver composes the
/// concrete address from configuration plus
/// <see cref="SourceObjectReference"/> / <see cref="FullTagReference"/>.
/// </para>
/// </summary>
public string AckCommentSubtag { get; init; } = string.Empty;
}
@@ -114,6 +114,64 @@ public sealed class GalaxyRepository(GalaxyRepositoryOptions options) : IGalaxyR
return rows;
}
/// <summary>
/// Loads the alarm-bearing attributes used to build the subtag-fallback
/// watch-list. Alarm detection matches <see cref="GetAttributesAsync"/>: a row
/// is alarm-bearing when its owning object configures an <c>AlarmExtension</c>
/// primitive (the same <c>is_alarm</c> projection, applied here as a SQL filter).
/// </summary>
/// <param name="ct">Token to cancel the asynchronous operation.</param>
/// <returns>One <see cref="GalaxyAlarmAttributeRow"/> per row returned by the query, in query order.</returns>
public async Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default)
{
    using SqlConnection connection = new(options.ConnectionString);
    await connection.OpenAsync(ct).ConfigureAwait(false);

    using SqlCommand command = new(AlarmAttributesSql, connection)
    {
        CommandTimeout = options.CommandTimeoutSeconds,
    };
    using SqlDataReader reader = await command.ExecuteReaderAsync(ct).ConfigureAwait(false);

    List<GalaxyAlarmAttributeRow> result = [];
    while (await reader.ReadAsync(ct).ConfigureAwait(false))
    {
        // Column ordinals follow the AlarmAttributesSql projection:
        // 0 = full_tag_reference, 1 = source_object_reference, 2 = area_name.
        string fullTagReference = reader.GetString(0);
        string sourceObjectReference = reader.GetString(1);
        string area = reader.GetString(2);
        result.Add(MapAlarmRow(fullTagReference, sourceObjectReference, area));
    }

    return result;
}
/// <summary>
/// Builds a <see cref="GalaxyAlarmAttributeRow"/> from the three values read
/// off one alarm-attribute reader row.
/// <para>
/// <paramref name="sourceObjectReference"/> is the Galaxy <c>tag_name</c> (the
/// owning object) and <paramref name="fullTagReference"/> is
/// <c>tag_name + '.' + attribute_name</c> — the same composition the
/// <c>full_tag_reference</c> projection of <see cref="AttributesSql"/> produces.
/// </para>
/// <para>
/// <paramref name="area"/> is the owning object's real Galaxy area (its alarm
/// group), resolved via <c>gobject.area_gobject_id</c>; the watch-list resolver
/// composes the canonical reference from it so the synthesized reference's group
/// matches what the native alarmmgr (wnwrap) emits.
/// </para>
/// <para>
/// <see cref="GalaxyAlarmAttributeRow.AckCommentSubtag"/> is left empty: the
/// schema does not expose an ack-comment address and the watch-list resolver
/// composes it later. The method is internal so the mapping can be unit-tested
/// without a database.
/// </para>
/// </summary>
/// <param name="fullTagReference">The alarm-bearing attribute reference.</param>
/// <param name="sourceObjectReference">The owning object reference (tag name).</param>
/// <param name="area">The owning object's Galaxy area (the alarm group).</param>
/// <returns>A row carrying the three supplied values and an empty ack-comment subtag.</returns>
internal static GalaxyAlarmAttributeRow MapAlarmRow(
    string fullTagReference,
    string sourceObjectReference,
    string area)
{
    GalaxyAlarmAttributeRow mapped = new()
    {
        FullTagReference = fullTagReference,
        SourceObjectReference = sourceObjectReference,
        Area = area,
        AckCommentSubtag = string.Empty,
    };
    return mapped;
}
private const string HierarchySql = @"
;WITH template_chain AS (
SELECT g.gobject_id AS instance_gobject_id, t.gobject_id AS template_gobject_id,
@@ -248,5 +306,62 @@ SELECT
FROM ranked r
LEFT JOIN data_type dt ON dt.mx_data_type = r.mx_data_type
WHERE r.rn = 1
ORDER BY r.tag_name, r.attribute_name";
// Alarm-only discovery query for the subtag-fallback watch-list.
//
// Reuses the candidate/ranked CTE shape and the same `AlarmExtension`-based detection as
// AttributesSql, but keeps only the user-attribute (dynamic_attribute) candidate branch:
// an alarm anchor is always a user attribute, so the primitive-instance branch AttributesSql
// carries would be filtered out here anyway. A row qualifies only when its user attribute
// anchors an `AlarmExtension` primitive on the owning object (the EXISTS clause).
//
// The recursive deployed_package_chain CTE walks each deployed, non-template object's
// package derivation chain (capped at depth 10); `rn = 1` keeps, per object and attribute
// name, the candidate found at the shallowest depth.
//
// Projected columns, in the ordinal order GetAlarmAttributesAsync reads them:
//   0. full_tag_reference      - tag_name + '.' + attribute_name, matching AttributesSql. The
//                                array `[]` suffix is intentionally omitted: an alarm-bearing
//                                attribute is a scalar anchor, not an array body.
//   1. source_object_reference - the owning object's tag_name.
//   2. area_name               - the owning object's real Galaxy area (via
//                                gobject.area_gobject_id), so the watch-list resolver composes
//                                a reference whose group matches the native alarmmgr. Empty
//                                string when no area row joins (LEFT JOIN + ISNULL).
private const string AlarmAttributesSql = @"
;WITH deployed_package_chain AS (
SELECT g.gobject_id, p.package_id, p.derived_from_package_id, 0 AS depth
FROM gobject g
INNER JOIN package p ON p.package_id = g.deployed_package_id
WHERE g.is_template = 0 AND g.deployed_package_id <> 0
UNION ALL
SELECT dpc.gobject_id, p.package_id, p.derived_from_package_id, dpc.depth + 1
FROM deployed_package_chain dpc
INNER JOIN package p ON p.package_id = dpc.derived_from_package_id
WHERE dpc.derived_from_package_id <> 0 AND dpc.depth < 10
),
candidate AS (
SELECT
dpc.gobject_id, g.tag_name, da.attribute_name, dpc.depth
FROM deployed_package_chain dpc
INNER JOIN dynamic_attribute da ON da.package_id = dpc.package_id
INNER JOIN gobject g ON g.gobject_id = dpc.gobject_id
INNER JOIN template_definition td ON td.template_definition_id = g.template_definition_id
WHERE td.category_id IN (1, 3, 4, 10, 11, 13, 17, 24, 26)
AND da.attribute_name NOT LIKE '[_]%'
AND da.attribute_name NOT LIKE '%.Description'
AND da.mx_attribute_category IN (2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 24)
),
ranked AS (
SELECT c.*, ROW_NUMBER() OVER (
PARTITION BY c.gobject_id, c.attribute_name ORDER BY c.depth) AS rn
FROM candidate c
)
SELECT
r.tag_name + '.' + r.attribute_name AS full_tag_reference,
r.tag_name AS source_object_reference,
ISNULL(area.tag_name, '') AS area_name
FROM ranked r
INNER JOIN gobject g ON g.gobject_id = r.gobject_id
LEFT JOIN gobject area ON area.gobject_id = g.area_gobject_id
WHERE r.rn = 1
AND EXISTS (
SELECT 1 FROM deployed_package_chain dpc2
INNER JOIN primitive_instance pi ON pi.package_id = dpc2.package_id AND pi.primitive_name = r.attribute_name
INNER JOIN primitive_definition pd ON pd.primitive_definition_id = pi.primitive_definition_id AND pd.primitive_name = 'AlarmExtension'
WHERE dpc2.gobject_id = r.gobject_id
)
ORDER BY r.tag_name, r.attribute_name";
}
@@ -27,4 +27,12 @@ public interface IGalaxyRepository
/// <summary>Retrieves all attributes for Galaxy objects from the repository.</summary>
/// <param name="ct">Token to cancel the asynchronous operation.</param>
Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default);
/// <summary>
/// Retrieves only the alarm-bearing attributes (those whose owning object
/// configures an <c>AlarmExtension</c> primitive) for building the
/// subtag-fallback watch-list.
/// </summary>
/// <param name="ct">Token to cancel the asynchronous operation.</param>
Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default);
}
@@ -0,0 +1,20 @@
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
/// <summary>
/// Bounded classification of an alarm-provider switch, used as the low-cardinality
/// <c>reason</c> tag on the <c>mxgateway.alarms.provider_switches</c> counter. The
/// worker supplies a free-text reason (e.g. <c>"primary PollOnce failed"</c>) that
/// stays in the structured log; only this bounded value reaches the metric tag so the
/// time series cannot fan out on operation-specific text.
/// </summary>
/// <remarks>
/// <c>GatewayMetrics</c> renders the members as the tag values <c>"failover"</c>,
/// <c>"failback"</c> and <c>"unknown"</c>.
/// </remarks>
public enum AlarmProviderSwitchReason
{
/// <summary>The switch direction could not be classified. Rendered as the <c>"unknown"</c> tag value.</summary>
Unknown = 0,
/// <summary>Switched from the primary (alarmmgr) provider to the subtag standby — degraded.</summary>
Failover = 1,
/// <summary>Switched back from the subtag standby to the primary (alarmmgr) provider — recovered.</summary>
Failback = 2,
}
@@ -1,5 +1,6 @@
using System.Collections.Concurrent;
using System.Diagnostics.Metrics;
using System.Globalization;
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
@@ -22,6 +23,7 @@ public sealed class GatewayMetrics : IDisposable
private readonly Counter<long> _heartbeatFailuresCounter;
private readonly Counter<long> _streamDisconnectsCounter;
private readonly Counter<long> _retryAttemptsCounter;
private readonly Counter<long> _alarmProviderSwitchesCounter;
private readonly Histogram<double> _workerStartupLatencyHistogram;
private readonly Histogram<double> _commandLatencyHistogram;
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
@@ -34,6 +36,7 @@ public sealed class GatewayMetrics : IDisposable
private int _workersRunning;
private int _workerEventQueueDepth;
private int _grpcEventStreamQueueDepth;
private int _alarmProviderMode;
private long _sessionsOpened;
private long _sessionsClosed;
private long _commandsStarted;
@@ -47,6 +50,7 @@ public sealed class GatewayMetrics : IDisposable
private long _heartbeatFailures;
private long _streamDisconnects;
private long _retryAttempts;
private long _alarmProviderSwitches;
private bool _disposed;
/// <summary>
@@ -68,6 +72,7 @@ public sealed class GatewayMetrics : IDisposable
_heartbeatFailuresCounter = _meter.CreateCounter<long>("mxgateway.heartbeats.failed");
_streamDisconnectsCounter = _meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
_retryAttemptsCounter = _meter.CreateCounter<long>("mxgateway.retries.attempted");
_alarmProviderSwitchesCounter = _meter.CreateCounter<long>("mxgateway.alarms.provider_switches");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "s");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "s");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "s");
@@ -76,6 +81,7 @@ public sealed class GatewayMetrics : IDisposable
_meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning);
_meter.CreateObservableGauge("mxgateway.events.worker_queue.depth", GetWorkerEventQueueDepth);
_meter.CreateObservableGauge("mxgateway.events.grpc_stream_queue.depth", GetGrpcEventStreamQueueDepth);
_meter.CreateObservableGauge("mxgateway.alarms.provider_mode", GetAlarmProviderMode);
}
/// <summary>
@@ -377,6 +383,41 @@ public sealed class GatewayMetrics : IDisposable
_retryAttemptsCounter.Add(1, new KeyValuePair<string, object?>("area", area));
}
/// <summary>
/// Registers an alarm-provider mode switch: stores <paramref name="toMode"/> as the value read by the
/// provider-mode gauge, bumps the switch total exposed through the metrics snapshot, and emits one
/// counter measurement tagged with the previous mode, the new mode, and the switch reason.
/// </summary>
/// <param name="fromMode">Mode the provider was in before switching (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="toMode">Mode the provider is in after switching (1=alarmmgr, 2=subtag, 0=unknown).</param>
/// <param name="reason">Switch classification; mapped to a fixed set of <c>reason</c> tag values.</param>
public void AlarmProviderSwitched(int fromMode, int toMode, AlarmProviderSwitchReason reason)
{
    // Both fields change under one lock so readers never see a new mode paired with a stale total.
    lock (_syncRoot)
    {
        _alarmProviderSwitches += 1;
        _alarmProviderMode = toMode;
    }

    // Mode numbers are rendered culture-invariantly so tag values do not vary with the host locale.
    KeyValuePair<string, object?> fromTag = new("from", fromMode.ToString(CultureInfo.InvariantCulture));
    KeyValuePair<string, object?> toTag = new("to", toMode.ToString(CultureInfo.InvariantCulture));
    KeyValuePair<string, object?> reasonTag = new("reason", ReasonTag(reason));

    _alarmProviderSwitchesCounter.Add(1, fromTag, toTag, reasonTag);
}
/// <summary>Maps a switch reason to the lowercase string used as the counter's <c>reason</c> tag.</summary>
/// <param name="reason">Switch classification to translate.</param>
/// <returns><c>"failover"</c>, <c>"failback"</c>, or <c>"unknown"</c> for any other value.</returns>
private static string ReasonTag(AlarmProviderSwitchReason reason)
{
    switch (reason)
    {
        case AlarmProviderSwitchReason.Failover:
            return "failover";
        case AlarmProviderSwitchReason.Failback:
            return "failback";
        default:
            return "unknown";
    }
}
/// <summary>
/// Overwrites the value read by the alarm provider-mode gauge without counting a switch
/// (e.g. to publish a startup baseline).
/// </summary>
/// <param name="mode">Provider mode to publish.</param>
public void SetAlarmProviderMode(int mode)
{
    lock (_syncRoot)
    {
        _alarmProviderMode = mode;
    }
}
/// <summary>
/// Returns a snapshot of all current metric values.
/// </summary>
@@ -402,6 +443,7 @@ public sealed class GatewayMetrics : IDisposable
HeartbeatFailures: _heartbeatFailures,
StreamDisconnects: _streamDisconnects,
RetryAttempts: _retryAttempts,
AlarmProviderSwitchCount: _alarmProviderSwitches,
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
@@ -455,6 +497,14 @@ public sealed class GatewayMetrics : IDisposable
}
}
/// <summary>Reads the current alarm provider mode under the metrics lock.</summary>
/// <returns>The most recently stored provider mode value.</returns>
private int GetAlarmProviderMode()
{
    int currentMode;
    lock (_syncRoot)
    {
        currentMode = _alarmProviderMode;
    }

    return currentMode;
}
private static void Increment(Dictionary<string, long> values, string key)
{
values.TryGetValue(key, out long currentValue);
@@ -18,6 +18,7 @@ public sealed record GatewayMetricsSnapshot(
long HeartbeatFailures,
long StreamDisconnects,
long RetryAttempts,
long AlarmProviderSwitchCount,
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
IReadOnlyDictionary<string, long> EventsByFamily,
IReadOnlyDictionary<string, long> EventsBySession,
@@ -11,7 +11,7 @@
<PackageReference Include="ZB.MOM.WW.Auth.ApiKeys" Version="0.1.2" />
<PackageReference Include="ZB.MOM.WW.Auth.AspNetCore" Version="0.1.2" />
<PackageReference Include="ZB.MOM.WW.Audit" Version="0.1.0" />
<PackageReference Include="ZB.MOM.WW.Theme" Version="0.2.1" />
<PackageReference Include="ZB.MOM.WW.Theme" Version="0.3.1" />
<PackageReference Include="ZB.MOM.WW.Configuration" Version="0.1.0" />
<PackageReference Include="ZB.MOM.WW.Health" Version="0.1.0" />
<PackageReference Include="ZB.MOM.WW.Telemetry" Version="0.1.0" />
@@ -0,0 +1,490 @@
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Threading.Channels;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Alarms;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using ZB.MOM.WW.MxGateway.Server.Sessions;
namespace ZB.MOM.WW.MxGateway.Tests.Alarms;
/// <summary>
/// Drives a single <see cref="GatewayAlarmMonitor"/> instance through the full
/// alarm-provider failover/failback lifecycle and asserts the live feed reflects
/// each stage. This complements the per-aspect tests in
/// <c>GatewayAlarmMonitorProviderModeTests</c> with one cohesive scenario:
/// subscribe (watch-list + forced mode) -> baseline alarmmgr status -> alarmmgr
/// transition -> failover to subtag (degraded) -> subtag transition -> failback
/// to alarmmgr (recovered).
/// </summary>
/// <remarks>
/// The minimal session-manager / watch-list-resolver harness here is replicated
/// (not shared) from the sibling <c>GatewayAlarmMonitorProviderModeTests</c>. The
/// sibling's harness is a private nested type, and that test's behaviour must not
/// change; replicating the few members this scenario needs keeps the sibling
/// completely untouched and this file self-contained, at the cost of a small
/// amount of duplication.
/// </remarks>
public sealed class AlarmFailoverEndToEndTests
{
private static readonly TimeSpan WaitTimeout = TimeSpan.FromSeconds(30);
/// <summary>
/// Walks one monitor through subscribe, baseline status, an alarmmgr transition, failover to
/// subtag, a subtag transition, and failback to alarmmgr, asserting after each step that the
/// corresponding message appears on the live feed with the expected mode/degraded values.
/// </summary>
[Fact]
public async Task ProviderFailoverAndFailback_FullLifecycle_ReflectedInFeed()
{
using GatewayMetrics metrics = new();
FakeSessionManager sessions = new();
// Watch-list-bearing, Auto config: a non-empty resolved watch-list with the
// default ("Auto" -> Unspecified) forced mode, so step 1 can assert both the
// resolved watch-list and the forced mode/failover the SubscribeAlarms carries.
StubWatchListResolver resolver = new(
[
new AlarmSubtagTarget { AlarmFullReference = "Galaxy!Area.Tank01.Hi", ActiveSubtag = "Tank01.Hi.active" },
new AlarmSubtagTarget { AlarmFullReference = "Galaxy!Area.Tank02.Lo", ActiveSubtag = "Tank02.Lo.active" },
]);
AlarmsOptions options = new()
{
Enabled = true,
SubscriptionExpression = @"\\NODE\Galaxy!Area",
Fallback = new AlarmFallbackOptions
{
Mode = "Auto",
ConsecutiveFailureThreshold = 3,
FailbackProbeIntervalSeconds = 9,
FailbackStableProbes = 2,
},
};
using GatewayAlarmMonitor monitor = new(
sessions,
resolver,
metrics,
Microsoft.Extensions.Options.Options.Create(new GatewayOptions { Alarms = options }),
NullLogger<GatewayAlarmMonitor>.Instance);
using CancellationTokenSource cts = new();
// Start the monitor and block until the fake session manager has seen its SubscribeAlarms command.
await monitor.StartAsync(cts.Token);
await sessions.WaitForSubscribeAsync(WaitTimeout);
// --- Step 1: SubscribeAlarms carries the resolved watch-list + forced mode/failover. ---
SubscribeAlarmsCommand sent = Assert.IsType<SubscribeAlarmsCommand>(sessions.LastSubscribeCommand);
Assert.Equal(AlarmProviderMode.Unspecified, sent.ForcedMode); // "Auto"
Assert.Equal(3, sent.Failover.ConsecutiveFailureThreshold);
Assert.Equal(9, sent.Failover.FailbackProbeIntervalSeconds);
Assert.Equal(2, sent.Failover.FailbackStableProbes);
Assert.NotEmpty(sent.WatchList);
Assert.Equal(2, sent.WatchList.Count);
Assert.Contains(sent.WatchList, t => t.AlarmFullReference == "Galaxy!Area.Tank01.Hi");
// Live feed reader collecting every message in order.
// The list is shared between the reader task and the test body, so every access
// (here and in WaitForAsync) takes the list's own lock.
List<AlarmFeedMessage> received = [];
TaskCompletionSource baselineReceived = new(TaskCreationOptions.RunContinuationsAsynchronously);
using CancellationTokenSource streamCts = new();
Task reader = Task.Run(async () =>
{
try
{
await foreach (AlarmFeedMessage message in monitor.StreamAsync(null, streamCts.Token))
{
lock (received)
{
received.Add(message);
if (received.Count == 1)
{
// The first message is the baseline ProviderStatus.
baselineReceived.TrySetResult();
}
}
}
}
catch (OperationCanceledException)
{
// Expected when the test cancels the stream.
}
});
// --- Step 2: first message is ProviderStatus{Alarmmgr, Degraded=false}. ---
await baselineReceived.Task.WaitAsync(WaitTimeout);
AlarmFeedMessage baseline;
lock (received)
{
baseline = received[0];
}
Assert.Equal(AlarmFeedMessage.PayloadOneofCase.ProviderStatus, baseline.PayloadCase);
Assert.Equal(AlarmProviderMode.Alarmmgr, baseline.ProviderStatus.Mode);
Assert.False(baseline.ProviderStatus.Degraded);
// --- Step 3: alarmmgr-style transition (Degraded=false, SourceProvider=Alarmmgr, Raise). ---
sessions.EmitEvent(new MxEvent
{
OnAlarmTransition = new OnAlarmTransitionEvent
{
AlarmFullReference = "Galaxy!Area.Tank01.Hi",
SourceObjectReference = "Tank01",
AlarmTypeName = "AnalogLimitAlarm.Hi",
TransitionKind = AlarmTransitionKind.Raise,
Severity = 500,
Degraded = false,
SourceProvider = AlarmProviderMode.Alarmmgr,
TransitionTimestamp = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
},
});
AlarmFeedMessage alarmmgrTransition = await WaitForAsync(
received,
m => m.PayloadCase == AlarmFeedMessage.PayloadOneofCase.Transition
&& m.Transition.AlarmFullReference == "Galaxy!Area.Tank01.Hi"
&& m.Transition.SourceProvider == AlarmProviderMode.Alarmmgr,
WaitTimeout);
Assert.Equal(AlarmTransitionKind.Raise, alarmmgrTransition.Transition.TransitionKind);
Assert.False(alarmmgrTransition.Transition.Degraded);
Assert.Equal(AlarmProviderMode.Alarmmgr, alarmmgrTransition.Transition.SourceProvider);
// --- Step 4: failover to subtag -> ProviderStatus{Subtag, Degraded=true}. ---
sessions.EmitEvent(new MxEvent
{
OnAlarmProviderModeChanged = new OnAlarmProviderModeChangedEvent
{
Mode = AlarmProviderMode.Subtag,
Reason = "alarmmgr failed",
// unchecked cast: the literal 0x80004005 exceeds int.MaxValue, so it must wrap to a negative int.
Hresult = unchecked((int)0x80004005),
At = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
},
});
AlarmFeedMessage degraded = await WaitForAsync(
received,
m => m.PayloadCase == AlarmFeedMessage.PayloadOneofCase.ProviderStatus
&& m.ProviderStatus.Mode == AlarmProviderMode.Subtag,
WaitTimeout);
Assert.Equal(AlarmProviderMode.Subtag, degraded.ProviderStatus.Mode);
Assert.True(degraded.ProviderStatus.Degraded);
Assert.Equal("alarmmgr failed", degraded.ProviderStatus.Reason);
// --- Step 5: subtag-style transition (Degraded=true, SourceProvider=Subtag, Raise on a different ref). ---
sessions.EmitEvent(new MxEvent
{
OnAlarmTransition = new OnAlarmTransitionEvent
{
AlarmFullReference = "Galaxy!Area.Tank02.Lo",
SourceObjectReference = "Tank02",
AlarmTypeName = "AnalogLimitAlarm.Lo",
TransitionKind = AlarmTransitionKind.Raise,
Severity = 250,
Degraded = true,
SourceProvider = AlarmProviderMode.Subtag,
TransitionTimestamp = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
},
});
AlarmFeedMessage subtagTransition = await WaitForAsync(
received,
m => m.PayloadCase == AlarmFeedMessage.PayloadOneofCase.Transition
&& m.Transition.AlarmFullReference == "Galaxy!Area.Tank02.Lo"
&& m.Transition.SourceProvider == AlarmProviderMode.Subtag,
WaitTimeout);
Assert.Equal(AlarmTransitionKind.Raise, subtagTransition.Transition.TransitionKind);
Assert.True(subtagTransition.Transition.Degraded);
Assert.Equal(AlarmProviderMode.Subtag, subtagTransition.Transition.SourceProvider);
// --- Step 6: failback to alarmmgr -> ProviderStatus{Alarmmgr, Degraded=false} (recovery). ---
sessions.EmitEvent(new MxEvent
{
OnAlarmProviderModeChanged = new OnAlarmProviderModeChangedEvent
{
Mode = AlarmProviderMode.Alarmmgr,
Reason = "recovered",
Hresult = 0,
At = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
},
});
// Match the recovery status specifically: an Alarmmgr ProviderStatus that
// carries the "recovered" reason, distinguishing it from the baseline at [0].
AlarmFeedMessage recovered = await WaitForAsync(
received,
m => m.PayloadCase == AlarmFeedMessage.PayloadOneofCase.ProviderStatus
&& m.ProviderStatus.Mode == AlarmProviderMode.Alarmmgr
&& m.ProviderStatus.Reason == "recovered",
WaitTimeout);
Assert.Equal(AlarmProviderMode.Alarmmgr, recovered.ProviderStatus.Mode);
Assert.False(recovered.ProviderStatus.Degraded);
Assert.Equal("recovered", recovered.ProviderStatus.Reason);
// Teardown: cancel and await the feed reader first, then cancel and stop the monitor.
await streamCts.CancelAsync();
await reader;
await cts.CancelAsync();
await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Emits a degraded (subtag) Raise transition while one subscriber is attached, then opens a
/// second stream and asserts that the ActiveAlarm it receives before SnapshotComplete still
/// carries <c>Degraded = true</c> and <c>SourceProvider = Subtag</c>.
/// </summary>
[Fact]
public async Task DegradedTransition_CachedThenReplayed_CarriesDegradedAndSourceProviderToNewSubscriber()
{
using GatewayMetrics metrics = new();
FakeSessionManager sessions = new();
// Empty watch-list: this scenario only exercises the transition cache, not watch-list resolution.
StubWatchListResolver resolver = new([]);
AlarmsOptions options = new()
{
Enabled = true,
SubscriptionExpression = @"\\NODE\Galaxy!Area",
Fallback = new AlarmFallbackOptions
{
Mode = "Auto",
ConsecutiveFailureThreshold = 3,
FailbackProbeIntervalSeconds = 9,
FailbackStableProbes = 2,
},
};
using GatewayAlarmMonitor monitor = new(
sessions,
resolver,
metrics,
Microsoft.Extensions.Options.Options.Create(new GatewayOptions { Alarms = options }),
NullLogger<GatewayAlarmMonitor>.Instance);
using CancellationTokenSource cts = new();
await monitor.StartAsync(cts.Token);
await sessions.WaitForSubscribeAsync(WaitTimeout);
// First subscriber: drive the feed past the baseline ProviderStatus so we
// know the monitor's event loop is live before we emit the transition.
List<AlarmFeedMessage> firstReader = [];
TaskCompletionSource baselineReceived = new(TaskCreationOptions.RunContinuationsAsynchronously);
using CancellationTokenSource firstStreamCts = new();
Task reader = Task.Run(async () =>
{
try
{
await foreach (AlarmFeedMessage message in monitor.StreamAsync(null, firstStreamCts.Token))
{
// firstReader is also read by WaitForAsync on the test thread, hence the lock.
lock (firstReader)
{
firstReader.Add(message);
if (message.PayloadCase == AlarmFeedMessage.PayloadOneofCase.ProviderStatus)
{
baselineReceived.TrySetResult();
}
}
}
}
catch (OperationCanceledException)
{
// Expected when the test cancels the stream.
}
});
await baselineReceived.Task.WaitAsync(WaitTimeout);
// Apply a degraded (subtag) transition. This lands in the monitor's cache
// via SnapshotFromTransition, which must preserve Degraded/SourceProvider.
sessions.EmitEvent(new MxEvent
{
OnAlarmTransition = new OnAlarmTransitionEvent
{
AlarmFullReference = "Galaxy!Area.Tank02.Lo",
SourceObjectReference = "Tank02",
AlarmTypeName = "AnalogLimitAlarm.Lo",
TransitionKind = AlarmTransitionKind.Raise,
Severity = 250,
Degraded = true,
SourceProvider = AlarmProviderMode.Subtag,
TransitionTimestamp = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
},
});
// Wait for the live transition to be observed by the first subscriber so we
// know the cache has been updated before opening the new stream.
await WaitForAsync(
firstReader,
m => m.PayloadCase == AlarmFeedMessage.PayloadOneofCase.Transition
&& m.Transition.AlarmFullReference == "Galaxy!Area.Tank02.Lo",
WaitTimeout);
// New subscriber: its initial cache snapshot must carry the degraded flags.
// Bound the drain so a regression that never emits SnapshotComplete fails
// with a clean TimeoutException (via cancellation) instead of hanging.
using CancellationTokenSource newStreamCts = new();
using CancellationTokenSource drainTimeoutCts = new();
drainTimeoutCts.CancelAfter(WaitTimeout);
using CancellationTokenSource linkedDrainCts =
CancellationTokenSource.CreateLinkedTokenSource(newStreamCts.Token, drainTimeoutCts.Token);
ActiveAlarmSnapshot? initialActiveAlarm = null;
try
{
await foreach (AlarmFeedMessage message in monitor.StreamAsync(null, linkedDrainCts.Token))
{
if (message.PayloadCase == AlarmFeedMessage.PayloadOneofCase.ActiveAlarm
&& message.ActiveAlarm.AlarmFullReference == "Galaxy!Area.Tank02.Lo")
{
initialActiveAlarm = message.ActiveAlarm;
}
// Stop reading once the stream signals SnapshotComplete.
if (message.PayloadCase == AlarmFeedMessage.PayloadOneofCase.SnapshotComplete)
{
break;
}
}
}
// The filter only converts cancellations caused by the drain timeout; any other cancellation propagates as-is.
catch (OperationCanceledException) when (drainTimeoutCts.IsCancellationRequested)
{
throw new TimeoutException(
"The new subscriber did not receive a SnapshotComplete message within the timeout.");
}
Assert.NotNull(initialActiveAlarm);
Assert.True(initialActiveAlarm!.Degraded);
Assert.Equal(AlarmProviderMode.Subtag, initialActiveAlarm.SourceProvider);
// Teardown: cancel both streams, await the first reader, then cancel and stop the monitor.
await newStreamCts.CancelAsync();
await firstStreamCts.CancelAsync();
await reader;
await cts.CancelAsync();
await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Polls <paramref name="received"/> every 25 ms until an entry satisfies
/// <paramref name="predicate"/> or <paramref name="timeout"/> has elapsed.
/// </summary>
/// <param name="received">
/// Message list shared with a reader task; it is inspected only while holding the list's own lock,
/// matching the lock the reader takes when appending.
/// </param>
/// <param name="predicate">Condition identifying the awaited message.</param>
/// <param name="timeout">Maximum time to keep polling.</param>
/// <returns>The first entry in list order that satisfies <paramref name="predicate"/>.</returns>
/// <exception cref="TimeoutException">No matching message was found before the timeout elapsed.</exception>
private static async Task<AlarmFeedMessage> WaitForAsync(
    List<AlarmFeedMessage> received,
    Func<AlarmFeedMessage, bool> predicate,
    TimeSpan timeout)
{
    // Stopwatch is monotonic, so a wall-clock adjustment during the test run cannot
    // shorten or stretch the wait the way a DateTime.UtcNow deadline could.
    System.Diagnostics.Stopwatch elapsed = System.Diagnostics.Stopwatch.StartNew();

    while (true)
    {
        lock (received)
        {
            AlarmFeedMessage? match = received.FirstOrDefault(predicate);
            if (match is not null)
            {
                return match;
            }
        }

        // The timeout is evaluated after the check, so the list is always inspected at least once
        // and once more after the final delay; a message that arrives during the last sleep is
        // therefore still found instead of being reported as a timeout.
        if (elapsed.Elapsed >= timeout)
        {
            break;
        }

        await Task.Delay(25);
    }

    throw new TimeoutException("No matching AlarmFeedMessage was received in time.");
}
/// <summary>
/// Test double for <see cref="IAlarmWatchListResolver"/>: every call yields the list supplied at construction.
/// </summary>
private sealed class StubWatchListResolver : IAlarmWatchListResolver
{
    private readonly IReadOnlyList<AlarmSubtagTarget> _targets;

    /// <summary>Creates the stub with the watch-list it should always return.</summary>
    public StubWatchListResolver(IReadOnlyList<AlarmSubtagTarget> targets)
    {
        _targets = targets;
    }

    /// <inheritdoc />
    public Task<IReadOnlyList<AlarmSubtagTarget>> ResolveAsync(
        AlarmsOptions options,
        CancellationToken cancellationToken = default)
    {
        // Both arguments are ignored; the fixed list is returned as an already-completed task.
        return Task.FromResult(_targets);
    }
}
/// <summary>
/// Bare-bones <see cref="ISessionManager"/> test double for driving the monitor. It hands out a
/// freshly constructed session, remembers the last SubscribeAlarms command, answers every command
/// with an OK status, and lets the test inject worker events through an in-memory channel.
/// </summary>
private sealed class FakeSessionManager : ISessionManager
{
    private readonly TaskCompletionSource _subscribeSeen =
        new(TaskCreationOptions.RunContinuationsAsynchronously);

    private readonly Channel<WorkerEvent> _workerEvents = Channel.CreateUnbounded<WorkerEvent>();

    /// <summary>Gets the latest SubscribeAlarms payload passed to <see cref="InvokeAsync"/>, if any.</summary>
    public SubscribeAlarmsCommand? LastSubscribeCommand { get; private set; }

    /// <summary>Wraps <paramref name="mxEvent"/> in a worker event and queues it for <see cref="ReadEventsAsync"/>.</summary>
    public void EmitEvent(MxEvent mxEvent)
    {
        WorkerEvent envelope = new() { Event = mxEvent };
        _workerEvents.Writer.TryWrite(envelope);
    }

    /// <summary>Waits, up to <paramref name="timeout"/>, for the first SubscribeAlarms command to arrive.</summary>
    public Task WaitForSubscribeAsync(TimeSpan timeout)
    {
        return _subscribeSeen.Task.WaitAsync(timeout);
    }

    /// <inheritdoc />
    public Task<GatewaySession> OpenSessionAsync(
        SessionOpenRequest request,
        string? clientIdentity,
        CancellationToken cancellationToken)
    {
        // A fresh session with a random id and fixed placeholder values; the request itself is not consulted.
        string sessionId = Guid.NewGuid().ToString("N");
        GatewaySession opened = new(
            sessionId,
            "Galaxy",
            "pipe-test",
            "nonce-test",
            clientIdentity,
            null,
            null,
            TimeSpan.FromSeconds(30),
            TimeSpan.FromSeconds(30),
            TimeSpan.FromSeconds(30),
            DateTimeOffset.UtcNow);

        return Task.FromResult(opened);
    }

    /// <inheritdoc />
    public Task<WorkerCommandReply> InvokeAsync(
        string sessionId,
        WorkerCommand command,
        CancellationToken cancellationToken)
    {
        // Capture the subscribe payload and release anyone blocked in WaitForSubscribeAsync.
        if (command.Command?.Kind == MxCommandKind.SubscribeAlarms)
        {
            LastSubscribeCommand = command.Command.SubscribeAlarms;
            _subscribeSeen.TrySetResult();
        }

        MxCommandReply okReply = new();
        okReply.ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok };

        // Active-alarm queries additionally get an empty payload attached to the OK reply.
        if (command.Command?.Kind == MxCommandKind.QueryActiveAlarms)
        {
            okReply.QueryActiveAlarms = new QueryActiveAlarmsReplyPayload();
        }

        WorkerCommandReply envelope = new() { Reply = okReply };
        return Task.FromResult(envelope);
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
        string sessionId,
        [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        // Relays queued events until the channel is completed or the token is cancelled.
        await foreach (WorkerEvent queued in _workerEvents.Reader.ReadAllAsync(cancellationToken))
        {
            yield return queued;
        }
    }

    /// <inheritdoc />
    public bool TryGetSession(string sessionId, [MaybeNullWhen(false)] out GatewaySession session)
    {
        // This fake does not track sessions, so lookups always miss.
        session = null;
        return false;
    }

    /// <inheritdoc />
    public Task<SessionCloseResult> CloseSessionAsync(string sessionId, CancellationToken cancellationToken)
    {
        // Completing the writer ends any in-flight ReadEventsAsync enumeration.
        _workerEvents.Writer.TryComplete();
        SessionCloseResult closed = new(sessionId, SessionState.Closed, AlreadyClosed: false);
        return Task.FromResult(closed);
    }

    /// <inheritdoc />
    public Task<SessionCloseResult> KillWorkerAsync(string sessionId, string reason, CancellationToken cancellationToken)
    {
        SessionCloseResult closed = new(sessionId, SessionState.Closed, AlreadyClosed: false);
        return Task.FromResult(closed);
    }

    /// <inheritdoc />
    public Task<int> CloseExpiredLeasesAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        return Task.FromResult(0);
    }

    /// <inheritdoc />
    public Task ShutdownAsync(CancellationToken cancellationToken)
    {
        return Task.CompletedTask;
    }
}
}
@@ -0,0 +1,441 @@
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Alarms;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Galaxy;
namespace ZB.MOM.WW.MxGateway.Tests.Alarms;
/// <summary>
/// Unit tests for <see cref="AlarmWatchListResolver"/>: discovery/config merge,
/// subtag-address composition, canonical reference shaping, and the
/// unavailable-discovery code path.
/// </summary>
public sealed class AlarmWatchListResolverTests
{
/// <summary>Builds the resolver under test over <paramref name="repository"/> with a no-op logger.</summary>
private static AlarmWatchListResolver CreateResolver(IGalaxyRepository repository)
{
    return new AlarmWatchListResolver(repository, NullLogger<AlarmWatchListResolver>.Instance);
}
/// <summary>
/// Builds an <see cref="AlarmsOptions"/> for a test, filling only the discovery, subtag-name,
/// and default-area settings.
/// </summary>
/// <param name="useGalaxyRepository">Value for <c>Fallback.Discovery.UseGalaxyRepository</c>.</param>
/// <param name="area">Value for <c>Fallback.Discovery.Area</c>.</param>
/// <param name="defaultArea">Value for <c>DefaultArea</c>.</param>
/// <param name="include">Explicit include attributes; <see langword="null"/> means none.</param>
/// <param name="exclude">Exclude attributes; <see langword="null"/> means none.</param>
/// <param name="subtags">Subtag names; <see langword="null"/> uses a default-constructed instance.</param>
private static AlarmsOptions Options(
    bool useGalaxyRepository = true,
    string area = "",
    string defaultArea = "",
    string[]? include = null,
    string[]? exclude = null,
    AlarmSubtagNameOptions? subtags = null)
{
    AlarmDiscoveryOptions discovery = new()
    {
        UseGalaxyRepository = useGalaxyRepository,
        Area = area,
        IncludeAttributes = include ?? [],
        ExcludeAttributes = exclude ?? [],
    };

    AlarmFallbackOptions fallback = new()
    {
        Discovery = discovery,
        Subtags = subtags ?? new AlarmSubtagNameOptions(),
    };

    return new AlarmsOptions
    {
        DefaultArea = defaultArea,
        Fallback = fallback,
    };
}
/// <summary>
/// Repository rows and config includes are merged into one ordered list, the excluded entry is
/// dropped, and a case-insensitive duplicate keeps the spelling of its first (repository) occurrence.
/// </summary>
[Fact]
public async Task ResolveAsync_UnionsGalaxyRowsAndIncludes_RemovesExcludes_AndDeduplicates()
{
    List<GalaxyAlarmAttributeRow> discoveredRows =
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "TestArea" },
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank02.Level.HiHi", SourceObjectReference = "Tank02", Area = "TestArea" },
        // Same attribute as the "pump01.fault" include below, differing only in case.
        new GalaxyAlarmAttributeRow { FullTagReference = "Pump01.Fault", SourceObjectReference = "Pump01", Area = "TestArea" },
    ];
    StubGalaxyRepository galaxy = new(discoveredRows);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(
        include: ["pump01.fault", "Valve03.Position.Lo"],
        exclude: ["Tank02.Level.HiHi"]);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    // Strip the active-subtag name to recover the attribute reference each target was built from.
    IEnumerable<string> attributeReferences = resolved.Select(
        t => t.ActiveSubtag.Replace(".InAlarm", string.Empty, StringComparison.Ordinal));
    Assert.Equal(
        new[] { "Tank01.Level.HiHi", "Pump01.Fault", "Valve03.Position.Lo" },
        attributeReferences);

    // Three entries remain: Tank02 was excluded and the Pump01 duplicate collapsed into one.
    Assert.Equal(3, resolved.Count);
}
/// <summary>
/// Each subtag address is the attribute reference followed by a dot and the configured subtag name.
/// </summary>
[Fact]
public async Task ResolveAsync_ComposesSubtagAddressesFromConfigNames()
{
    AlarmSubtagNameOptions subtagNames = new()
    {
        Active = "InAlarm",
        Acked = "Ack",
        Priority = "Pri",
        AckComment = "AckCmt",
    };
    StubGalaxyRepository galaxy = new(
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "TestArea" },
    ]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(Options(subtags: subtagNames));

    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Tank01.Level.HiHi.InAlarm", only.ActiveSubtag);
    Assert.Equal("Tank01.Level.HiHi.Ack", only.AckedSubtag);
    Assert.Equal("Tank01.Level.HiHi.Pri", only.PrioritySubtag);
    Assert.Equal("Tank01.Level.HiHi.AckCmt", only.AckCommentSubtag);
}
/// <summary>
/// Empty Priority and AckComment subtag names produce empty addresses for those two fields,
/// while the Active and Acked addresses are still composed from their default names.
/// </summary>
[Fact]
public async Task ResolveAsync_EmptyPriorityAndAckComment_LeaveThoseFieldsEmpty()
{
    // Priority defaults to "Priority", so it has to be blanked explicitly alongside AckComment.
    AlarmSubtagNameOptions subtagNames = new() { Priority = string.Empty, AckComment = string.Empty };
    StubGalaxyRepository galaxy = new(
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "TestArea" },
    ]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(Options(subtags: subtagNames));

    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Tank01.Level.HiHi.InAlarm", only.ActiveSubtag);
    Assert.Equal("Tank01.Level.HiHi.Acked", only.AckedSubtag);
    Assert.Equal(string.Empty, only.PrioritySubtag);
    Assert.Equal(string.Empty, only.AckCommentSubtag);
}
/// <summary>
/// For a repository row that carries its own area, the canonical reference is built from that
/// area; the configured Discovery.Area and DefaultArea do not replace it.
/// </summary>
[Fact]
public async Task ResolveAsync_ComposesCanonicalFullReference_FromRealGalaxyArea_NotConfigArea()
{
    // The row reports the object's real Galaxy area (the alarmmgr group), "TestArea".
    GalaxyAlarmAttributeRow discoveredRow = new()
    {
        FullTagReference = "TestMachine_001.TestAlarm001",
        SourceObjectReference = "TestMachine_001",
        Area = "TestArea",
    };
    StubGalaxyRepository galaxy = new([discoveredRow]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);

    // Both config areas say "DEV"; neither may end up in the composed reference.
    AlarmsOptions options = Options(area: "DEV", defaultArea: "DEV");
    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Galaxy!TestArea.TestMachine_001.TestAlarm001", only.AlarmFullReference);
}
/// <summary>
/// With no discovered area and no configured area, the canonical reference is the bare
/// <c>Galaxy!{reference}</c> form.
/// </summary>
[Fact]
public async Task ResolveAsync_ComposesCanonicalFullReference_WithoutArea()
{
    StubGalaxyRepository galaxy = new(
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "" },
    ]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);

    // Default options leave both Discovery.Area and DefaultArea empty.
    AlarmsOptions options = Options();
    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Galaxy!Tank01.Level.HiHi", only.AlarmFullReference);
}
/// <summary>
/// An IncludeAttributes entry has no discovered area, so its canonical reference takes the
/// configured Discovery.Area when one is set.
/// </summary>
[Fact]
public async Task ResolveAsync_ConfigInclude_UsesDiscoveryAreaFallback()
{
    StubGalaxyRepository galaxy = new([]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(
        area: "Site_A",
        include: ["Tank01.Level.HiHi"]);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Galaxy!Site_A.Tank01.Level.HiHi", only.AlarmFullReference);
}
/// <summary>
/// When Discovery.Area is empty, an IncludeAttributes entry takes its area from DefaultArea.
/// </summary>
[Fact]
public async Task ResolveAsync_ConfigInclude_FallsBackToDefaultArea_WhenDiscoveryAreaEmpty()
{
    StubGalaxyRepository galaxy = new([]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(
        area: "",
        defaultArea: "Plant",
        include: ["Tank01.Level.HiHi"]);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Galaxy!Plant.Tank01.Level.HiHi", only.AlarmFullReference);
}
/// <summary>
/// With UseGalaxyRepository off, the repository's alarm query is never invoked and only the
/// configured include ends up in the watch-list.
/// </summary>
[Fact]
public async Task ResolveAsync_UseGalaxyRepositoryFalse_DoesNotCallRepository_UsesIncludesOnly()
{
    // The stub holds a row that would leak into the result if discovery ran anyway.
    StubGalaxyRepository galaxy = new(
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "ShouldNotAppear.X", SourceObjectReference = "ShouldNotAppear" },
    ]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(
        useGalaxyRepository: false,
        include: ["Tank01.Level.HiHi"]);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    Assert.Equal(0, galaxy.GetAlarmAttributesCount);
    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Galaxy!Tank01.Level.HiHi", only.AlarmFullReference);
}
/// <summary>
/// A repository failure does not escape <c>ResolveAsync</c>: the call completes and the result
/// holds just the configured include.
/// </summary>
[Fact]
public async Task ResolveAsync_RepositoryThrows_LogsAndReturnsConfigOnlySet()
{
    InvalidOperationException failure = new("SQL unavailable");
    ThrowingGalaxyRepository galaxy = new(failure);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(include: ["Tank01.Level.HiHi"]);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    // Reaching this point means nothing was thrown; discovery contributed no rows.
    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Galaxy!Tank01.Level.HiHi", only.AlarmFullReference);
}
/// <summary>
/// For config-only includes the source object is derived from the reference itself: the text
/// before the first dot, or the whole reference when it contains no dot.
/// </summary>
[Fact]
public async Task ResolveAsync_DerivesSourceObjectForConfigEntry()
{
    StubGalaxyRepository repo = new([]);
    AlarmWatchListResolver resolver = CreateResolver(repo);

    IReadOnlyList<AlarmSubtagTarget> result = await resolver.ResolveAsync(Options(
        include: ["Tank01.Level.HiHi", "StandaloneTag"]));

    // Check the count before indexing so a missing entry fails as an assertion,
    // not as an out-of-range exception from the indexer.
    Assert.Equal(2, result.Count);
    Assert.Equal("Tank01", result[0].SourceObjectReference);
    // No dot — whole string is the source object.
    Assert.Equal("StandaloneTag", result[1].SourceObjectReference);
}
/// <summary>
/// Fix 1: with UseGalaxyRepository off, ExcludeAttributes has no effect. An include that is also
/// listed as an exclude must still appear in the watch-list.
/// </summary>
[Fact]
public async Task ResolveAsync_ExcludeIgnored_WhenGalaxyRepositoryDisabled()
{
    // Discovery is disabled, so the (empty) repository plays no part in this scenario.
    StubGalaxyRepository galaxy = new([]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(
        useGalaxyRepository: false,
        include: ["Tank01.Level.HiHi"],
        exclude: ["Tank01.Level.HiHi"]);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    // The include survives even though the identical path is listed as an exclude.
    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Equal("Galaxy!Tank01.Level.HiHi", only.AlarmFullReference);
}
/// <summary>
/// Fix 1 (GR-on path): with UseGalaxyRepository on, ExcludeAttributes still removes matching
/// repository rows.
/// </summary>
[Fact]
public async Task ResolveAsync_ExcludeApplied_WhenGalaxyRepositoryEnabled()
{
    StubGalaxyRepository galaxy = new(
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "TestArea" },
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank02.Level.HiHi", SourceObjectReference = "Tank02", Area = "TestArea" },
    ]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(
        useGalaxyRepository: true,
        exclude: ["Tank02.Level.HiHi"]);

    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    // Only the Tank01 row is left once Tank02 has been pruned.
    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Contains("Tank01", only.ActiveSubtag, StringComparison.Ordinal);
}
/// <summary>
/// Fix 1: an ExcludeAttributes entry consisting only of whitespace is skipped and does not
/// remove any real reference from the watch-list.
/// </summary>
[Fact]
public async Task ResolveAsync_WhitespaceOnlyExcludeEntry_IsSkipped()
{
    StubGalaxyRepository galaxy = new(
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "TestArea" },
    ]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);

    // The single exclude entry is blank, so it should act as a no-op.
    AlarmsOptions options = Options(
        useGalaxyRepository: true,
        exclude: [" "]);
    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    // The Tank01 row is still present.
    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Contains("Tank01", only.ActiveSubtag, StringComparison.Ordinal);
}
/// <summary>
/// Server-051: when cancellation fires while Galaxy Repository discovery is being awaited, the
/// <see cref="OperationCanceledException"/> must reach the caller, as the
/// <see cref="IAlarmWatchListResolver"/> contract requires, rather than being absorbed into a
/// config-only watch-list.
/// </summary>
[Fact]
public async Task ResolveAsync_RepositoryCancelled_PropagatesOperationCanceled()
{
    using CancellationTokenSource cancellation = new();

    // The repository cancels this source and then throws the matching
    // OperationCanceledException, which is what the live SQL path does on shutdown.
    CancellingGalaxyRepository galaxy = new(cancellation);
    AlarmWatchListResolver sut = CreateResolver(galaxy);
    AlarmsOptions options = Options(include: ["Tank01.Level.HiHi"]);

    Task Resolve() => sut.ResolveAsync(options, cancellation.Token);

    await Assert.ThrowsAnyAsync<OperationCanceledException>(Resolve);
}
/// <summary>
/// Server-052 item 2 / Server-053: when the same entry is listed in both
/// <c>IncludeAttributes</c> and <c>ExcludeAttributes</c>, the exclude wins and the entry is
/// removed (the documented "excludes also suppress matching explicit includes" behaviour).
/// </summary>
[Fact]
public async Task ResolveAsync_ExcludeAlsoSuppressesMatchingExplicitInclude()
{
    StubGalaxyRepository galaxy = new(
    [
        new GalaxyAlarmAttributeRow { FullTagReference = "Tank01.Level.HiHi", SourceObjectReference = "Tank01", Area = "TestArea" },
    ]);
    AlarmWatchListResolver sut = CreateResolver(galaxy);

    // Pump01.Fault is included explicitly and excluded under a different casing;
    // it must disappear while the repository row for Tank01 stays.
    AlarmsOptions options = Options(
        useGalaxyRepository: true,
        include: ["Pump01.Fault"],
        exclude: ["pump01.fault"]);
    IReadOnlyList<AlarmSubtagTarget> resolved = await sut.ResolveAsync(options);

    AlarmSubtagTarget only = Assert.Single(resolved);
    Assert.Contains("Tank01", only.ActiveSubtag, StringComparison.Ordinal);
    Assert.DoesNotContain(resolved, t => t.ActiveSubtag.Contains("Pump01", StringComparison.OrdinalIgnoreCase));
}
/// <summary>
/// <see cref="IGalaxyRepository"/> test double that serves a fixed list of alarm
/// rows from memory and counts how often that list is requested.
/// </summary>
private sealed class StubGalaxyRepository(List<GalaxyAlarmAttributeRow> rows) : IGalaxyRepository
{
    /// <summary>Gets how many times <see cref="GetAlarmAttributesAsync"/> has been invoked.</summary>
    public int GetAlarmAttributesCount { get; private set; }

    /// <inheritdoc />
    public Task<bool> TestConnectionAsync(CancellationToken ct = default)
    {
        return Task.FromResult(true);
    }

    /// <inheritdoc />
    public Task<DateTime?> GetLastDeployTimeAsync(CancellationToken ct = default)
    {
        DateTime? noDeployTime = null;
        return Task.FromResult(noDeployTime);
    }

    /// <inheritdoc />
    public Task<List<GalaxyHierarchyRow>> GetHierarchyAsync(CancellationToken ct = default)
    {
        return Task.FromResult<List<GalaxyHierarchyRow>>([]);
    }

    /// <inheritdoc />
    public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
    {
        return Task.FromResult<List<GalaxyAttributeRow>>([]);
    }

    /// <inheritdoc />
    public Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default)
    {
        // Count the call first, then hand back the same list instance supplied at construction.
        GetAlarmAttributesCount += 1;
        return Task.FromResult(rows);
    }
}
/// <summary>
/// <see cref="IGalaxyRepository"/> test double whose alarm-attribute query returns a
/// task faulted with the exception supplied at construction; every other member
/// succeeds with an empty or default result.
/// </summary>
private sealed class ThrowingGalaxyRepository(Exception toThrow) : IGalaxyRepository
{
    /// <inheritdoc />
    public Task<bool> TestConnectionAsync(CancellationToken ct = default)
    {
        return Task.FromResult(true);
    }

    /// <inheritdoc />
    public Task<DateTime?> GetLastDeployTimeAsync(CancellationToken ct = default)
    {
        DateTime? noDeployTime = null;
        return Task.FromResult(noDeployTime);
    }

    /// <inheritdoc />
    public Task<List<GalaxyHierarchyRow>> GetHierarchyAsync(CancellationToken ct = default)
    {
        return Task.FromResult<List<GalaxyHierarchyRow>>([]);
    }

    /// <inheritdoc />
    public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
    {
        return Task.FromResult<List<GalaxyAttributeRow>>([]);
    }

    /// <inheritdoc />
    public Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default)
    {
        // The failure is delivered through the returned task, not thrown synchronously.
        return Task.FromException<List<GalaxyAlarmAttributeRow>>(toThrow);
    }
}
/// <summary>
/// <see cref="IGalaxyRepository"/> test double whose alarm-attribute query first
/// cancels the <see cref="CancellationTokenSource"/> it was constructed with and
/// then throws an <see cref="OperationCanceledException"/> from the token passed to
/// the call, imitating the live SQL path being cancelled mid-await.
/// </summary>
private sealed class CancellingGalaxyRepository(CancellationTokenSource source) : IGalaxyRepository
{
    /// <inheritdoc />
    public Task<bool> TestConnectionAsync(CancellationToken ct = default)
    {
        return Task.FromResult(true);
    }

    /// <inheritdoc />
    public Task<DateTime?> GetLastDeployTimeAsync(CancellationToken ct = default)
    {
        DateTime? noDeployTime = null;
        return Task.FromResult(noDeployTime);
    }

    /// <inheritdoc />
    public Task<List<GalaxyHierarchyRow>> GetHierarchyAsync(CancellationToken ct = default)
    {
        return Task.FromResult<List<GalaxyHierarchyRow>>([]);
    }

    /// <inheritdoc />
    public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
    {
        return Task.FromResult<List<GalaxyAttributeRow>>([]);
    }

    /// <inheritdoc />
    public Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default)
    {
        // Cancel first so that the token check below throws whenever ct belongs to
        // the constructor-supplied source.
        source.Cancel();
        ct.ThrowIfCancellationRequested();

        // Only reached when ct is not tied to the cancelled source.
        return Task.FromResult<List<GalaxyAlarmAttributeRow>>([]);
    }
}
}
@@ -0,0 +1,791 @@
using System.Diagnostics.CodeAnalysis;
using System.Diagnostics.Metrics;
using System.Runtime.CompilerServices;
using System.Threading.Channels;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Server.Alarms;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using ZB.MOM.WW.MxGateway.Server.Sessions;
namespace ZB.MOM.WW.MxGateway.Tests.Alarms;
/// <summary>
/// Drives <see cref="GatewayAlarmMonitor"/> with a fake session manager to
/// verify it reflects the worker's <c>OnAlarmProviderModeChanged</c> event into
/// the alarm feed and the switch metric, and that a new subscriber receives the
/// provider status as its first message. Also covers the watch-list / forced-mode
/// wiring of the <c>SubscribeAlarms</c> command and the Mode→enum mapping.
/// </summary>
public sealed class GatewayAlarmMonitorProviderModeTests
{
private static readonly TimeSpan WaitTimeout = TimeSpan.FromSeconds(15);
/// <summary>
/// Emits a worker <c>OnAlarmProviderModeChanged</c> event that switches to subtag
/// mode and checks that a live feed subscriber receives a degraded
/// <c>ProviderStatus</c> message carrying the worker's reason, and that the
/// <c>mxgateway.alarms.provider_switches</c> counter totals exactly one.
/// </summary>
[Fact]
public async Task ProviderModeChange_BroadcastsDegradedStatus_AndIncrementsSwitchMetric()
{
    using GatewayMetrics metrics = new();
    long switchTotal = 0;

    // Only the switch counter published by this test's own meter is of interest.
    bool IsSwitchCounter(Instrument candidate) =>
        ReferenceEquals(candidate.Meter, metrics.Meter)
        && candidate.Name == "mxgateway.alarms.provider_switches";

    using MeterListener listener = new();
    listener.InstrumentPublished = (published, owner) =>
    {
        if (IsSwitchCounter(published))
        {
            owner.EnableMeasurementEvents(published);
        }
    };
    listener.SetMeasurementEventCallback<long>((reporter, delta, _, _) =>
    {
        if (IsSwitchCounter(reporter))
        {
            Interlocked.Add(ref switchTotal, delta);
        }
    });
    listener.Start();

    FakeSessionManager sessions = new();
    using GatewayAlarmMonitor monitor = CreateMonitor(sessions, metrics);
    using CancellationTokenSource monitorCts = new();
    await monitor.StartAsync(monitorCts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    // Attach a live feed reader. The mode-change event is held back until the reader
    // has taken its baseline ProviderStatus message, so the event cannot arrive
    // before the subscriber is registered.
    List<AlarmFeedMessage> inbox = [];
    TaskCompletionSource baselineSeen = new(TaskCreationOptions.RunContinuationsAsynchronously);
    using CancellationTokenSource streamCts = new();
    Task pump = Task.Run(async () =>
    {
        try
        {
            await foreach (AlarmFeedMessage item in monitor.StreamAsync(null, streamCts.Token))
            {
                lock (inbox)
                {
                    bool isBaseline = inbox.Count == 0;
                    inbox.Add(item);
                    if (isBaseline)
                    {
                        // First message in: the subscriber is live.
                        baselineSeen.TrySetResult();
                    }
                }
            }
        }
        catch (OperationCanceledException)
        {
            // Expected when the test cancels the stream.
        }
    });
    await baselineSeen.Task.WaitAsync(WaitTimeout);

    // Worker reports that the provider has flipped into subtag mode.
    OnAlarmProviderModeChangedEvent toSubtag = new()
    {
        Mode = AlarmProviderMode.Subtag,
        Reason = "alarmmgr failed",
        Hresult = unchecked((int)0x80004005),
        At = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
    };
    sessions.EmitEvent(new MxEvent { OnAlarmProviderModeChanged = toSubtag });

    static bool IsSubtagStatus(AlarmFeedMessage candidate)
    {
        if (candidate.PayloadCase != AlarmFeedMessage.PayloadOneofCase.ProviderStatus)
        {
            return false;
        }

        return candidate.ProviderStatus.Mode == AlarmProviderMode.Subtag;
    }

    AlarmFeedMessage degraded = await WaitForAsync(inbox, IsSubtagStatus, WaitTimeout);
    AlarmProviderStatus status = degraded.ProviderStatus;
    Assert.Equal(AlarmProviderMode.Subtag, status.Mode);
    Assert.True(status.Degraded);
    Assert.Equal("alarmmgr failed", status.Reason);

    await WaitUntilAsync(() => Interlocked.Read(ref switchTotal) >= 1, WaitTimeout);
    Assert.Equal(1, Interlocked.Read(ref switchTotal));

    await streamCts.CancelAsync();
    await pump;
    await monitorCts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Server-053: pins that every <c>OnAlarmProviderModeChanged</c> event counts as a
/// provider switch, even when it names the mode that is already current. The worker
/// is the authority on when a mode change occurred and the gateway applies no
/// from==to suppression, so two consecutive subtag events must leave
/// <c>provider_switches</c> at 2. The test exists so this cannot drift silently.
/// </summary>
[Fact]
public async Task ProviderModeChange_RepeatedSameMode_RecordsASwitchForEachEvent()
{
    using GatewayMetrics metrics = new();
    long switchTotal = 0;

    // Only the switch counter published by this test's own meter is of interest.
    bool IsSwitchCounter(Instrument candidate) =>
        ReferenceEquals(candidate.Meter, metrics.Meter)
        && candidate.Name == "mxgateway.alarms.provider_switches";

    using MeterListener listener = new();
    listener.InstrumentPublished = (published, owner) =>
    {
        if (IsSwitchCounter(published))
        {
            owner.EnableMeasurementEvents(published);
        }
    };
    listener.SetMeasurementEventCallback<long>((reporter, delta, _, _) =>
    {
        if (IsSwitchCounter(reporter))
        {
            Interlocked.Add(ref switchTotal, delta);
        }
    });
    listener.Start();

    FakeSessionManager sessions = new();
    using GatewayAlarmMonitor monitor = CreateMonitor(sessions, metrics);
    using CancellationTokenSource monitorCts = new();
    await monitor.StartAsync(monitorCts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    // Hold the events back until a live subscriber has taken its first message.
    List<AlarmFeedMessage> inbox = [];
    TaskCompletionSource baselineSeen = new(TaskCreationOptions.RunContinuationsAsynchronously);
    using CancellationTokenSource streamCts = new();
    Task pump = Task.Run(async () =>
    {
        try
        {
            await foreach (AlarmFeedMessage item in monitor.StreamAsync(null, streamCts.Token))
            {
                lock (inbox)
                {
                    bool isBaseline = inbox.Count == 0;
                    inbox.Add(item);
                    if (isBaseline)
                    {
                        baselineSeen.TrySetResult();
                    }
                }
            }
        }
        catch (OperationCanceledException)
        {
            // Expected when the test cancels the stream.
        }
    });
    await baselineSeen.Task.WaitAsync(WaitTimeout);

    void EmitSubtag(string reason)
    {
        OnAlarmProviderModeChangedEvent change = new()
        {
            Mode = AlarmProviderMode.Subtag,
            Reason = reason,
            At = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
        };
        sessions.EmitEvent(new MxEvent { OnAlarmProviderModeChanged = change });
    }

    // First subtag-mode event.
    EmitSubtag("alarmmgr failed");
    await WaitUntilAsync(() => Interlocked.Read(ref switchTotal) >= 1, WaitTimeout);

    // Second subtag-mode event: same mode, but still a worker-reported switch.
    EmitSubtag("still degraded");
    await WaitUntilAsync(() => Interlocked.Read(ref switchTotal) >= 2, WaitTimeout);
    Assert.Equal(2, Interlocked.Read(ref switchTotal));

    await streamCts.CancelAsync();
    await pump;
    await monitorCts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Tests-032: pins the monitor's <c>toMode → AlarmProviderSwitchReason</c>
/// derivation (<c>GatewayAlarmMonitor.ApplyProviderModeChangeAsync</c>). An
/// alarmmgr→subtag change must be tagged <c>reason=failover</c> and a
/// subtag→alarmmgr change <c>reason=failback</c>. The <c>reason</c> tag is read off
/// the <c>mxgateway.alarms.provider_switches</c> counter, so swapping the
/// Failover/Failback arms or collapsing them to Unknown fails here even though the
/// count-only tests would still pass.
/// </summary>
[Fact]
public async Task ProviderModeChange_FailoverThenFailback_RecordsCorrectReasonTags()
{
    using GatewayMetrics metrics = new();
    List<string> reasons = [];

    // Only the switch counter published by this test's own meter is of interest.
    bool IsSwitchCounter(Instrument candidate) =>
        ReferenceEquals(candidate.Meter, metrics.Meter)
        && candidate.Name == "mxgateway.alarms.provider_switches";

    using MeterListener listener = new();
    listener.InstrumentPublished = (published, owner) =>
    {
        if (IsSwitchCounter(published))
        {
            owner.EnableMeasurementEvents(published);
        }
    };
    listener.SetMeasurementEventCallback<long>((reporter, _, tags, _) =>
    {
        if (!IsSwitchCounter(reporter))
        {
            return;
        }

        // Keep every string-valued "reason" tag, in the order measurements arrive.
        foreach (KeyValuePair<string, object?> tag in tags)
        {
            if (tag is { Key: "reason", Value: string reasonText })
            {
                lock (reasons)
                {
                    reasons.Add(reasonText);
                }
            }
        }
    });
    listener.Start();

    FakeSessionManager sessions = new();
    using GatewayAlarmMonitor monitor = CreateMonitor(sessions, metrics);
    using CancellationTokenSource monitorCts = new();
    await monitor.StartAsync(monitorCts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    // Register a live subscriber and hold both mode-change events back until it has
    // taken the baseline ProviderStatus message, so neither event is dropped.
    List<AlarmFeedMessage> inbox = [];
    TaskCompletionSource baselineSeen = new(TaskCreationOptions.RunContinuationsAsynchronously);
    using CancellationTokenSource streamCts = new();
    Task pump = Task.Run(async () =>
    {
        try
        {
            await foreach (AlarmFeedMessage item in monitor.StreamAsync(null, streamCts.Token))
            {
                lock (inbox)
                {
                    bool isBaseline = inbox.Count == 0;
                    inbox.Add(item);
                    if (isBaseline)
                    {
                        baselineSeen.TrySetResult();
                    }
                }
            }
        }
        catch (OperationCanceledException)
        {
            // Expected when the test cancels the stream.
        }
    });
    await baselineSeen.Task.WaitAsync(WaitTimeout);

    void EmitModeChange(AlarmProviderMode mode, string reason)
    {
        OnAlarmProviderModeChangedEvent change = new()
        {
            Mode = mode,
            Reason = reason,
            At = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
        };
        sessions.EmitEvent(new MxEvent { OnAlarmProviderModeChanged = change });
    }

    bool HasAtLeast(int expectedCount)
    {
        lock (reasons)
        {
            return reasons.Count >= expectedCount;
        }
    }

    // alarmmgr (baseline) → subtag: must classify as a failover.
    EmitModeChange(AlarmProviderMode.Subtag, "alarmmgr failed");
    await WaitUntilAsync(() => HasAtLeast(1), WaitTimeout);

    // subtag → alarmmgr: must classify as a failback.
    EmitModeChange(AlarmProviderMode.Alarmmgr, "alarmmgr recovered");
    await WaitUntilAsync(() => HasAtLeast(2), WaitTimeout);

    lock (reasons)
    {
        Assert.Equal(new[] { "failover", "failback" }, reasons);
    }

    await streamCts.CancelAsync();
    await pump;
    await monitorCts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Verifies that the first message delivered to a new feed subscriber is the
/// <c>ProviderStatus</c> baseline and that, before any provider-mode event has been
/// emitted, it reports alarm-manager mode and is not degraded.
/// </summary>
[Fact]
public async Task NewSubscriber_ReceivesProviderStatusAsFirstMessage()
{
    using GatewayMetrics metrics = new();
    FakeSessionManager sessions = new();
    using GatewayAlarmMonitor monitor = CreateMonitor(sessions, metrics);
    using CancellationTokenSource cts = new();
    await monitor.StartAsync(cts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    using CancellationTokenSource streamCts = new();

    // The reader returns the first message as its task result. Awaiting that result
    // gives a proper happens-before edge (no captured local written on one thread
    // and polled on another) and surfaces a reader exception or an empty stream
    // immediately instead of hiding it behind a timeout.
    Task<AlarmFeedMessage?> reader = Task.Run<AlarmFeedMessage?>(async () =>
    {
        await foreach (AlarmFeedMessage message in monitor.StreamAsync(null, streamCts.Token))
        {
            return message;
        }

        return null;
    });

    // WaitAsync throws TimeoutException if nothing arrives within the budget.
    AlarmFeedMessage? first = await reader.WaitAsync(WaitTimeout);
    Assert.NotNull(first);
    Assert.Equal(AlarmFeedMessage.PayloadOneofCase.ProviderStatus, first!.PayloadCase);

    // Baseline before any provider-mode event: alarm-manager, not degraded.
    Assert.Equal(AlarmProviderMode.Alarmmgr, first.ProviderStatus.Mode);
    Assert.False(first.ProviderStatus.Degraded);

    await streamCts.CancelAsync();
    await cts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// With <c>Fallback.Mode = "ForceSubtag"</c> the baseline <c>ProviderStatus</c> sent
/// to a new subscriber must already report subtag mode and degraded, the
/// <c>mxgateway.alarms.provider_mode</c> gauge must read 2 (subtag), and the initial
/// state must not be recorded on <c>mxgateway.alarms.provider_switches</c>.
/// </summary>
[Fact]
public async Task ForceSubtagConfig_BaselinesProviderStatusToSubtagDegraded_WithoutSwitch()
{
    const string SwitchCounterName = "mxgateway.alarms.provider_switches";
    const string ModeGaugeName = "mxgateway.alarms.provider_mode";

    using GatewayMetrics metrics = new();
    long switchCount = 0;
    int gaugeValue = -1;

    // Matches an instrument of the given name published by this test's own meter.
    bool IsOwn(Instrument instrument, string name) =>
        ReferenceEquals(instrument.Meter, metrics.Meter) && instrument.Name == name;

    using MeterListener listener = new();
    listener.InstrumentPublished = (instrument, meterListener) =>
    {
        if (IsOwn(instrument, SwitchCounterName) || IsOwn(instrument, ModeGaugeName))
        {
            meterListener.EnableMeasurementEvents(instrument);
        }
    };
    listener.SetMeasurementEventCallback<long>((instrument, measurement, _, _) =>
    {
        if (IsOwn(instrument, SwitchCounterName))
        {
            Interlocked.Add(ref switchCount, measurement);
        }
    });
    listener.SetMeasurementEventCallback<int>((instrument, measurement, _, _) =>
    {
        if (IsOwn(instrument, ModeGaugeName))
        {
            Interlocked.Exchange(ref gaugeValue, measurement);
        }
    });
    listener.Start();

    FakeSessionManager sessions = new();
    using GatewayAlarmMonitor monitor = CreateMonitor(sessions, metrics, "ForceSubtag");
    using CancellationTokenSource cts = new();
    await monitor.StartAsync(cts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    using CancellationTokenSource streamCts = new();

    // The reader returns the first message as its task result. Awaiting that result
    // avoids polling a captured local written on another thread and surfaces a
    // reader exception or an empty stream immediately instead of as a timeout.
    Task<AlarmFeedMessage?> reader = Task.Run<AlarmFeedMessage?>(async () =>
    {
        await foreach (AlarmFeedMessage message in monitor.StreamAsync(null, streamCts.Token))
        {
            return message;
        }

        return null;
    });

    // WaitAsync throws TimeoutException if nothing arrives within the budget.
    AlarmFeedMessage? first = await reader.WaitAsync(WaitTimeout);
    Assert.NotNull(first);
    Assert.Equal(AlarmFeedMessage.PayloadOneofCase.ProviderStatus, first!.PayloadCase);
    Assert.Equal(AlarmProviderMode.Subtag, first.ProviderStatus.Mode);
    Assert.True(first.ProviderStatus.Degraded);

    // The observable gauge must read subtag (2) after start.
    listener.RecordObservableInstruments();
    Assert.Equal(2, Volatile.Read(ref gaugeValue));

    // The initial set must not record a provider switch.
    Assert.Equal(0, Interlocked.Read(ref switchCount));

    await streamCts.CancelAsync();
    await cts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// With <c>Fallback.Mode = "ForceAlarmManager"</c> the baseline
/// <c>ProviderStatus</c> sent to a new subscriber must report alarm-manager mode and
/// not degraded, the <c>mxgateway.alarms.provider_mode</c> gauge must read 1, and
/// the initial state must not be recorded on
/// <c>mxgateway.alarms.provider_switches</c>.
/// </summary>
[Fact]
public async Task ForceAlarmManagerConfig_BaselinesProviderStatusToAlarmmgr_WithoutSwitch()
{
    const string SwitchCounterName = "mxgateway.alarms.provider_switches";
    const string ModeGaugeName = "mxgateway.alarms.provider_mode";

    using GatewayMetrics metrics = new();
    long switchCount = 0;
    int gaugeValue = -1;

    // Matches an instrument of the given name published by this test's own meter.
    bool IsOwn(Instrument instrument, string name) =>
        ReferenceEquals(instrument.Meter, metrics.Meter) && instrument.Name == name;

    using MeterListener listener = new();
    listener.InstrumentPublished = (instrument, meterListener) =>
    {
        if (IsOwn(instrument, SwitchCounterName) || IsOwn(instrument, ModeGaugeName))
        {
            meterListener.EnableMeasurementEvents(instrument);
        }
    };
    listener.SetMeasurementEventCallback<long>((instrument, measurement, _, _) =>
    {
        if (IsOwn(instrument, SwitchCounterName))
        {
            Interlocked.Add(ref switchCount, measurement);
        }
    });
    listener.SetMeasurementEventCallback<int>((instrument, measurement, _, _) =>
    {
        if (IsOwn(instrument, ModeGaugeName))
        {
            Interlocked.Exchange(ref gaugeValue, measurement);
        }
    });
    listener.Start();

    FakeSessionManager sessions = new();
    using GatewayAlarmMonitor monitor = CreateMonitor(sessions, metrics, "ForceAlarmManager");
    using CancellationTokenSource cts = new();
    await monitor.StartAsync(cts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    using CancellationTokenSource streamCts = new();

    // The reader returns the first message as its task result. Awaiting that result
    // avoids polling a captured local written on another thread and surfaces a
    // reader exception or an empty stream immediately instead of as a timeout.
    Task<AlarmFeedMessage?> reader = Task.Run<AlarmFeedMessage?>(async () =>
    {
        await foreach (AlarmFeedMessage message in monitor.StreamAsync(null, streamCts.Token))
        {
            return message;
        }

        return null;
    });

    // WaitAsync throws TimeoutException if nothing arrives within the budget.
    AlarmFeedMessage? first = await reader.WaitAsync(WaitTimeout);
    Assert.NotNull(first);
    Assert.Equal(AlarmFeedMessage.PayloadOneofCase.ProviderStatus, first!.PayloadCase);
    Assert.Equal(AlarmProviderMode.Alarmmgr, first.ProviderStatus.Mode);
    Assert.False(first.ProviderStatus.Degraded);

    // Pull the observable gauge explicitly; it must read 1 for this configuration.
    listener.RecordObservableInstruments();
    Assert.Equal(1, Volatile.Read(ref gaugeValue));

    // The initial set must not record a provider switch.
    Assert.Equal(0, Interlocked.Read(ref switchCount));

    await streamCts.CancelAsync();
    await cts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Checks that the <c>SubscribeAlarms</c> command the monitor sends on start carries
/// the forced mode, the three failover tuning values from
/// <see cref="AlarmFallbackOptions"/>, and the watch-list returned by the resolver.
/// </summary>
[Fact]
public async Task SubscribeAlarms_SendsForcedModeAndWatchList_FromConfiguration()
{
    using GatewayMetrics metrics = new();
    FakeSessionManager sessions = new();

    AlarmSubtagTarget resolvedTarget = new()
    {
        AlarmFullReference = "Galaxy!Area.Tank01.Hi",
        ActiveSubtag = "Tank01.Hi.active",
    };
    StubWatchListResolver resolver = new([resolvedTarget]);

    AlarmFallbackOptions fallback = new()
    {
        Mode = "ForceSubtag",
        ConsecutiveFailureThreshold = 7,
        FailbackProbeIntervalSeconds = 11,
        FailbackStableProbes = 4,
    };
    AlarmsOptions alarms = new()
    {
        Enabled = true,
        SubscriptionExpression = @"\\NODE\Galaxy!Area",
        Fallback = fallback,
    };
    IOptions<GatewayOptions> gatewayOptions =
        Microsoft.Extensions.Options.Options.Create(new GatewayOptions { Alarms = alarms });

    using GatewayAlarmMonitor monitor = new(
        sessions,
        resolver,
        metrics,
        gatewayOptions,
        NullLogger<GatewayAlarmMonitor>.Instance);
    using CancellationTokenSource monitorCts = new();
    await monitor.StartAsync(monitorCts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    SubscribeAlarmsCommand command = Assert.IsType<SubscribeAlarmsCommand>(sessions.LastSubscribeCommand);
    Assert.Equal(AlarmProviderMode.Subtag, command.ForcedMode);
    Assert.Equal(7, command.Failover.ConsecutiveFailureThreshold);
    Assert.Equal(11, command.Failover.FailbackProbeIntervalSeconds);
    Assert.Equal(4, command.Failover.FailbackStableProbes);

    AlarmSubtagTarget sentTarget = Assert.Single(command.WatchList);
    Assert.Equal("Galaxy!Area.Tank01.Hi", sentTarget.AlarmFullReference);

    await monitorCts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Pins the mapping from the configured <c>Fallback.Mode</c> string to the
/// <c>ForcedMode</c> sent in the <c>SubscribeAlarms</c> command: the two force
/// values map case-insensitively to their provider mode, and "Auto", empty, or an
/// unrecognised string map to <see cref="AlarmProviderMode.Unspecified"/>.
/// </summary>
/// <param name="mode">The configured <c>Fallback.Mode</c> string.</param>
/// <param name="expected">The forced provider mode the command must carry.</param>
[Theory]
[InlineData("ForceAlarmManager", AlarmProviderMode.Alarmmgr)]
[InlineData("forcealarmmanager", AlarmProviderMode.Alarmmgr)]
[InlineData("ForceSubtag", AlarmProviderMode.Subtag)]
[InlineData("forcesubtag", AlarmProviderMode.Subtag)]
[InlineData("Auto", AlarmProviderMode.Unspecified)]
[InlineData("", AlarmProviderMode.Unspecified)]
[InlineData("nonsense", AlarmProviderMode.Unspecified)]
public async Task ModeString_MapsToForcedProviderMode(string mode, AlarmProviderMode expected)
{
    using GatewayMetrics metrics = new();
    FakeSessionManager sessions = new();
    AlarmsOptions options = new()
    {
        Enabled = true,
        SubscriptionExpression = @"\\NODE\Galaxy!Area",
        Fallback = new AlarmFallbackOptions { Mode = mode },
    };
    using GatewayAlarmMonitor monitor = new(
        sessions,
        new StubWatchListResolver([]),
        metrics,
        Microsoft.Extensions.Options.Options.Create(new GatewayOptions { Alarms = options }),
        NullLogger<GatewayAlarmMonitor>.Instance);
    using CancellationTokenSource cts = new();
    await monitor.StartAsync(cts.Token);
    await sessions.WaitForSubscribeAsync(WaitTimeout);

    // Assert.IsType instead of a null-forgiving dereference: a missing command then
    // fails as an assertion with a readable message, not as a NullReferenceException.
    SubscribeAlarmsCommand sent = Assert.IsType<SubscribeAlarmsCommand>(sessions.LastSubscribeCommand);
    Assert.Equal(expected, sent.ForcedMode);

    // Auto + empty watch-list preserves historical alarmmgr-only behaviour.
    if (expected == AlarmProviderMode.Unspecified)
    {
        Assert.Empty(sent.WatchList);
    }

    await cts.CancelAsync();
    await monitor.StopAsync(CancellationToken.None);
}
/// <summary>
/// Builds a <see cref="GatewayAlarmMonitor"/> with alarms enabled, a fixed
/// subscription expression, an empty watch-list resolver, and a null logger. The
/// fallback options are left at whatever <see cref="AlarmsOptions"/> defaults to.
/// </summary>
/// <param name="sessions">The fake session manager the monitor talks to.</param>
/// <param name="metrics">The metrics instance the monitor records into.</param>
/// <returns>A monitor that has been constructed but not started.</returns>
private static GatewayAlarmMonitor CreateMonitor(FakeSessionManager sessions, GatewayMetrics metrics)
{
    GatewayOptions gateway = new()
    {
        Alarms = new AlarmsOptions
        {
            Enabled = true,
            SubscriptionExpression = @"\\NODE\Galaxy!Area",
        },
    };
    StubWatchListResolver emptyWatchList = new([]);

    return new GatewayAlarmMonitor(
        sessions,
        emptyWatchList,
        metrics,
        Microsoft.Extensions.Options.Options.Create(gateway),
        NullLogger<GatewayAlarmMonitor>.Instance);
}
/// <summary>
/// Builds a <see cref="GatewayAlarmMonitor"/> with alarms enabled, a fixed
/// subscription expression, an empty watch-list resolver, a null logger, and the
/// given fallback mode string.
/// </summary>
/// <param name="sessions">The fake session manager the monitor talks to.</param>
/// <param name="metrics">The metrics instance the monitor records into.</param>
/// <param name="mode">The value assigned to <c>Fallback.Mode</c>.</param>
/// <returns>A monitor that has been constructed but not started.</returns>
private static GatewayAlarmMonitor CreateMonitor(FakeSessionManager sessions, GatewayMetrics metrics, string mode)
{
    GatewayOptions gateway = new()
    {
        Alarms = new AlarmsOptions
        {
            Enabled = true,
            SubscriptionExpression = @"\\NODE\Galaxy!Area",
            Fallback = new AlarmFallbackOptions { Mode = mode },
        },
    };
    StubWatchListResolver emptyWatchList = new([]);

    return new GatewayAlarmMonitor(
        sessions,
        emptyWatchList,
        metrics,
        Microsoft.Extensions.Options.Options.Create(gateway),
        NullLogger<GatewayAlarmMonitor>.Instance);
}
/// <summary>
/// Polls <paramref name="received"/> every 25 ms until it contains a message
/// matching <paramref name="predicate"/>, and returns the first such message.
/// </summary>
/// <param name="received">
/// The list the feed reader appends to. It is locked on for each scan, matching the
/// lock the reader takes when adding.
/// </param>
/// <param name="predicate">Selects the message being waited for.</param>
/// <param name="timeout">How long to keep polling before giving up.</param>
/// <returns>The first message in list order that satisfies the predicate.</returns>
/// <exception cref="TimeoutException">No matching message arrived within the timeout.</exception>
private static async Task<AlarmFeedMessage> WaitForAsync(
    List<AlarmFeedMessage> received,
    Func<AlarmFeedMessage, bool> predicate,
    TimeSpan timeout)
{
    // Monotonic clock: a wall-clock adjustment cannot shorten or lengthen the wait.
    long startedAt = Environment.TickCount64;
    while (true)
    {
        // Scan before testing the deadline, so the list is always inspected at least
        // once (even for a zero timeout) and once more after the final delay.
        lock (received)
        {
            AlarmFeedMessage? match = received.FirstOrDefault(predicate);
            if (match is not null)
            {
                return match;
            }
        }

        if (Environment.TickCount64 - startedAt >= timeout.TotalMilliseconds)
        {
            throw new TimeoutException("No matching AlarmFeedMessage was received in time.");
        }

        await Task.Delay(25);
    }
}
/// <summary>
/// Polls <paramref name="condition"/> every 25 ms until it returns <c>true</c>.
/// </summary>
/// <param name="condition">The condition to wait for; evaluated on the calling context.</param>
/// <param name="timeout">How long to keep polling before giving up.</param>
/// <exception cref="TimeoutException">The condition was still false once the timeout had elapsed.</exception>
private static async Task WaitUntilAsync(Func<bool> condition, TimeSpan timeout)
{
    // Monotonic clock: a wall-clock adjustment cannot shorten or lengthen the wait.
    long startedAt = Environment.TickCount64;
    while (true)
    {
        // Evaluate before testing the deadline, so the condition is always checked at
        // least once (even for a zero timeout) and once more after the final delay.
        if (condition())
        {
            return;
        }

        if (Environment.TickCount64 - startedAt >= timeout.TotalMilliseconds)
        {
            throw new TimeoutException("Condition was not met in time.");
        }

        await Task.Delay(25);
    }
}
/// <summary>
/// <see cref="IAlarmWatchListResolver"/> test double that ignores its inputs and
/// always resolves to the watch-list supplied at construction.
/// </summary>
private sealed class StubWatchListResolver(IReadOnlyList<AlarmSubtagTarget> targets) : IAlarmWatchListResolver
{
    /// <inheritdoc />
    public Task<IReadOnlyList<AlarmSubtagTarget>> ResolveAsync(
        AlarmsOptions options,
        CancellationToken cancellationToken = default)
    {
        return Task.FromResult(targets);
    }
}
/// <summary>
/// Bare-bones <see cref="ISessionManager"/> used to drive the monitor. It hands out
/// a freshly constructed session, remembers the last SubscribeAlarms command,
/// answers every command with an OK status, and lets the test push worker events
/// through an in-memory channel.
/// </summary>
private sealed class FakeSessionManager : ISessionManager
{
    private readonly Channel<WorkerEvent> _events = Channel.CreateUnbounded<WorkerEvent>();

    private readonly TaskCompletionSource _subscribed =
        new(TaskCreationOptions.RunContinuationsAsynchronously);

    /// <summary>Gets the most recent SubscribeAlarms command the monitor sent.</summary>
    public SubscribeAlarmsCommand? LastSubscribeCommand { get; private set; }

    /// <summary>Queues a worker event for the monitor's event stream.</summary>
    public void EmitEvent(MxEvent mxEvent)
    {
        WorkerEvent envelope = new() { Event = mxEvent };
        _events.Writer.TryWrite(envelope);
    }

    /// <summary>Completes once the monitor has issued its SubscribeAlarms command.</summary>
    public Task WaitForSubscribeAsync(TimeSpan timeout)
    {
        return _subscribed.Task.WaitAsync(timeout);
    }

    /// <inheritdoc />
    public Task<GatewaySession> OpenSessionAsync(
        SessionOpenRequest request,
        string? clientIdentity,
        CancellationToken cancellationToken)
    {
        // Fixed placeholder values; only the session id is unique per call.
        TimeSpan thirtySeconds = TimeSpan.FromSeconds(30);
        GatewaySession opened = new(
            Guid.NewGuid().ToString("N"),
            "Galaxy",
            "pipe-test",
            "nonce-test",
            clientIdentity,
            null,
            null,
            thirtySeconds,
            thirtySeconds,
            thirtySeconds,
            DateTimeOffset.UtcNow);
        return Task.FromResult(opened);
    }

    /// <inheritdoc />
    public Task<WorkerCommandReply> InvokeAsync(
        string sessionId,
        WorkerCommand command,
        CancellationToken cancellationToken)
    {
        // Record the subscribe command before releasing anyone waiting on it.
        if (command.Command is { Kind: MxCommandKind.SubscribeAlarms } subscribe)
        {
            LastSubscribeCommand = subscribe.SubscribeAlarms;
            _subscribed.TrySetResult();
        }

        MxCommandReply okReply = new()
        {
            ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok },
        };

        // An active-alarm query additionally gets an empty payload attached.
        if (command.Command is { Kind: MxCommandKind.QueryActiveAlarms })
        {
            okReply.QueryActiveAlarms = new QueryActiveAlarmsReplyPayload();
        }

        WorkerCommandReply envelope = new() { Reply = okReply };
        return Task.FromResult(envelope);
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
        string sessionId,
        [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        // Relay whatever EmitEvent queued until the channel completes or the token fires.
        await foreach (WorkerEvent queued in _events.Reader.ReadAllAsync(cancellationToken))
        {
            yield return queued;
        }
    }

    /// <inheritdoc />
    public bool TryGetSession(string sessionId, [MaybeNullWhen(false)] out GatewaySession session)
    {
        // This fake never tracks sessions, so every lookup misses.
        session = null;
        return false;
    }

    /// <inheritdoc />
    public Task<SessionCloseResult> CloseSessionAsync(string sessionId, CancellationToken cancellationToken)
    {
        // Completing the writer ends the ReadEventsAsync stream.
        _events.Writer.TryComplete();
        SessionCloseResult closed = new(sessionId, SessionState.Closed, AlreadyClosed: false);
        return Task.FromResult(closed);
    }

    /// <inheritdoc />
    public Task<SessionCloseResult> KillWorkerAsync(string sessionId, string reason, CancellationToken cancellationToken)
    {
        SessionCloseResult closed = new(sessionId, SessionState.Closed, AlreadyClosed: false);
        return Task.FromResult(closed);
    }

    /// <inheritdoc />
    public Task<int> CloseExpiredLeasesAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        return Task.FromResult(0);
    }

    /// <inheritdoc />
    public Task ShutdownAsync(CancellationToken cancellationToken)
    {
        return Task.CompletedTask;
    }
}
}
@@ -118,4 +118,175 @@ public sealed class GatewayOptionsValidatorTests
ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options);
Assert.True(result.Succeeded);
}
// -------------------------------------------------------------------------
// AlarmFallbackOptions validation
// -------------------------------------------------------------------------
/// <summary>
/// Builds an enabled <see cref="AlarmsOptions"/> with <c>DefaultArea = "Galaxy"</c>
/// and the given fallback settings.
/// </summary>
/// <param name="fallback">The fallback options under test.</param>
/// <returns>The alarms options to plug into a <see cref="GatewayOptions"/>.</returns>
private static AlarmsOptions EnabledAlarmsWithFallback(AlarmFallbackOptions fallback)
{
    AlarmsOptions alarms = new()
    {
        Enabled = true,
        DefaultArea = "Galaxy",
        Fallback = fallback,
    };
    return alarms;
}
/// <summary>
/// Creates a new <see cref="GatewayOptions"/> that shares every section of
/// <paramref name="source"/> by reference except <c>Alarms</c>, which is replaced.
/// </summary>
/// <param name="source">The options whose other sections are reused.</param>
/// <param name="alarms">The alarms section to use in the copy.</param>
/// <returns>The new options instance.</returns>
private static GatewayOptions CloneWithAlarms(GatewayOptions source, AlarmsOptions alarms)
{
    GatewayOptions copy = new()
    {
        Authentication = source.Authentication,
        Ldap = source.Ldap,
        Worker = source.Worker,
        Sessions = source.Sessions,
        Events = source.Events,
        Dashboard = source.Dashboard,
        Protocol = source.Protocol,
        Alarms = alarms,
        Tls = source.Tls,
    };
    return copy;
}
/// <summary>
/// Fallback settings are not validated while alarms are disabled: an unrecognised
/// <c>Mode</c> must still pass.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenAlarmsDisabled_FallbackNotValidated()
{
    GatewayOptions baseline = ValidOptions();

    // Even an invalid Mode is acceptable when Enabled = false.
    AlarmsOptions disabledAlarms = new()
    {
        Enabled = false,
        Fallback = new AlarmFallbackOptions { Mode = "InvalidMode" },
    };
    GatewayOptions options = CloneWithAlarms(baseline, disabledAlarms);

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
/// <summary>
/// A default-constructed <see cref="AlarmFallbackOptions"/> must be accepted when
/// alarms are enabled.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenAlarmsEnabled_DefaultAutoConfig()
{
    GatewayOptions baseline = ValidOptions();

    // Default AlarmFallbackOptions (Mode="Auto") must pass validation when alarms are enabled.
    AlarmsOptions alarms = EnabledAlarmsWithFallback(new AlarmFallbackOptions());
    GatewayOptions options = CloneWithAlarms(baseline, alarms);

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
/// <summary>
/// Each recognised fallback mode string, including differently-cased spellings,
/// must pass validation when alarms are enabled.
/// </summary>
/// <param name="mode">The <c>Fallback.Mode</c> value under test.</param>
[Theory]
[InlineData("Auto")]
[InlineData("ForceAlarmManager")]
[InlineData("ForceSubtag")]
[InlineData("auto")]
[InlineData("FORCESUBTAG")]
public void Validate_Succeeds_WhenAlarmsEnabled_RecognisedMode(string mode)
{
    AlarmFallbackOptions fallback = new() { Mode = mode };
    AlarmsOptions alarms = EnabledAlarmsWithFallback(fallback);
    GatewayOptions options = CloneWithAlarms(ValidOptions(), alarms);

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
/// <summary>
/// An unrecognised fallback mode must fail validation when alarms are enabled, with
/// a failure message that names both the <c>MxGateway:Alarms:Fallback</c> section
/// and <c>Mode</c>.
/// </summary>
[Fact]
public void Validate_Fails_WhenAlarmsEnabled_InvalidMode()
{
    GatewayOptions baseline = ValidOptions();
    AlarmsOptions alarms = EnabledAlarmsWithFallback(new AlarmFallbackOptions { Mode = "InvalidMode" });
    GatewayOptions options = CloneWithAlarms(baseline, alarms);

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Failed);
    Assert.Contains(
        outcome.Failures!,
        failure => failure.Contains("MxGateway:Alarms:Fallback") && failure.Contains("Mode"));
}
/// <summary>
/// <c>ForceSubtag</c> with the Galaxy Repository switched off and no
/// <c>IncludeAttributes</c> must fail validation, with a failure message that
/// mentions both <c>ForceSubtag</c> and <c>Discovery</c>.
/// </summary>
[Fact]
public void Validate_Fails_WhenForceSubtag_NoGalaxyRepository_NoIncludes()
{
    GatewayOptions baseline = ValidOptions();

    // ForceSubtag without galaxy repository and without IncludeAttributes must fail.
    AlarmFallbackOptions fallback = new()
    {
        Mode = "ForceSubtag",
        Discovery = new AlarmDiscoveryOptions
        {
            UseGalaxyRepository = false,
            IncludeAttributes = [],
        },
    };
    GatewayOptions options = CloneWithAlarms(baseline, EnabledAlarmsWithFallback(fallback));

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Failed);
    Assert.Contains(
        outcome.Failures!,
        failure => failure.Contains("ForceSubtag") && failure.Contains("Discovery"));
}
/// <summary>
/// <c>ForceSubtag</c> with the Galaxy Repository switched off passes validation as
/// long as <c>IncludeAttributes</c> has at least one entry.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenForceSubtag_NoGalaxyRepository_WithIncludes()
{
    GatewayOptions baseline = ValidOptions();

    // ForceSubtag without galaxy repository is allowed when IncludeAttributes is non-empty.
    AlarmFallbackOptions fallback = new()
    {
        Mode = "ForceSubtag",
        Discovery = new AlarmDiscoveryOptions
        {
            UseGalaxyRepository = false,
            IncludeAttributes = ["attr1"],
        },
    };
    GatewayOptions options = CloneWithAlarms(baseline, EnabledAlarmsWithFallback(fallback));

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
/// <summary>
/// <c>ForceSubtag</c> passes validation when the Galaxy Repository is enabled, even
/// though no <c>IncludeAttributes</c> are configured.
/// </summary>
[Fact]
public void Validate_Succeeds_WhenForceSubtag_WithGalaxyRepository()
{
    GatewayOptions baseline = ValidOptions();

    // ForceSubtag + UseGalaxyRepository=true (default) must pass even without IncludeAttributes.
    AlarmFallbackOptions fallback = new()
    {
        Mode = "ForceSubtag",
        Discovery = new AlarmDiscoveryOptions { UseGalaxyRepository = true },
    };
    GatewayOptions options = CloneWithAlarms(baseline, EnabledAlarmsWithFallback(fallback));

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Succeeded);
}
/// <summary>
/// A <c>ConsecutiveFailureThreshold</c> of zero or less must fail validation, with
/// a failure message that names the property.
/// </summary>
/// <param name="value">The out-of-range threshold.</param>
/// <param name="keyPart">Text the failure message is expected to contain.</param>
[Theory]
[InlineData(0, nameof(AlarmFallbackOptions.ConsecutiveFailureThreshold))]
[InlineData(-1, nameof(AlarmFallbackOptions.ConsecutiveFailureThreshold))]
public void Validate_Fails_WhenConsecutiveFailureThresholdBelowOne(int value, string keyPart)
{
    GatewayOptions baseline = ValidOptions();
    AlarmFallbackOptions fallback = new() { ConsecutiveFailureThreshold = value };
    GatewayOptions options = CloneWithAlarms(baseline, EnabledAlarmsWithFallback(fallback));

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Failed);
    Assert.Contains(outcome.Failures!, failure => failure.Contains(keyPart));
}
/// <summary>
/// A <c>FailbackProbeIntervalSeconds</c> of zero or less must fail validation, with
/// a failure message that names the property.
/// </summary>
/// <param name="value">The out-of-range interval.</param>
/// <param name="keyPart">Text the failure message is expected to contain.</param>
[Theory]
[InlineData(0, nameof(AlarmFallbackOptions.FailbackProbeIntervalSeconds))]
[InlineData(-5, nameof(AlarmFallbackOptions.FailbackProbeIntervalSeconds))]
public void Validate_Fails_WhenFailbackProbeIntervalSecondsBelowOne(int value, string keyPart)
{
    GatewayOptions baseline = ValidOptions();
    AlarmFallbackOptions fallback = new() { FailbackProbeIntervalSeconds = value };
    GatewayOptions options = CloneWithAlarms(baseline, EnabledAlarmsWithFallback(fallback));

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult outcome = validator.Validate(null, options);

    Assert.True(outcome.Failed);
    Assert.Contains(outcome.Failures!, failure => failure.Contains(keyPart));
}
/// <summary>
/// Verifies that a <c>FailbackStableProbes</c> of 0 or -1 fails validation
/// and that at least one failure message mentions the option name.
/// </summary>
/// <param name="value">The out-of-range probe count to validate.</param>
/// <param name="keyPart">Text that must appear in one of the failure messages.</param>
[Theory]
[InlineData(0, nameof(AlarmFallbackOptions.FailbackStableProbes))]
[InlineData(-1, nameof(AlarmFallbackOptions.FailbackStableProbes))]
public void Validate_Fails_WhenFailbackStableProbesBelowOne(int value, string keyPart)
{
    AlarmFallbackOptions fallback = new() { FailbackStableProbes = value };
    GatewayOptions options = CloneWithAlarms(ValidOptions(), EnabledAlarmsWithFallback(fallback));

    GatewayOptionsValidator validator = new();
    ValidateOptionsResult result = validator.Validate(null, options);

    Assert.True(result.Failed);
    Assert.Contains(result.Failures!, message => message.Contains(keyPart));
}
}
@@ -1332,6 +1332,56 @@ public sealed class ProtobufContractRoundTripTests
Assert.Equal(kind, parsed.Kind);
}
/// <summary>
/// Round-trips an <see cref="AlarmFeedMessage"/> whose payload is
/// <c>provider_status</c> and checks that the parsed copy equals the original,
/// reports <see cref="AlarmFeedMessage.PayloadOneofCase.ProviderStatus"/>, and
/// keeps the degraded flag and subtag mode.
/// </summary>
[Fact]
public void Feed_RoundTripsProviderStatus()
{
    AlarmProviderStatus status = new AlarmProviderStatus
    {
        Mode = AlarmProviderMode.Subtag,
        Degraded = true,
        Reason = "wnwrap poll failed 3x (HRESULT 0x80004005)",
        Since = Timestamp.FromDateTime(new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc)),
    };
    AlarmFeedMessage original = new AlarmFeedMessage { ProviderStatus = status };

    byte[] wire = original.ToByteArray();
    AlarmFeedMessage parsed = AlarmFeedMessage.Parser.ParseFrom(wire);

    Assert.Equal(original, parsed);
    Assert.Equal(AlarmFeedMessage.PayloadOneofCase.ProviderStatus, parsed.PayloadCase);
    Assert.True(parsed.ProviderStatus.Degraded);
    Assert.Equal(AlarmProviderMode.Subtag, parsed.ProviderStatus.Mode);
}
/// <summary>
/// Verifies that an <see cref="OnAlarmTransitionEvent"/> carrying the
/// <c>degraded</c> and <c>source_provider</c> provenance fields round-trips
/// through serialization: the parsed message equals the original as a whole,
/// and the two provenance fields keep their values.
/// </summary>
[Fact]
public void Transition_RoundTripsDegradedProvenance()
{
    var original = new OnAlarmTransitionEvent
    {
        AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
        TransitionKind = AlarmTransitionKind.Raise,
        Degraded = true,
        SourceProvider = AlarmProviderMode.Subtag,
    };

    var parsed = OnAlarmTransitionEvent.Parser.ParseFrom(original.ToByteArray());

    // Whole-message equality, matching the sibling round-trip tests in this file,
    // so the reference and transition kind set above are also covered.
    Assert.Equal(original, parsed);
    Assert.True(parsed.Degraded);
    Assert.Equal(AlarmProviderMode.Subtag, parsed.SourceProvider);
}
/// <summary>
/// Verifies that an <see cref="MxCommandReply"/> with kind
/// <see cref="MxCommandKind.ReadBulk"/> and a populated
@@ -1377,4 +1427,120 @@ public sealed class ProtobufContractRoundTripTests
Assert.Single(parsed.ReadBulk.Results);
Assert.True(parsed.ReadBulk.Results[0].WasCached);
}
/// <summary>
/// Round-trips an <see cref="ActiveAlarmSnapshot"/> that sets the
/// alarm-provider provenance fields <c>degraded</c> (14) and
/// <c>source_provider</c> (15), and checks that the parsed copy equals the
/// original and keeps both values. The same two provenance fields also
/// appear on <see cref="OnAlarmTransitionEvent"/>.
/// </summary>
[Fact]
public void ActiveAlarmSnapshot_RoundTripsDegradedProvenance()
{
    DateTime raisedAtUtc = new DateTime(2026, 6, 13, 12, 0, 0, DateTimeKind.Utc);
    ActiveAlarmSnapshot original = new ActiveAlarmSnapshot
    {
        AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
        SourceObjectReference = "Tank01",
        AlarmTypeName = "AnalogLimitAlarm.HiHi",
        Severity = 750,
        OriginalRaiseTimestamp = Timestamp.FromDateTime(raisedAtUtc),
        CurrentState = AlarmConditionState.Active,
        Degraded = true,
        SourceProvider = AlarmProviderMode.Subtag,
    };

    byte[] wire = original.ToByteArray();
    ActiveAlarmSnapshot parsed = ActiveAlarmSnapshot.Parser.ParseFrom(wire);

    Assert.Equal(original, parsed);
    Assert.True(parsed.Degraded);
    Assert.Equal(AlarmProviderMode.Subtag, parsed.SourceProvider);
}
/// <summary>
/// Round-trips a <see cref="SubscribeAlarmsCommand"/> that populates the
/// alarm-provider fallback extensions: <c>forced_mode</c> (2), one
/// <c>watch_list</c> entry (3) with six <see cref="AlarmSubtagTarget"/> string
/// fields set, and a <c>failover</c> <see cref="AlarmFailoverConfig"/> (4).
/// The parsed copy must equal the original and expose every populated value.
/// </summary>
[Fact]
public void SubscribeAlarmsCommand_RoundTripsForcedModeWatchListAndFailover()
{
    AlarmSubtagTarget watched = new AlarmSubtagTarget
    {
        AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
        SourceObjectReference = "Tank01",
        ActiveSubtag = "Tank01.Level.HiHi.InAlarm",
        AckedSubtag = "Tank01.Level.HiHi.Acked",
        AckCommentSubtag = "Tank01.Level.HiHi.AckMsg",
        PrioritySubtag = "Tank01.Level.HiHi.Priority",
    };
    SubscribeAlarmsCommand original = new SubscribeAlarmsCommand
    {
        SubscriptionExpression = @"\\node\Galaxy!Area",
        ForcedMode = AlarmProviderMode.Subtag,
        Failover = new AlarmFailoverConfig
        {
            ConsecutiveFailureThreshold = 3,
            FailbackProbeIntervalSeconds = 10,
            FailbackStableProbes = 5,
        },
    };
    original.WatchList.Add(watched);

    byte[] wire = original.ToByteArray();
    SubscribeAlarmsCommand parsed = SubscribeAlarmsCommand.Parser.ParseFrom(wire);

    Assert.Equal(original, parsed);
    Assert.Equal(AlarmProviderMode.Subtag, parsed.ForcedMode);

    // Exactly one watch-list entry must survive, with each subtag address intact.
    AlarmSubtagTarget target = Assert.Single(parsed.WatchList);
    Assert.Equal("Galaxy!Area.Tank01.Level.HiHi", target.AlarmFullReference);
    Assert.Equal("Tank01", target.SourceObjectReference);
    Assert.Equal("Tank01.Level.HiHi.InAlarm", target.ActiveSubtag);
    Assert.Equal("Tank01.Level.HiHi.Acked", target.AckedSubtag);
    Assert.Equal("Tank01.Level.HiHi.AckMsg", target.AckCommentSubtag);
    Assert.Equal("Tank01.Level.HiHi.Priority", target.PrioritySubtag);

    AlarmFailoverConfig failover = parsed.Failover;
    Assert.Equal(3, failover.ConsecutiveFailureThreshold);
    Assert.Equal(10, failover.FailbackProbeIntervalSeconds);
    Assert.Equal(5, failover.FailbackStableProbes);
}
/// <summary>
/// Round-trips an <see cref="MxEvent"/> whose body is an
/// <see cref="OnAlarmProviderModeChangedEvent"/> (the <c>MxEvent.body</c>
/// oneof tag 25, paired with
/// <see cref="MxEventFamily.OnAlarmProviderModeChanged"/>, family 6) and
/// checks that the parsed copy equals the original and resolves to
/// <see cref="MxEvent.BodyOneofCase.OnAlarmProviderModeChanged"/>.
/// </summary>
[Fact]
public void MxEvent_RoundTripsOnAlarmProviderModeChangedBody()
{
    // The HRESULT has its high bit set, so it is stored as a negative Int32.
    int failureHresult = unchecked((int)0x80004005);
    OnAlarmProviderModeChangedEvent body = new OnAlarmProviderModeChangedEvent
    {
        Mode = AlarmProviderMode.Subtag,
        Reason = "wnwrap poll failed 3x",
        Hresult = failureHresult,
        At = Timestamp.FromDateTime(new DateTime(2026, 6, 13, 9, 30, 0, DateTimeKind.Utc)),
    };
    MxEvent original = new MxEvent
    {
        Family = MxEventFamily.OnAlarmProviderModeChanged,
        SessionId = "session-1",
        WorkerSequence = 42,
        OnAlarmProviderModeChanged = body,
    };

    byte[] wire = original.ToByteArray();
    MxEvent parsed = MxEvent.Parser.ParseFrom(wire);

    Assert.Equal(original, parsed);
    Assert.Equal(MxEvent.BodyOneofCase.OnAlarmProviderModeChanged, parsed.BodyCase);
    Assert.Equal(MxEventFamily.OnAlarmProviderModeChanged, parsed.Family);
    Assert.Equal(AlarmProviderMode.Subtag, parsed.OnAlarmProviderModeChanged.Mode);
    Assert.Equal(failureHresult, parsed.OnAlarmProviderModeChanged.Hresult);
}
}
@@ -1,14 +1,14 @@
using System.Collections.Generic;
using ZB.MOM.WW.MxGateway.Server.Diagnostics;
using Xunit;
public class GatewayLogRedactorSeamTests
namespace ZB.MOM.WW.MxGateway.Tests.Diagnostics;
public sealed class GatewayLogRedactorSeamTests
{
[Fact]
public void Redact_MasksApiKeyInClientIdentity()
{
var redactor = new GatewayLogRedactorSeam();
var props = new Dictionary<string, object?> { ["ClientIdentity"] = "Bearer mxgw_operator01_super-secret" };
GatewayLogRedactorSeam redactor = new();
Dictionary<string, object?> props = new() { ["ClientIdentity"] = "Bearer mxgw_operator01_super-secret" };
redactor.Redact(props);
Assert.Equal("Bearer mxgw_operator01_[redacted]", props["ClientIdentity"]);
}
@@ -0,0 +1,68 @@
using ZB.MOM.WW.MxGateway.Server.Galaxy;
namespace ZB.MOM.WW.MxGateway.Tests.Galaxy;
/// <summary>
/// Database-free tests for <see cref="GalaxyRepository.MapAlarmRow"/>. They check the
/// FullTagReference / SourceObjectReference values the mapper carries over from the
/// <c>AlarmAttributesSql</c> projection, which yields <c>tag_name</c> as the source
/// object and <c>tag_name + '.' + attribute_name</c> as the full reference, the same
/// way <c>AttributesSql</c> does.
/// </summary>
public sealed class GalaxyAlarmAttributeMappingTests
{
    /// <summary>Verifies every projected column lands on the mapped row unchanged.</summary>
    [Fact]
    public void MapAlarmRow_CopiesProjectedColumns()
    {
        GalaxyAlarmAttributeRow mapped = MapTank01HiHi();

        Assert.Equal("Tank01.Level.HiHi", mapped.FullTagReference);
        Assert.Equal("Tank01", mapped.SourceObjectReference);
        Assert.Equal("TestArea", mapped.Area);
    }

    /// <summary>
    /// Verifies <see cref="GalaxyAlarmAttributeRow.AckCommentSubtag"/> comes back empty:
    /// the schema has no ack-comment address, so the watch-list resolver fills it in
    /// later from configuration.
    /// </summary>
    [Fact]
    public void MapAlarmRow_LeavesAckCommentSubtagEmpty()
    {
        GalaxyAlarmAttributeRow mapped = MapTank01HiHi();

        Assert.Equal(string.Empty, mapped.AckCommentSubtag);
    }

    /// <summary>
    /// Verifies the SourceObjectReference is the owning object (the SQL <c>tag_name</c>),
    /// i.e. the part of the full reference ahead of the first attribute dot, including
    /// when the attribute is itself a multi-segment extension path.
    /// </summary>
    /// <param name="tagName">The owning object name.</param>
    /// <param name="attributeName">The attribute path appended after the object name.</param>
    /// <param name="expectedFullReference">The full reference the row must report.</param>
    [Theory]
    [InlineData("Tank01", "Level.HiHi", "Tank01.Level.HiHi")]
    [InlineData("Pump_001", "Speed", "Pump_001.Speed")]
    [InlineData("TestAlarm001", "Alarm001", "TestAlarm001.Alarm001")]
    public void MapAlarmRow_SourceObjectIsSegmentBeforeFirstAttributeDot(
        string tagName,
        string attributeName,
        string expectedFullReference)
    {
        // Same shape as the AlarmAttributesSql projection: tag_name + '.' + attribute_name.
        string projectedReference = $"{tagName}.{attributeName}";

        GalaxyAlarmAttributeRow mapped = GalaxyRepository.MapAlarmRow(
            projectedReference,
            tagName,
            area: "TestArea");

        Assert.Equal(expectedFullReference, mapped.FullTagReference);
        Assert.Equal(tagName, mapped.SourceObjectReference);
        Assert.Equal("TestArea", mapped.Area);
        Assert.Equal(mapped.FullTagReference, mapped.SourceObjectReference + "." + attributeName);
    }

    /// <summary>Maps the fixed Tank01.Level.HiHi sample row shared by the fact tests.</summary>
    private static GalaxyAlarmAttributeRow MapTank01HiHi()
    {
        return GalaxyRepository.MapAlarmRow(
            fullTagReference: "Tank01.Level.HiHi",
            sourceObjectReference: "Tank01",
            area: "TestArea");
    }
}
@@ -378,6 +378,10 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
/// <inheritdoc />
public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
=> throw new InvalidOperationException("GetAttributesAsync should not be reached");
/// <inheritdoc />
public Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default)
{
    // Fails loudly if a test ever reaches this member.
    throw new InvalidOperationException("GetAlarmAttributesAsync should not be reached");
}
}
/// <summary>Snapshot store whose <see cref="SaveAsync"/> cancels the token mid-save.</summary>
@@ -465,6 +469,10 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
GetAttributesCount++;
return Task.FromResult(_attributes);
}
/// <inheritdoc />
public Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default)
{
    // Always completes synchronously with a new, empty list.
    List<GalaxyAlarmAttributeRow> noRows = new List<GalaxyAlarmAttributeRow>();
    return Task.FromResult(noRows);
}
}
/// <inheritdoc />
@@ -518,6 +526,10 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
GetAttributesCount++;
throw toThrow;
}
/// <inheritdoc />
public Task<List<GalaxyAlarmAttributeRow>> GetAlarmAttributesAsync(CancellationToken ct = default)
{
    // Throws the configured exception synchronously instead of returning a task.
    throw toThrow;
}
}
}
@@ -1,3 +1,4 @@
using Google.Protobuf.WellKnownTypes;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Contracts.Proto.Galaxy;
using ZB.MOM.WW.MxGateway.Server.Dashboard;
@@ -137,6 +138,151 @@ public sealed class DashboardBrowseAndAlarmModelTests
Assert.False(ackedRow.IsUnacknowledged);
}
/// <summary>
/// Verifies that a non-degraded alarmmgr provider status projects to the green
/// (<c>bg-success</c>) badge with the alarm-manager label.
/// </summary>
[Fact]
public void FromProviderStatus_Alarmmgr_NotDegraded_GreenBadge()
{
    var status = new AlarmProviderStatus
    {
        Mode = AlarmProviderMode.Alarmmgr,
        Degraded = false,
    };

    var model = DashboardAlarmProviderStatus.FromProviderStatus(status);

    Assert.False(model.IsDegraded);
    Assert.Contains("bg-success", model.BadgeCssClass, StringComparison.Ordinal);
    Assert.Equal(DashboardAlarmProviderStatus.AlarmManagerLabel, model.Label);
}
/// <summary>
/// Verifies that a degraded subtag provider status projects to the amber
/// (<c>bg-warning</c>) badge, carries the reason through, and uses the degraded label.
/// </summary>
[Fact]
public void FromProviderStatus_Subtag_Degraded_WarningBadge()
{
    var status = new AlarmProviderStatus
    {
        Mode = AlarmProviderMode.Subtag,
        Degraded = true,
        Reason = "x",
    };

    var model = DashboardAlarmProviderStatus.FromProviderStatus(status);

    Assert.True(model.IsDegraded);
    Assert.Contains("bg-warning", model.BadgeCssClass, StringComparison.Ordinal);
    Assert.Equal("x", model.Reason);

    // Tests-033: the label text is pinned as well as the CSS class, because a
    // swapped label would otherwise go unnoticed.
    Assert.Equal(DashboardAlarmProviderStatus.DegradedLabel, model.Label);
}
/// <summary>
/// Tests-033: a status flagged degraded while its mode is still Alarmmgr (the
/// second, independent branch of the <c>Degraded || Mode==Subtag</c> guard) must
/// also project to the degraded amber badge.
/// </summary>
[Fact]
public void FromProviderStatus_Alarmmgr_DegradedFlagSet_WarningBadge()
{
    var status = new AlarmProviderStatus
    {
        Mode = AlarmProviderMode.Alarmmgr,
        Degraded = true,
        Reason = "independently degraded",
    };

    var model = DashboardAlarmProviderStatus.FromProviderStatus(status);

    Assert.True(model.IsDegraded);
    Assert.Equal(DashboardAlarmProviderStatus.DegradedLabel, model.Label);
    Assert.Contains("bg-warning", model.BadgeCssClass, StringComparison.Ordinal);
}
/// <summary>
/// Tests-033: <c>SinceUtc</c> must equal the protobuf <c>Since</c> timestamp
/// expressed as a <see cref="DateTimeOffset" />.
/// </summary>
[Fact]
public void FromProviderStatus_WithSinceTimestamp_MapsSinceUtc()
{
    var expectedSince = new DateTimeOffset(2026, 6, 15, 12, 30, 0, TimeSpan.Zero);
    var status = new AlarmProviderStatus
    {
        Mode = AlarmProviderMode.Subtag,
        Degraded = true,
        Reason = "x",
        Since = Timestamp.FromDateTimeOffset(expectedSince),
    };

    var model = DashboardAlarmProviderStatus.FromProviderStatus(status);

    Assert.Equal(expectedSince, model.SinceUtc);
}
/// <summary>
/// Tests-033: <see cref="DashboardAlarmProviderStatus.FromFeed" />, the entry point
/// the dashboard SignalR snapshot path actually calls, turns a provider-status feed
/// message into the badge model.
/// </summary>
[Fact]
public void FromFeed_ProviderStatusPayload_ProjectsBadge()
{
    var providerStatus = new AlarmProviderStatus
    {
        Mode = AlarmProviderMode.Subtag,
        Degraded = true,
        Reason = "alarmmgr failed",
    };
    var message = new AlarmFeedMessage { ProviderStatus = providerStatus };

    var model = DashboardAlarmProviderStatus.FromFeed(message);

    Assert.Equal(AlarmProviderMode.Subtag, model.Mode);
    Assert.True(model.IsDegraded);
    Assert.Equal(DashboardAlarmProviderStatus.DegradedLabel, model.Label);
    Assert.Equal("alarmmgr failed", model.Reason);
}
/// <summary>
/// Tests-033: <see cref="DashboardAlarmProviderStatus.FromFeed" /> must throw
/// <see cref="ArgumentException" /> for a feed message whose payload is not a
/// provider status (here, a snapshot-complete marker).
/// </summary>
[Fact]
public void FromFeed_NonProviderStatusPayload_Throws()
{
    var message = new AlarmFeedMessage { SnapshotComplete = true };

    Action project = () => DashboardAlarmProviderStatus.FromFeed(message);

    Assert.Throws<ArgumentException>(project);
}
/// <summary>
/// Verifies that a subtag provider status whose reason is the configured
/// forced-subtag marker gets the distinct "forced" badge (cyan/info) and not the
/// amber failover-degraded one.
/// </summary>
[Fact]
public void FromProviderStatus_Subtag_ForcedReason_ForcedBadge()
{
    var status = new AlarmProviderStatus
    {
        Mode = AlarmProviderMode.Subtag,
        Degraded = true,
        Reason = ZB.MOM.WW.MxGateway.Server.Alarms.AlarmProviderReasons.ForcedSubtag,
    };

    var model = DashboardAlarmProviderStatus.FromProviderStatus(status);

    Assert.True(model.IsDegraded);
    Assert.Equal(DashboardAlarmProviderStatus.ForcedSubtagLabel, model.Label);

    // The info colour must be present and the warning colour absent.
    string badgeCss = model.BadgeCssClass;
    Assert.Contains("bg-info", badgeCss, StringComparison.Ordinal);
    Assert.DoesNotContain("bg-warning", badgeCss, StringComparison.Ordinal);
}
/// <summary>Verifies that the formatter renders array elements and element type correctly.</summary>
[Fact]
public void FormatValue_AndDataType_RenderArrayElementsAndElementType()
@@ -94,6 +94,24 @@ public sealed class DashboardSnapshotServiceTests
Assert.Equal("worker pipe disconnected", fault.Message);
}
/// <summary>
/// Verifies that, after one recorded provider switch, the snapshot metrics contain
/// <c>mxgateway.alarms.provider_switches</c> with a value of 1.
/// </summary>
[Fact]
public void GetSnapshot_IncludesAlarmProviderSwitchCountMetric()
{
    var registry = new SessionRegistry();
    using var metrics = new GatewayMetrics();
    metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);
    var service = CreateService(registry, metrics);

    var snapshot = service.GetSnapshot();

    Assert.Contains(
        snapshot.Metrics,
        entry => entry.Name == "mxgateway.alarms.provider_switches" && entry.Value == 1);
}
/// <summary>
/// Verifies snapshot redacts sensitive values from client identity, session name, and fault messages.
/// </summary>
@@ -1,3 +1,4 @@
using System.Diagnostics.Metrics;
using ZB.MOM.WW.MxGateway.Server.Metrics;
namespace ZB.MOM.WW.MxGateway.Tests.Metrics;
@@ -63,6 +64,99 @@ public sealed class GatewayMetricsTests
Assert.Equal("depth", exception.ParamName);
}
/// <summary>
/// Verifies that <see cref="GatewayMetrics.AlarmProviderSwitched"/> adds one to
/// <c>mxgateway.alarms.provider_switches</c> and tags the measurement with the
/// expected from/to/reason values. The listener only accepts instruments that belong
/// to this test's own <see cref="System.Diagnostics.Metrics.Meter"/> instance, which
/// keeps parallel tests from contaminating the capture (Tests-027).
/// </summary>
[Fact]
public void AlarmProviderSwitched_IncrementsCounterWithExpectedTags()
{
    const string counterName = "mxgateway.alarms.provider_switches";
    using var metrics = new GatewayMetrics();
    using var listener = new MeterListener();
    long total = 0;
    string? fromTag = null;
    string? toTag = null;
    string? reasonTag = null;

    // True only for the switch counter published by this test's meter.
    bool IsSwitchCounter(Instrument instrument) =>
        ReferenceEquals(instrument.Meter, metrics.Meter) && instrument.Name == counterName;

    listener.InstrumentPublished = (instrument, meterListener) =>
    {
        if (IsSwitchCounter(instrument))
        {
            meterListener.EnableMeasurementEvents(instrument);
        }
    };
    listener.SetMeasurementEventCallback<long>((instrument, measurement, tags, _) =>
    {
        if (IsSwitchCounter(instrument))
        {
            total += measurement;
            foreach (KeyValuePair<string, object?> tag in tags)
            {
                if (tag.Key == "from")
                {
                    fromTag = tag.Value as string;
                }
                else if (tag.Key == "to")
                {
                    toTag = tag.Value as string;
                }
                else if (tag.Key == "reason")
                {
                    reasonTag = tag.Value as string;
                }
            }
        }
    });
    listener.Start();

    metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);

    Assert.Equal(1, total);
    Assert.Equal("1", fromTag);
    Assert.Equal("2", toTag);
    Assert.Equal("failover", reasonTag);
    Assert.Equal(1, metrics.GetSnapshot().AlarmProviderSwitchCount);
}
/// <summary>
/// Verifies that after <see cref="GatewayMetrics.AlarmProviderSwitched"/> is called,
/// observing the <c>mxgateway.alarms.provider_mode</c> gauge reports the mode that
/// was switched to.
/// </summary>
[Fact]
public void AlarmProviderSwitched_UpdatesProviderModeGauge()
{
    const string gaugeName = "mxgateway.alarms.provider_mode";
    using var metrics = new GatewayMetrics();
    using var listener = new MeterListener();
    int? observedMode = null;

    // True only for the mode gauge published by this test's meter.
    bool IsModeGauge(Instrument instrument) =>
        ReferenceEquals(instrument.Meter, metrics.Meter) && instrument.Name == gaugeName;

    listener.InstrumentPublished = (instrument, meterListener) =>
    {
        if (IsModeGauge(instrument))
        {
            meterListener.EnableMeasurementEvents(instrument);
        }
    };
    listener.SetMeasurementEventCallback<int>((instrument, measurement, _, _) =>
    {
        if (!IsModeGauge(instrument))
        {
            return;
        }

        observedMode = measurement;
    });
    listener.Start();

    metrics.AlarmProviderSwitched(1, 2, AlarmProviderSwitchReason.Failover);

    // Explicitly ask the listener to record observable instruments so the gauge reports.
    listener.RecordObservableInstruments();

    Assert.Equal(2, observedMode);
}
/// <summary>Verifies that removing session events only affects that session.</summary>
[Fact]
public void RemoveSessionEvents_RemovesOnlyThatSession()
@@ -372,11 +372,11 @@ public sealed class AlarmCommandExecutorTests
public string? LastFilterPrefix { get; private set; }
/// <summary>Records a subscription.</summary>
/// <param name="subscription">The subscription expression.</param>
/// <param name="command">The subscribe-alarms command.</param>
/// <param name="sessionId">The session identifier.</param>
public void Subscribe(string subscription, string sessionId)
public void Subscribe(SubscribeAlarmsCommand command, string sessionId)
{
LastSubscription = subscription;
LastSubscription = command.SubscriptionExpression;
LastSessionId = sessionId;
}
@@ -21,7 +21,7 @@ public sealed class AlarmCommandHandlerTests
new MxAccessEventQueue(),
() => consumer);
handler.Subscribe(@"\\HOST\Galaxy!Area", "session-1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!Area" }, "session-1");
Assert.True(handler.IsSubscribed);
Assert.Equal(@"\\HOST\Galaxy!Area", consumer.LastSubscription);
@@ -36,9 +36,9 @@ public sealed class AlarmCommandHandlerTests
new MxAccessEventQueue(),
() => consumer);
handler.Subscribe(@"\\HOST\Galaxy!A", "s1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1");
Assert.Throws<InvalidOperationException>(
() => handler.Subscribe(@"\\HOST\Galaxy!B", "s1"));
() => handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!B" }, "s1"));
}
/// <summary>
@@ -63,7 +63,7 @@ public sealed class AlarmCommandHandlerTests
() => consumer);
InvalidOperationException exception = Assert.Throws<InvalidOperationException>(
() => handler.Subscribe(@"\\HOST\Galaxy!A", "s1"));
() => handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1"));
Assert.Contains("simulated wnwrap subscribe failure", exception.Message);
Assert.False(handler.IsSubscribed);
Assert.True(consumer.Disposed);
@@ -77,7 +77,7 @@ public sealed class AlarmCommandHandlerTests
AlarmCommandHandler handler = new AlarmCommandHandler(
new MxAccessEventQueue(),
() => consumer);
handler.Subscribe(@"\\HOST\Galaxy!A", "s1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1");
handler.Unsubscribe();
@@ -104,7 +104,7 @@ public sealed class AlarmCommandHandlerTests
AlarmCommandHandler handler = new AlarmCommandHandler(
new MxAccessEventQueue(),
() => consumer);
handler.Subscribe(@"\\HOST\Galaxy!A", "s1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1");
Guid g = Guid.NewGuid();
int rc = handler.Acknowledge(g, "c", "u", "n", "d", "F");
@@ -149,7 +149,7 @@ public sealed class AlarmCommandHandlerTests
AlarmCommandHandler handler = new AlarmCommandHandler(
new MxAccessEventQueue(),
() => consumer);
handler.Subscribe(@"\\HOST\Galaxy!A", "s1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1");
IReadOnlyList<ActiveAlarmSnapshot> snapshots = handler.QueryActive(null);
@@ -173,7 +173,7 @@ public sealed class AlarmCommandHandlerTests
AlarmCommandHandler handler = new AlarmCommandHandler(
new MxAccessEventQueue(),
() => consumer);
handler.Subscribe(@"\\HOST\Galaxy!A", "s1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1");
IReadOnlyList<ActiveAlarmSnapshot> filtered = handler.QueryActive("Galaxy!AreaA");
@@ -189,13 +189,13 @@ public sealed class AlarmCommandHandlerTests
AlarmCommandHandler handler = new AlarmCommandHandler(
new MxAccessEventQueue(),
() => consumer);
handler.Subscribe(@"\\HOST\Galaxy!A", "s1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1");
handler.Dispose();
Assert.True(consumer.Disposed);
Assert.Throws<ObjectDisposedException>(
() => handler.Subscribe("x", "y"));
() => handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = "x" }, "y"));
}
/// <summary>
@@ -218,7 +218,7 @@ public sealed class AlarmCommandHandlerTests
// factory is invoked. We tally invocation counts after each call so
// that a missed guard surfaces as the diagnostic count, not a generic
// "Subscribe should have failed".
handler.Subscribe(@"\\HOST\Galaxy!A", "s1");
handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1");
Assert.Equal(1, guardInvocations);
handler.Acknowledge(Guid.NewGuid(), "c", "u", "n", "d", "F");
@@ -254,7 +254,7 @@ public sealed class AlarmCommandHandlerTests
// Subscribe: guard runs before the dispatcher is constructed.
Assert.Throws<InvalidOperationException>(
() => handler.Subscribe(@"\\HOST\Galaxy!A", "s1"));
() => handler.Subscribe(new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" }, "s1"));
// To exercise the other entry points we need a subscribed handler.
// Construct a parallel handler with a passing guard, then swap in a
@@ -273,6 +273,132 @@ public sealed class AlarmCommandHandlerTests
Assert.Throws<InvalidOperationException>(() => handler.Unsubscribe());
}
/// <summary>
/// Worker-9: with ForcedMode=Subtag the handler builds a subtag consumer through
/// the injected standby factory and advises it; the primary (alarmmgr) consumer
/// is NOT created.
/// </summary>
[Fact]
public void Subscribe_WithForcedSubtagMode_BuildsStandbyConsumerOnly()
{
    var primary = new FakeConsumer();
    var standby = new FakeConsumer();
    IReadOnlyList<AlarmSubtagTarget>? capturedWatchList = null;
    var handler = new AlarmCommandHandler(
        new MxAccessEventQueue(),
        () => primary,
        threadAffinityCheck: null,
        comFactory: null,
        standbyFactory: watch =>
        {
            // Remember what the handler passed so the watch list can be checked below.
            capturedWatchList = watch;
            return standby;
        });
    var command = new SubscribeAlarmsCommand
    {
        SubscriptionExpression = @"\\HOST\Galaxy!Area",
        ForcedMode = AlarmProviderMode.Subtag,
        WatchList =
        {
            new AlarmSubtagTarget { AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi" },
        },
    };

    handler.Subscribe(command, "s1");

    Assert.True(handler.IsSubscribed);
    Assert.Equal(@"\\HOST\Galaxy!Area", standby.LastSubscription); // standby advised
    Assert.Null(primary.LastSubscription); // primary never built
    Assert.NotNull(capturedWatchList);
    Assert.Single(capturedWatchList!);
}
/// <summary>
/// Worker-9: ForcedMode=Unspecified together with a non-empty watch list yields a
/// failover composite (primary + subtag standby). With the primary rigged to throw
/// on subscribe and a failure threshold of 1, the composite switches to the subtag
/// provider, and the handler must enqueue an OnAlarmProviderModeChanged event
/// carrying mode=Subtag.
/// </summary>
[Fact]
public void Subscribe_AutoModeWithWatchList_FailoverModeChange_EnqueuesProviderModeChangedEvent()
{
    var primary = new FakeConsumer { ThrowOnSubscribe = true };
    var standby = new FakeConsumer();
    var queue = new MxAccessEventQueue();
    var handler = new AlarmCommandHandler(
        queue,
        () => primary,
        threadAffinityCheck: null,
        comFactory: null,
        standbyFactory: _ => standby);
    var command = new SubscribeAlarmsCommand
    {
        SubscriptionExpression = @"\\HOST\Galaxy!Area",
        ForcedMode = AlarmProviderMode.Unspecified,
        Failover = new AlarmFailoverConfig
        {
            ConsecutiveFailureThreshold = 1,
            FailbackProbeIntervalSeconds = 1,
            FailbackStableProbes = 1,
        },
        WatchList =
        {
            new AlarmSubtagTarget { AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi" },
        },
    };

    // The standby arms cleanly while the primary subscribe throws; at threshold 1
    // the failover composite moves to standby and raises ProviderModeChanged, which
    // the handler turns into a queued proto event.
    handler.Subscribe(command, "s1");

    IReadOnlyList<WorkerEvent> drained = queue.Drain(0);
    WorkerEvent only = Assert.Single(drained);
    MxEvent evt = only.Event;
    Assert.Equal(MxEventFamily.OnAlarmProviderModeChanged, evt.Family);
    Assert.Equal("s1", evt.SessionId);
    Assert.NotNull(evt.OnAlarmProviderModeChanged);
    Assert.Equal(AlarmProviderMode.Subtag, evt.OnAlarmProviderModeChanged.Mode);
}
/// <summary>
/// Worker-9: an alarmmgr-only subscribe (no failover) followed by an Unsubscribe
/// must leave the event queue empty, so no provider-mode-changed event is enqueued
/// or leaked.
/// </summary>
[Fact]
public void Subscribe_AlarmmgrOnly_DoesNotEnqueueProviderModeChangedEvent()
{
    var consumer = new FakeConsumer();
    var queue = new MxAccessEventQueue();
    var handler = new AlarmCommandHandler(queue, () => consumer);
    var command = new SubscribeAlarmsCommand { SubscriptionExpression = @"\\HOST\Galaxy!A" };

    handler.Subscribe(command, "s1");
    handler.Unsubscribe();

    Assert.Empty(queue.Drain(0));
}
/// <summary>
/// Worker-9: the mapper produces a well-formed OnAlarmProviderModeChanged MxEvent,
/// with the right family, the session id, and every body field populated from the
/// arguments.
/// </summary>
[Fact]
public void Mapper_CreateOnAlarmProviderModeChanged_PopulatesFamilyAndBody()
{
    var mapper = new MxAccessEventMapper();
    var changedAtUtc = new DateTime(2026, 6, 13, 10, 0, 0, DateTimeKind.Utc);
    int failureHresult = unchecked((int)0x80004005);

    MxEvent evt = mapper.CreateOnAlarmProviderModeChanged(
        "session-7",
        AlarmProviderMode.Subtag,
        "primary PollOnce failed",
        failureHresult,
        changedAtUtc);

    Assert.Equal(MxEventFamily.OnAlarmProviderModeChanged, evt.Family);
    Assert.Equal("session-7", evt.SessionId);
    Assert.NotNull(evt.OnAlarmProviderModeChanged);
    Assert.Equal(AlarmProviderMode.Subtag, evt.OnAlarmProviderModeChanged.Mode);
    Assert.Equal("primary PollOnce failed", evt.OnAlarmProviderModeChanged.Reason);
    Assert.Equal(failureHresult, evt.OnAlarmProviderModeChanged.Hresult);
    Assert.Equal(changedAtUtc, evt.OnAlarmProviderModeChanged.At.ToDateTime());
}
private static MxAlarmSnapshotRecord NewRecord(string provider, string group, string tag)
{
return new MxAlarmSnapshotRecord
@@ -62,6 +62,43 @@ public sealed class AlarmDispatcherTests
Assert.Equal("TestArea", body.Category);
Assert.NotNull(body.TransitionTimestamp);
Assert.Equal(ts, body.TransitionTimestamp.ToDateTime());
Assert.False(body.Degraded);
Assert.Equal(AlarmProviderMode.Alarmmgr, body.SourceProvider);
}
/// <summary>
/// Verifies that a transition enqueued with <c>degraded: true</c> (the subtag
/// fallback path) produces an event whose
/// <see cref="OnAlarmTransitionEvent.Degraded"/> flag is set and whose source
/// provider is <see cref="AlarmProviderMode.Subtag"/>.
/// </summary>
[Fact]
public void EnqueueTransition_WhenDegraded_MarksDegradedAndSubtagProvider()
{
    var queue = new MxAccessEventQueue();
    var sink = new MxAccessAlarmEventSink(queue, new MxAccessEventMapper());
    sink.Attach(new object(), SessionId);
    var transitionAtUtc = new DateTime(2026, 5, 1, 17, 26, 14, 709, DateTimeKind.Utc);

    sink.EnqueueTransition(
        alarmFullReference: "Galaxy!TestArea.TestMachine_001.TestAlarm001",
        sourceObjectReference: "TestMachine_001.TestAlarm001",
        alarmTypeName: "DSC",
        transitionKind: AlarmTransitionKind.Raise,
        severity: 500,
        originalRaiseTimestampUtc: null,
        transitionTimestampUtc: transitionAtUtc,
        operatorUser: string.Empty,
        operatorComment: string.Empty,
        category: "TestArea",
        description: string.Empty,
        degraded: true);

    // Exactly one event is expected, and it must be retrievable.
    Assert.Equal(1, queue.Count);
    bool dequeued = queue.TryDequeue(out WorkerEvent? dequeuedEvent);
    Assert.True(dequeued);

    OnAlarmTransitionEvent body = dequeuedEvent!.Event.OnAlarmTransition;
    Assert.True(body.Degraded);
    Assert.Equal(AlarmProviderMode.Subtag, body.SourceProvider);
}
/// <summary>Verifies that unchanged alarm states do not emit transitions.</summary>
@@ -242,6 +279,60 @@ public sealed class AlarmDispatcherTests
Assert.Equal(AlarmConditionState.ActiveAcked, snapshots[1].CurrentState);
}
/// <summary>
/// Verifies that the per-record subtag-fallback flag reaches the snapshot output:
/// a degraded record becomes an <see cref="ActiveAlarmSnapshot"/> with
/// <see cref="ActiveAlarmSnapshot.Degraded"/> set and
/// <see cref="AlarmProviderMode.Subtag"/> as its provider, while a non-degraded
/// record is reported as not degraded with <see cref="AlarmProviderMode.Alarmmgr"/>.
/// </summary>
[Fact]
public void SnapshotActiveAlarms_PropagatesDegradedAndSourceProvider()
{
    var consumer = new FakeAlarmConsumer();
    var transitionAtUtc = new DateTime(2026, 5, 1, 17, 26, 14, 709, DateTimeKind.Utc);
    var degradedRecord = new MxAlarmSnapshotRecord
    {
        AlarmGuid = Guid.NewGuid(),
        ProviderName = "Galaxy",
        Group = "TestArea",
        TagName = "Tag1",
        Type = "DSC",
        Priority = 500,
        State = MxAlarmStateKind.UnackAlm,
        TransitionTimestampUtc = transitionAtUtc,
        Degraded = true,
    };
    var healthyRecord = new MxAlarmSnapshotRecord
    {
        AlarmGuid = Guid.NewGuid(),
        ProviderName = "Galaxy",
        Group = "TestArea",
        TagName = "Tag2",
        Type = "ANL",
        Priority = 100,
        State = MxAlarmStateKind.UnackAlm,
        TransitionTimestampUtc = transitionAtUtc,
        Degraded = false,
    };
    consumer.SnapshotResult = new[] { degradedRecord, healthyRecord };
    var sink = new MxAccessAlarmEventSink(new MxAccessEventQueue(), new MxAccessEventMapper());
    using var dispatcher = new AlarmDispatcher(consumer, sink, SessionId);

    IReadOnlyList<ActiveAlarmSnapshot> snapshots = dispatcher.SnapshotActiveAlarms();

    Assert.Equal(2, snapshots.Count);

    // The assertions rely on snapshots coming back in the order the records were supplied.
    ActiveAlarmSnapshot first = snapshots[0];
    Assert.True(first.Degraded);
    Assert.Equal(AlarmProviderMode.Subtag, first.SourceProvider);

    ActiveAlarmSnapshot second = snapshots[1];
    Assert.False(second.Degraded);
    Assert.Equal(AlarmProviderMode.Alarmmgr, second.SourceProvider);
}
/// <summary>Verifies that dispose unsubscribes the handler and disposes the consumer.</summary>
[Fact]
public void Dispose_WhenSubscribed_UnsubscribesHandlerAndDisposesConsumer()
@@ -0,0 +1,473 @@
using System;
using System.Collections.Generic;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
using Xunit;
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
/// <summary>
/// Unit tests for <see cref="FailoverAlarmConsumer"/>: prove the
/// auto-failover (consecutive primary COM failures → standby) and
/// auto-failback (consecutive clean probes → primary) state machine,
/// active-child transition forwarding, and active-child delegation of
/// acknowledgments. Fakes stand in for both children so this needs no
/// AVEVA install.
/// </summary>
public sealed class FailoverAlarmConsumerTests
{
/// <summary>
/// Primary fake whose Subscribe/PollOnce throw a COMException while
/// <see cref="ThrowOnPoll"/> is set, modeling a wnwrap consumer that
/// surfaces COM HRESULT failures. Can also re-raise a transition so
/// before-failover forwarding can be exercised.
/// </summary>
private sealed class FlakyPrimary : IMxAccessAlarmConsumer
{
    /// <summary>HRESULT (E_FAIL) carried by every simulated COM failure.</summary>
    private const int SimulatedFailureHResult = unchecked((int)0x80004005);

    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>
    /// While set, both <see cref="Subscribe"/> and <see cref="PollOnce"/>
    /// throw a <see cref="System.Runtime.InteropServices.COMException"/>.
    /// Defaults to <see langword="true"/>.
    /// </summary>
    public bool ThrowOnPoll { get; set; } = true;

    /// <summary>
    /// When set, <see cref="PollOnce"/> throws
    /// <see cref="OutOfMemoryException"/> instead of a
    /// <see cref="System.Runtime.InteropServices.COMException"/>, to
    /// exercise the OOM-safe exception filter (Worker.Tests-032).
    /// </summary>
    public bool ThrowOutOfMemoryOnPoll { get; set; }

    /// <summary>
    /// Number of <see cref="PollOnce"/> calls, counted at entry before any throw.
    /// </summary>
    public int Polls { get; private set; }

    /// <summary>
    /// Number of times <see cref="Subscribe"/> has been called.
    /// Incremented at entry, before any throw, so every attempt is
    /// counted regardless of whether <see cref="ThrowOnPoll"/> is set.
    /// </summary>
    public int SubscribeCount { get; private set; }

    /// <summary>Counts the attempt, then fails while <see cref="ThrowOnPoll"/> is set.</summary>
    /// <param name="subscription">Subscription expression; ignored by the fake.</param>
    public void Subscribe(string subscription)
    {
        SubscribeCount++;
        if (ThrowOnPoll)
        {
            throw NewComFailure();
        }
    }

    /// <summary>
    /// Counts the poll, then throws according to the configured flags. The
    /// out-of-memory flag is checked first, so it wins when both are set.
    /// </summary>
    public void PollOnce()
    {
        Polls++;
        if (ThrowOutOfMemoryOnPoll)
        {
            throw new OutOfMemoryException("simulated allocation failure");
        }

        if (ThrowOnPoll)
        {
            throw NewComFailure();
        }
    }

    /// <summary>Returns the primary's sentinel value (11) so tests can tell which child handled the ack.</summary>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName) => 11;

    /// <summary>Returns the primary's sentinel value (11) so tests can tell which child handled the ack.</summary>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName) => 11;

    /// <summary>Always reports no active alarms.</summary>
    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms() => Array.Empty<MxAlarmSnapshotRecord>();

    /// <summary>Nothing to release in the fake.</summary>
    public void Dispose() { }

    /// <summary>Re-raises <paramref name="e"/> through <see cref="AlarmTransitionEmitted"/>.</summary>
    public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);

    /// <summary>Builds the COM failure shared by <see cref="Subscribe"/> and <see cref="PollOnce"/>.</summary>
    private static System.Runtime.InteropServices.COMException NewComFailure() =>
        new System.Runtime.InteropServices.COMException("boom", SimulatedFailureHResult);
}
/// <summary>
/// Standby fake (subtag stand-in): never throws from Subscribe/PollOnce,
/// records that it was armed, and can re-raise a transition.
/// </summary>
private sealed class StubStandby : IMxAccessAlarmConsumer
{
    public event EventHandler<MxAlarmTransitionEvent>? AlarmTransitionEmitted;

    /// <summary>Set once <see cref="Subscribe"/> has been called.</summary>
    public bool Subscribed { get; private set; }

    /// <summary>
    /// When set, <see cref="SnapshotActiveAlarms"/> throws — modeling a
    /// priming-snapshot failure during failover (Worker-026).
    /// </summary>
    public bool ThrowOnSnapshot { get; set; }

    /// <summary>
    /// Number of <see cref="SnapshotActiveAlarms"/> calls, counted at entry
    /// before any throw.
    /// </summary>
    public int SnapshotCalls { get; private set; }

    /// <summary>Records that the standby was armed.</summary>
    /// <param name="subscription">Subscription expression; ignored by the fake.</param>
    public void Subscribe(string subscription) => Subscribed = true;

    /// <summary>No-op: the standby never fails a poll.</summary>
    public void PollOnce() { }

    /// <summary>Returns the standby's sentinel value (22) so tests can tell which child handled the ack.</summary>
    public int AcknowledgeByGuid(
        Guid alarmGuid,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName) => 22;

    /// <summary>Returns the standby's sentinel value (22) so tests can tell which child handled the ack.</summary>
    public int AcknowledgeByName(
        string alarmName,
        string providerName,
        string groupName,
        string ackComment,
        string ackOperatorName,
        string ackOperatorNode,
        string ackOperatorDomain,
        string ackOperatorFullName) => 22;

    /// <summary>
    /// Counts the call, then either throws (when <see cref="ThrowOnSnapshot"/>
    /// is set) or reports no active alarms.
    /// </summary>
    public IReadOnlyList<MxAlarmSnapshotRecord> SnapshotActiveAlarms()
    {
        SnapshotCalls++;
        if (ThrowOnSnapshot)
        {
            throw new InvalidOperationException("priming snapshot failed");
        }

        return Array.Empty<MxAlarmSnapshotRecord>();
    }

    /// <summary>Nothing to release in the fake.</summary>
    public void Dispose() { }

    /// <summary>Re-raises <paramref name="e"/> through <see cref="AlarmTransitionEmitted"/>.</summary>
    public void Raise(MxAlarmTransitionEvent e) => AlarmTransitionEmitted?.Invoke(this, e);
}
/// <summary>
/// Builds a throwaway transition with a fresh alarm GUID and an
/// unspecified previous state, used to check event forwarding by reference.
/// </summary>
private static MxAlarmTransitionEvent SampleTransition()
{
    var record = new MxAlarmSnapshotRecord { AlarmGuid = Guid.NewGuid() };
    return new MxAlarmTransitionEvent
    {
        Record = record,
        PreviousState = MxAlarmStateKind.Unspecified,
    };
}
/// <summary>
/// With a threshold of three, the consumer stays on the primary for the
/// first two failures and switches to the subtag standby on the third,
/// reporting the failing HRESULT in the mode-change event.
/// </summary>
[Fact]
public void Primary_FailsThresholdTimes_SwitchesToSubtag()
{
    var failingPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var subtagStandby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 3, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(failingPrimary, subtagStandby, failoverSettings);
    var modeChanges = new List<AlarmProviderModeChange>();
    consumer.ProviderModeChanged += (_, change) => modeChanges.Add(change);

    // Failure 1 comes from the primary's Subscribe; the standby is armed regardless.
    consumer.Subscribe(@"\\HOST\Galaxy!Area");
    Assert.True(subtagStandby.Subscribed);
    Assert.Empty(modeChanges);

    // Failure 2: still below the threshold.
    consumer.PollOnce();
    Assert.Empty(modeChanges);

    // Failure 3: threshold reached, the switch happens.
    consumer.PollOnce();
    AlarmProviderModeChange switched = Assert.Single(modeChanges);
    Assert.Equal(AlarmProviderMode.Subtag, switched.Mode);
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);
    Assert.Equal(unchecked((int)0x80004005), switched.HResult);
}
/// <summary>
/// Once the consumer has failed over to the standby, a transition raised by
/// the standby is re-emitted (same instance) by the failover consumer.
/// </summary>
[Fact]
public void AfterSwitch_StandbyTransitionsAreForwarded()
{
    var failingPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var subtagStandby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(failingPrimary, subtagStandby, failoverSettings);
    MxAlarmTransitionEvent? received = null;
    consumer.AlarmTransitionEmitted += (_, transition) => received = transition;

    // threshold=1: the failing Subscribe switches to Subtag right away.
    consumer.Subscribe(@"\\HOST\Galaxy!Area");
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);

    MxAlarmTransitionEvent raised = SampleTransition();
    subtagStandby.Raise(raised);

    Assert.Same(raised, received);
}
/// <summary>
/// After failing over, a healed primary is restored only once
/// <c>stableProbes</c> consecutive probes succeed, and probing never
/// re-subscribes the primary.
/// </summary>
[Fact]
public void WhileDegraded_PrimaryHeals_FailsBackAfterStableProbes()
{
    // threshold=1: the failing initial Subscribe switches to Subtag at once.
    // stableProbes=2: two consecutive clean probes are required to fail back.
    // ProbeOnce must NOT call Subscribe — WnWrapAlarmConsumer is
    // single-subscribe; re-calling would always throw.
    var healingPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var subtagStandby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 2);
    using var consumer = new FailoverAlarmConsumer(healingPrimary, subtagStandby, failoverSettings);
    var modeChanges = new List<AlarmProviderModeChange>();
    consumer.ProviderModeChanged += (_, change) => modeChanges.Add(change);

    // Mode change 1: failover to Subtag.
    consumer.Subscribe(@"\\HOST\Galaxy!Area");
    Assert.Single(modeChanges);
    Assert.Equal(AlarmProviderMode.Subtag, modeChanges[modeChanges.Count - 1].Mode);

    // The primary heals: polls stop throwing. Recovery must be detected via
    // PollOnce alone, so remember the Subscribe count as the baseline.
    healingPrimary.ThrowOnPoll = false;
    int subscribeBaseline = healingPrimary.SubscribeCount;

    // Clean probe 1 of 2 — no failback yet.
    consumer.ProbeOnce();
    Assert.Single(modeChanges);

    // Clean probe 2 of 2 — mode change 2: failback to Alarmmgr.
    consumer.ProbeOnce();
    Assert.Equal(2, modeChanges.Count);
    AlarmProviderModeChange failback = modeChanges[modeChanges.Count - 1];
    Assert.Equal(AlarmProviderMode.Alarmmgr, failback.Mode);
    Assert.Equal(AlarmProviderMode.Alarmmgr, consumer.Mode);
    Assert.Equal(0, failback.HResult);

    // Probing must not have touched Subscribe.
    Assert.Equal(subscribeBaseline, healingPrimary.SubscribeCount);
}
/// <summary>
/// While the primary is the active child, its transitions are forwarded and
/// transitions raised by the inactive standby are suppressed.
/// </summary>
[Fact]
public void BeforeFailover_PrimaryTransitionsAreForwarded()
{
    // A healthy primary: never throws, but can still Raise.
    var healthyPrimary = new FlakyPrimary { ThrowOnPoll = false };
    var subtagStandby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 3, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(healthyPrimary, subtagStandby, failoverSettings);
    var received = new List<MxAlarmTransitionEvent>();
    consumer.AlarmTransitionEmitted += (_, transition) => received.Add(transition);

    consumer.Subscribe(@"\\HOST\Galaxy!Area");
    Assert.Equal(AlarmProviderMode.Alarmmgr, consumer.Mode);

    // The primary is active, so its transition passes through unchanged.
    MxAlarmTransitionEvent primaryTransition = SampleTransition();
    healthyPrimary.Raise(primaryTransition);
    MxAlarmTransitionEvent onlyForwarded = Assert.Single(received);
    Assert.Same(primaryTransition, onlyForwarded);

    // The standby is not active, so its transition is dropped.
    subtagStandby.Raise(SampleTransition());
    Assert.Single(received);
}
/// <summary>
/// Guards against <see cref="FailoverAlarmConsumer.ProbeOnce"/> calling
/// <c>Subscribe</c> on the primary while degraded. The production primary
/// (<see cref="WnWrapAlarmConsumer"/>) is single-subscribe, so a second
/// <c>Subscribe</c> would always throw and make failback impossible; the
/// probe has to re-poll the still-subscribed primary through
/// <c>PollOnce</c> only.
/// </summary>
[Fact]
public void ProbeOnce_DoesNotCallPrimarySubscribe()
{
    // threshold=1: the first failing Subscribe switches to Subtag.
    var healingPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var subtagStandby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 3);
    using var consumer = new FailoverAlarmConsumer(healingPrimary, subtagStandby, failoverSettings);

    // Subscribe attempt #1 throws and triggers the failover.
    consumer.Subscribe(@"\\HOST\Galaxy!Area");

    // Setup must have caused exactly one Subscribe call on the primary.
    int subscribeBaseline = healingPrimary.SubscribeCount;
    Assert.Equal(1, subscribeBaseline);
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);

    // Let polls succeed so the probes make progress.
    healingPrimary.ThrowOnPoll = false;

    // stableProbes=3: the third clean probe fails back.
    for (int probe = 0; probe < 3; probe++)
    {
        consumer.ProbeOnce();
    }

    Assert.Equal(AlarmProviderMode.Alarmmgr, consumer.Mode);
    Assert.Equal(subscribeBaseline, healingPrimary.SubscribeCount);
}
/// <summary>
/// Acknowledgments go to whichever child is active: the primary's sentinel
/// (11) before failover, the standby's sentinel (22) afterwards.
/// </summary>
[Fact]
public void Acknowledge_DelegatesToActiveChild()
{
    var primary = new FlakyPrimary { ThrowOnPoll = false };
    var standby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(primary, standby, failoverSettings);
    consumer.Subscribe(@"\\HOST\Galaxy!Area");

    // Primary active: both ack flavours return 11.
    int guidAckOnPrimary = consumer.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full");
    Assert.Equal(11, guidAckOnPrimary);
    int nameAckOnPrimary = consumer.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full");
    Assert.Equal(11, nameAckOnPrimary);

    // Break the primary; with threshold=1 a single failed poll switches to the standby.
    primary.ThrowOnPoll = true;
    consumer.PollOnce();
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);

    // Standby active: both ack flavours return 22.
    int guidAckOnStandby = consumer.AcknowledgeByGuid(Guid.NewGuid(), "c", "n", "node", "dom", "full");
    Assert.Equal(22, guidAckOnStandby);
    int nameAckOnStandby = consumer.AcknowledgeByName("a", "p", "g", "c", "n", "node", "dom", "full");
    Assert.Equal(22, nameAckOnStandby);
}
/// <summary>
/// Proves that an intermittent failure during failback probing resets the
/// clean-probe counter to zero, requiring a fresh unbroken run of
/// <see cref="FailoverSettings.StableProbes"/> before failing back.
/// </summary>
/// <remarks>
/// The mode is asserted after every probe that follows the failure. Without
/// those checks the test would also pass for an implementation that keeps the
/// counter across a failed probe: the counter would already stand at 2, the
/// next clean probe would fail back early, and only the final mode would be
/// observed.
/// </remarks>
[Fact]
public void FailbackProbe_IntermittentFailure_ResetsCleanCount()
{
    FlakyPrimary primary = new FlakyPrimary { ThrowOnPoll = true };
    StubStandby standby = new StubStandby();
    FailoverSettings settings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 3);
    using FailoverAlarmConsumer sut = new FailoverAlarmConsumer(primary, standby, settings);

    sut.Subscribe(@"\\HOST\Galaxy!Area"); // threshold=1 → switch to Subtag
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

    primary.ThrowOnPoll = false;
    sut.ProbeOnce(); // clean 1
    sut.ProbeOnce(); // clean 2
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode); // 2 < stableProbes=3

    primary.ThrowOnPoll = true;
    sut.ProbeOnce(); // fails → reset to 0
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);

    primary.ThrowOnPoll = false;
    sut.ProbeOnce(); // clean 1 — would be "clean 3" (early failback) if the counter had not been reset
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
    sut.ProbeOnce(); // clean 2
    Assert.Equal(AlarmProviderMode.Subtag, sut.Mode);
    sut.ProbeOnce(); // clean 3 → failback
    Assert.Equal(AlarmProviderMode.Alarmmgr, sut.Mode);
}
/// <summary>
/// Worker-026 regression: if the standby's priming
/// <c>SnapshotActiveAlarms</c> throws during failover, the switch must
/// still (a) raise <c>ProviderModeChanged</c> so the gateway learns the
/// feed went degraded, (b) leave <see cref="FailoverAlarmConsumer.Mode"/>
/// at Subtag, and (c) keep the exception from escaping — on the real STA
/// it would land in the poll loop's trailing catch and permanently stop
/// alarm delivery.
/// </summary>
[Fact]
public void Failover_WhenStandbyPrimingSnapshotThrows_StillRaisesModeChangeAndDoesNotRethrow()
{
    var failingPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var brokenSnapshotStandby = new StubStandby { ThrowOnSnapshot = true };
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(failingPrimary, brokenSnapshotStandby, failoverSettings);
    var modeChanges = new List<AlarmProviderModeChange>();
    consumer.ProviderModeChanged += (_, change) => modeChanges.Add(change);

    // threshold=1: the failing Subscribe triggers the switch, which primes the
    // standby snapshot (and that throws). Nothing may leak out of Subscribe.
    Exception? subscribeFailure = Record.Exception(() => consumer.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(subscribeFailure);
    AlarmProviderModeChange switched = Assert.Single(modeChanges);
    Assert.Equal(AlarmProviderMode.Subtag, switched.Mode);
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);
    Assert.True(brokenSnapshotStandby.SnapshotCalls >= 1); // priming was attempted

    // A later degraded PollOnce (standby.PollOnce + ProbeOnce) must not
    // rethrow the snapshot failure either.
    Exception? pollFailure = Record.Exception(() => consumer.PollOnce());
    Assert.Null(pollFailure);
}
/// <summary>
/// Worker-026 regression: a throwing <c>ProviderModeChanged</c> handler
/// (standing in for the AlarmCommandHandler's event-queue enqueue
/// overflowing at capacity) must neither undo the switch nor let its
/// exception escape the switch path into the poll loop.
/// </summary>
[Fact]
public void Failover_WhenModeChangedHandlerThrows_SwitchStillTakesEffectAndDoesNotRethrow()
{
    var failingPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var subtagStandby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(failingPrimary, subtagStandby, failoverSettings);
    int handlerCalls = 0;
    consumer.ProviderModeChanged += (_, _) =>
    {
        handlerCalls++;
        throw new InvalidOperationException("subscriber handler blew up");
    };

    Exception? subscribeFailure = Record.Exception(() => consumer.Subscribe(@"\\HOST\Galaxy!Area"));

    Assert.Null(subscribeFailure);

    // The event still fired exactly once...
    Assert.Equal(1, handlerCalls);

    // ...and the switch still took effect.
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);
}
/// <summary>
/// Worker.Tests-031 regression: with a non-zero
/// <see cref="FailoverSettings.ProbeIntervalSeconds"/>, the second of two
/// back-to-back <c>ProbeOnce</c> calls lands inside the interval and must
/// NOT re-poll the primary. The interval used here is 3600 seconds, so two
/// consecutive calls fall inside it without needing an injected clock.
/// </summary>
[Fact]
public void ProbeOnce_WithNonZeroInterval_ThrottlesSecondProbeWithinInterval()
{
    var healingPrimary = new FlakyPrimary { ThrowOnPoll = true };
    var subtagStandby = new StubStandby();

    // stableProbes is high enough that one clean probe cannot fail back, so
    // Mode stays Subtag and ProbeOnce stays on the throttled path.
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 3600, stableProbes: 5);
    using var consumer = new FailoverAlarmConsumer(healingPrimary, subtagStandby, failoverSettings);

    // threshold=1: the failing Subscribe switches to Subtag.
    consumer.Subscribe(@"\\HOST\Galaxy!Area");
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);

    // Heal the primary so that any probe which does run polls cleanly.
    healingPrimary.ThrowOnPoll = false;

    // First probe runs and re-polls the primary once.
    consumer.ProbeOnce();
    int pollBaseline = healingPrimary.Polls;
    Assert.Equal(1, pollBaseline);

    // Second probe is inside the 3600s interval: throttled, no further poll.
    consumer.ProbeOnce();
    Assert.Equal(pollBaseline, healingPrimary.Polls);
    Assert.Equal(AlarmProviderMode.Subtag, consumer.Mode);
}
/// <summary>
/// Worker.Tests-032 regression: <c>RunPrimary</c>'s
/// <c>when (ex is not OutOfMemoryException)</c> filter must let an
/// <see cref="OutOfMemoryException"/> propagate instead of swallowing it
/// and counting it toward the failover threshold. No mode change may fire —
/// a fatal allocation failure is not a clean degraded handoff.
/// </summary>
[Fact]
public void RunPrimary_WhenPrimaryThrowsOutOfMemory_PropagatesAndDoesNotFailOver()
{
    var oomPrimary = new FlakyPrimary { ThrowOnPoll = false, ThrowOutOfMemoryOnPoll = true };
    var subtagStandby = new StubStandby();
    var failoverSettings = new FailoverSettings(threshold: 1, probeIntervalSeconds: 0, stableProbes: 1);
    using var consumer = new FailoverAlarmConsumer(oomPrimary, subtagStandby, failoverSettings);
    bool sawModeChange = false;
    consumer.ProviderModeChanged += (_, _) => sawModeChange = true;

    // The fake's Subscribe does not throw with ThrowOnPoll cleared, so setup succeeds.
    consumer.Subscribe(@"\\HOST\Galaxy!Area");

    Action poll = () => consumer.PollOnce();
    Assert.Throws<OutOfMemoryException>(poll);

    Assert.False(sawModeChange);
    Assert.Equal(AlarmProviderMode.Alarmmgr, consumer.Mode);
}
/// <summary>
/// Worker.Tests-032 regression: <see cref="FailoverSettings"/> clamps a
/// <c>threshold</c> or <c>stableProbes</c> below 1 up to 1, and a
/// <c>probeIntervalSeconds</c> below 0 up to 0, so a misconfigured bind
/// cannot change failover semantics. In-range values pass through as given.
/// </summary>
[Theory]
[InlineData(0, 0, 0, 1, 0, 1)] // zeros: threshold/stableProbes raised to 1
[InlineData(-5, -5, -5, 1, 0, 1)] // negatives: all three clamped
[InlineData(3, 7, 2, 3, 7, 2)] // valid values: unchanged
public void FailoverSettings_ClampsSubMinimumValues(
    int threshold,
    int probeInterval,
    int stableProbes,
    int expectedThreshold,
    int expectedProbeInterval,
    int expectedStableProbes)
{
    var clamped = new FailoverSettings(threshold, probeInterval, stableProbes);

    Assert.Equal(expectedThreshold, clamped.Threshold);
    Assert.Equal(expectedProbeInterval, clamped.ProbeIntervalSeconds);
    Assert.Equal(expectedStableProbes, clamped.StableProbes);
}
}
@@ -0,0 +1,281 @@
using System;
using System.Collections.Generic;
using System.Linq;
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
/// <summary>
/// Unit-test coverage for <see cref="LmxSubtagAlarmSource"/>'s advise/write
/// sequencing and its <c>OnDataChange</c> normalization. The actual
/// <c>LMXProxyServerClass</c> COM event subscription cannot be exercised
/// without a live MXAccess install, so these tests drive the source through
/// its internal <see cref="IMxAccessServer"/> seam and call
/// <c>HandleDataChange</c> directly to simulate a COM callback — exactly the
/// boundary <c>MxAccessBaseEventSink.OnDataChange</c> uses for the
/// per-session pipeline. End-to-end COM delivery is covered by the
/// Skip-gated alarm live smoke tests.
/// </summary>
public sealed class LmxSubtagAlarmSourceTests
{
private const int FakeServerHandle = 7;
/// <summary>
/// The factory-taking constructor must throw
/// <see cref="ArgumentNullException"/> when handed a null factory.
/// </summary>
[Fact]
public void Constructor_NullFactory_Throws()
{
    Action construct = () =>
    {
        _ = new LmxSubtagAlarmSource(factory: null!);
    };

    Assert.Throws<ArgumentNullException>(construct);
}
/// <summary>
/// <see cref="LmxSubtagAlarmSource.Advise"/> must add and advise each
/// distinct address exactly once, so advising an address a second time is a
/// no-op, and every advise must target the supplied server handle.
/// </summary>
[Fact]
public void Advise_AddsAndAdvisesEachAddressOnce()
{
    RecordingMxAccessServer recorder = new RecordingMxAccessServer();
    using LmxSubtagAlarmSource subject = new LmxSubtagAlarmSource(recorder, FakeServerHandle);
    string[] bothAddresses = { "Tank1.Alarm.Subtag", "Tank2.Alarm.Subtag" };

    subject.Advise(bothAddresses);

    // Second request for an address that is already advised.
    subject.Advise(new[] { "Tank1.Alarm.Subtag" });

    // Each address was added once, in request order, and advised once.
    Assert.Equal(
        new[] { "Tank1.Alarm.Subtag", "Tank2.Alarm.Subtag" },
        recorder.AddedItems);
    Assert.Equal(2, recorder.AdviseCount);

    // All advises went to the handle the source was constructed with.
    Assert.All(recorder.AdvisedServerHandles, handle => Assert.Equal(FakeServerHandle, handle));
}
/// <summary>
/// A simulated <c>OnDataChange</c> for an advised item handle must raise
/// <see cref="LmxSubtagAlarmSource.ValueChanged"/> carrying the advised
/// address, the delivered value, and a UTC-kind timestamp.
/// </summary>
[Fact]
public void HandleDataChange_RaisesValueChangedWithAdvisedAddress()
{
    RecordingMxAccessServer recorder = new RecordingMxAccessServer();
    using LmxSubtagAlarmSource subject = new LmxSubtagAlarmSource(recorder, FakeServerHandle);
    subject.Advise(new[] { "Tank1.Alarm.Subtag" });
    int advisedHandle = recorder.LastItemHandleFor("Tank1.Alarm.Subtag");
    SubtagValueChange? observed = null;
    subject.ValueChanged += (_, change) => observed = change;

    // Simulate the COM callback; no timestamp is supplied.
    subject.HandleDataChange(advisedHandle, pvItemValue: 42, pftItemTimeStamp: null);

    Assert.NotNull(observed);
    SubtagValueChange delivered = observed!;
    Assert.Equal("Tank1.Alarm.Subtag", delivered.ItemAddress);
    Assert.Equal(42, delivered.Value);
    Assert.Equal(DateTimeKind.Utc, delivered.TimestampUtc.Kind);
}
/// <summary>
/// An <c>OnDataChange</c> for an item handle the source never handed out
/// must be ignored: <see cref="LmxSubtagAlarmSource.ValueChanged"/> is not
/// raised.
/// </summary>
[Fact]
public void HandleDataChange_UnknownHandle_DoesNotRaise()
{
    RecordingMxAccessServer recorder = new RecordingMxAccessServer();
    using LmxSubtagAlarmSource subject = new LmxSubtagAlarmSource(recorder, FakeServerHandle);
    int raiseCount = 0;
    subject.ValueChanged += (_, _) => raiseCount++;

    // Nothing was advised, so handle 999 is unknown to the source.
    subject.HandleDataChange(phItemHandle: 999, pvItemValue: 1, pftItemTimeStamp: null);

    Assert.Equal(0, raiseCount);
}
/// <summary>
/// <see cref="LmxSubtagAlarmSource.Write"/> on an address that was never
/// advised must first add the item and then issue a single write, against
/// the source's server handle, with user id 0.
/// </summary>
[Fact]
public void Write_AddsItemWhenUnknownAndWrites()
{
    RecordingMxAccessServer recorder = new RecordingMxAccessServer();
    using LmxSubtagAlarmSource subject = new LmxSubtagAlarmSource(recorder, FakeServerHandle);

    subject.Write("Tank1.Alarm.AckComment", "acknowledged");

    Assert.Contains("Tank1.Alarm.AckComment", recorder.AddedItems);
    RecordingMxAccessServer.WriteRecord onlyWrite = Assert.Single(recorder.Writes);
    Assert.Equal(FakeServerHandle, onlyWrite.ServerHandle);
    Assert.Equal("acknowledged", onlyWrite.Value);
    Assert.Equal(0, onlyWrite.UserId);
}
/// <summary>
/// When the address was already advised,
/// <see cref="LmxSubtagAlarmSource.Write"/> must reuse the existing item
/// handle rather than adding the item again.
/// </summary>
[Fact]
public void Write_ReusesHandleForAdvisedAddress()
{
    RecordingMxAccessServer recorder = new RecordingMxAccessServer();
    using LmxSubtagAlarmSource subject = new LmxSubtagAlarmSource(recorder, FakeServerHandle);
    subject.Advise(new[] { "Tank1.Alarm.Subtag" });
    int addsBeforeWrite = recorder.AddedItems.Count;

    subject.Write("Tank1.Alarm.Subtag", true);

    // The write produced no additional AddItem and exactly one Write.
    Assert.Equal(addsBeforeWrite, recorder.AddedItems.Count);
    Assert.Single(recorder.Writes);
}
/// <summary>
/// <see cref="LmxSubtagAlarmSource.Dispose"/> must UnAdvise exactly the
/// handles that were advised, remove every added item — including one that
/// was only ever added by <see cref="LmxSubtagAlarmSource.Write"/> — and
/// unregister the server a single time.
/// </summary>
[Fact]
public void Dispose_UnAdvisesOnlyAdvisedHandles_RemovesAll_AndUnregistersOnce()
{
    RecordingMxAccessServer recorder = new RecordingMxAccessServer();
    LmxSubtagAlarmSource subject = new LmxSubtagAlarmSource(recorder, FakeServerHandle);
    subject.Advise(new[] { "Tank1.Alarm.Subtag", "Tank2.Alarm.Subtag" });

    // Write-only subtag: Write adds it, but it is never advised.
    subject.Write("Tank1.Alarm.AckComment", "acknowledged");

    int firstAdvised = recorder.LastItemHandleFor("Tank1.Alarm.Subtag");
    int secondAdvised = recorder.LastItemHandleFor("Tank2.Alarm.Subtag");
    int writeOnlyHandle = recorder.LastItemHandleFor("Tank1.Alarm.AckComment");

    subject.Dispose();

    // UnAdvise hit the two advised handles and nothing else.
    int[] expectedUnadvised = { firstAdvised, secondAdvised };
    Assert.Equal(expectedUnadvised, recorder.UnAdvisedItemHandles);
    Assert.DoesNotContain(writeOnlyHandle, recorder.UnAdvisedItemHandles);

    // RemoveItem hit all three handles; removal order is not significant.
    int[] expectedRemoved = { firstAdvised, secondAdvised, writeOnlyHandle };
    Assert.Equal(
        expectedRemoved.OrderBy(handle => handle),
        recorder.RemovedItemHandles.OrderBy(handle => handle));

    Assert.Equal(1, recorder.UnregisterCount);
}
/// <summary>
/// Verifies <see cref="LmxSubtagAlarmSource.Dispose"/> is idempotent: a
/// second call performs no further teardown — no additional UnAdvise,
/// RemoveItem, or Unregister calls reach the server.
/// </summary>
[Fact]
public void Dispose_IsIdempotent()
{
    var server = new RecordingMxAccessServer();
    var source = new LmxSubtagAlarmSource(server, FakeServerHandle);
    source.Advise(new[] { "Tank1.Alarm.Subtag" });

    source.Dispose();
    int unadviseAfterFirst = server.UnAdvisedItemHandles.Count;
    int removeAfterFirst = server.RemovedItemHandles.Count;
    int unregisterAfterFirst = server.UnregisterCount;

    source.Dispose();

    Assert.Equal(unadviseAfterFirst, server.UnAdvisedItemHandles.Count);
    // RemoveItem is part of teardown too; a repeated removal must not slip through.
    Assert.Equal(removeAfterFirst, server.RemovedItemHandles.Count);
    Assert.Equal(unregisterAfterFirst, server.UnregisterCount);
}
/// <summary>
/// <see cref="IMxAccessServer"/> test double that records the
/// AddItem/Advise/Write/UnAdvise/RemoveItem/Unregister calls made by
/// <see cref="LmxSubtagAlarmSource"/>. Item handles are issued sequentially
/// starting at 100; the remaining interface members are no-ops.
/// </summary>
private sealed class RecordingMxAccessServer : IMxAccessServer
{
    // Most recent handle issued per item address (ordinal comparison).
    private readonly Dictionary<string, int> issuedHandles = new Dictionary<string, int>(StringComparer.Ordinal);

    // Next handle to hand out from AddItem.
    private int handleCursor = 100;

    /// <summary>Item definitions passed to AddItem/AddItem2, in call order.</summary>
    public List<string> AddedItems { get; } = new List<string>();

    /// <summary>Number of Advise calls received.</summary>
    public int AdviseCount { get; private set; }

    /// <summary>Server handle passed to each Advise call, in call order.</summary>
    public List<int> AdvisedServerHandles { get; } = new List<int>();

    /// <summary>One record per Write call, in call order.</summary>
    public List<WriteRecord> Writes { get; } = new List<WriteRecord>();

    /// <summary>Item handle passed to each UnAdvise call, in call order.</summary>
    public List<int> UnAdvisedItemHandles { get; } = new List<int>();

    /// <summary>Item handle passed to each RemoveItem call, in call order.</summary>
    public List<int> RemovedItemHandles { get; } = new List<int>();

    /// <summary>Number of Unregister calls received.</summary>
    public int UnregisterCount { get; private set; }

    /// <summary>
    /// Returns the handle most recently issued for <paramref name="itemAddress"/>;
    /// throws if the address was never added.
    /// </summary>
    public int LastItemHandleFor(string itemAddress)
    {
        return issuedHandles[itemAddress];
    }

    /// <summary>Always hands back the fixed fake server handle.</summary>
    public int Register(string clientName)
    {
        return FakeServerHandle;
    }

    /// <summary>Counts the call.</summary>
    public void Unregister(int serverHandle)
    {
        UnregisterCount++;
    }

    /// <summary>Records the item definition and issues the next sequential handle.</summary>
    public int AddItem(int serverHandle, string itemDefinition)
    {
        int issued = handleCursor++;
        AddedItems.Add(itemDefinition);
        issuedHandles[itemDefinition] = issued;
        return issued;
    }

    /// <summary>Behaves like <see cref="AddItem"/>; the context is ignored.</summary>
    public int AddItem2(int serverHandle, string itemDefinition, string itemContext)
    {
        return AddItem(serverHandle, itemDefinition);
    }

    /// <summary>Records the removed item handle.</summary>
    public void RemoveItem(int serverHandle, int itemHandle)
    {
        RemovedItemHandles.Add(itemHandle);
    }

    /// <summary>Counts the call and records which server handle it targeted.</summary>
    public void Advise(int serverHandle, int itemHandle)
    {
        AdvisedServerHandles.Add(serverHandle);
        AdviseCount++;
    }

    /// <summary>Records the unadvised item handle.</summary>
    public void UnAdvise(int serverHandle, int itemHandle)
    {
        UnAdvisedItemHandles.Add(itemHandle);
    }

    /// <summary>Not recorded.</summary>
    public void AdviseSupervisory(int serverHandle, int itemHandle) { }

    /// <summary>Captures the write as a <see cref="WriteRecord"/>.</summary>
    public void Write(int serverHandle, int itemHandle, object? value, int userId)
    {
        var record = new WriteRecord(serverHandle, itemHandle, value, userId);
        Writes.Add(record);
    }

    /// <summary>Not recorded.</summary>
    public void Write2(int serverHandle, int itemHandle, object? value, object? timestamp, int userId) { }

    /// <summary>Not recorded.</summary>
    public void WriteSecured(int serverHandle, int itemHandle, int currentUserId, int verifierUserId, object? value) { }

    /// <summary>Not recorded.</summary>
    public void WriteSecured2(int serverHandle, int itemHandle, int currentUserId, int verifierUserId, object? value, object? timestamp) { }

    /// <summary>Immutable capture of the arguments of one Write call.</summary>
    internal sealed class WriteRecord
    {
        public WriteRecord(int serverHandle, int itemHandle, object? value, int userId)
        {
            this.ServerHandle = serverHandle;
            this.ItemHandle = itemHandle;
            this.Value = value;
            this.UserId = userId;
        }

        /// <summary>Server handle the write targeted.</summary>
        public int ServerHandle { get; }

        /// <summary>Item handle the write targeted.</summary>
        public int ItemHandle { get; }

        /// <summary>Value that was written.</summary>
        public object? Value { get; }

        /// <summary>User id supplied with the write.</summary>
        public int UserId { get; }
    }
}
}
@@ -200,7 +200,7 @@ public sealed class MxAccessStaSessionTests
factory,
eventSink,
new MxAccessEventQueue(),
(_eq, _affinity) => handler);
(_eq, _affinity, _comFactory) => handler);
await session.StartAsync("session-1", workerProcessId: 1);
@@ -279,7 +279,7 @@ public sealed class MxAccessStaSessionTests
factory,
eventSink,
new MxAccessEventQueue(),
(_eq, _affinity) => handler);
(_eq, _affinity, _comFactory) => handler);
await session.StartAsync("session-1", workerProcessId: 1);
@@ -320,7 +320,7 @@ public sealed class MxAccessStaSessionTests
factory,
eventSink,
new MxAccessEventQueue(),
(_eq, _affinity) => handler);
(_eq, _affinity, _comFactory) => handler);
await session.StartAsync("session-1", workerProcessId: 1);
@@ -369,7 +369,7 @@ public sealed class MxAccessStaSessionTests
factory,
eventSink,
eventQueue,
(_eq, _affinity) => handler);
(_eq, _affinity, _comFactory) => handler);
await session.StartAsync("session-1", workerProcessId: 1);
@@ -416,7 +416,7 @@ public sealed class MxAccessStaSessionTests
factory,
eventSink,
eventQueue,
(_eq, _affinity) => handler);
(_eq, _affinity, _comFactory) => handler);
await session.StartAsync("session-1", workerProcessId: 1);
@@ -496,12 +496,12 @@ public sealed class MxAccessStaSessionTests
}
/// <summary>Subscribes to alarm events.</summary>
/// <param name="subscription">The subscription descriptor.</param>
/// <param name="command">The subscribe-alarms command.</param>
/// <param name="sessionId">The session identifier.</param>
public void Subscribe(string subscription, string sessionId)
public void Subscribe(SubscribeAlarmsCommand command, string sessionId)
{
IsSubscribed = true;
LastSubscription = subscription;
LastSubscription = command.SubscriptionExpression;
}
/// <summary>Unsubscribes from alarm events.</summary>
@@ -0,0 +1,331 @@
using System;
using System.Collections.Generic;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
using Xunit;
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
/// <summary>
/// Unit tests for <see cref="SubtagAlarmConsumer"/>: prove that the subtag
/// fallback advises the observable subtags, synthesizes degraded
/// transitions with stable synthetic GUIDs, routes acknowledgments to the
/// ack-comment subtag, and stamps snapshots. A <see cref="FakeSource"/>
/// stands in for the live MXAccess subtag source so this needs no AVEVA
/// install.
/// </summary>
public sealed class SubtagAlarmConsumerTests
{
private const string Reference = "Galaxy!TestArea.Tank01.Level.HiHi";
private const string ActiveSubtag = "Tank01.Level.HiHi.InAlarm";
private const string AckedSubtag = "Tank01.Level.HiHi.Acked";
private const string AckCommentSubtag = "Tank01.Level.HiHi.AckComment";
private const string PrioritySubtag = "Tank01.Level.HiHi.Priority";
/// <summary>
/// Builds the single alarm target used by these tests, wired to the
/// subtag address constants declared on this class.
/// </summary>
private static AlarmSubtagTarget BuildTarget() => new AlarmSubtagTarget
{
    AlarmFullReference = Reference,
    SourceObjectReference = "Tank01",
    ActiveSubtag = ActiveSubtag,
    AckedSubtag = AckedSubtag,
    AckCommentSubtag = AckCommentSubtag,
    PrioritySubtag = PrioritySubtag,
};
/// <summary>
/// Creates a <see cref="SubtagAlarmConsumer"/> over <paramref name="source"/>
/// with the one target from <see cref="BuildTarget"/>.
/// </summary>
private static SubtagAlarmConsumer BuildConsumer(FakeSource source)
{
    AlarmSubtagTarget[] targets = { BuildTarget() };
    return new SubtagAlarmConsumer(source, targets);
}
/// <summary>Verifies Subscribe advises the active, acked, priority, and ack-comment subtags (the ack-comment is advised so it is an active, writable MXAccess item).</summary>
[Fact]
public void Subscribe_AdvisesAllSubtagsIncludingAckComment()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
Assert.Contains(ActiveSubtag, source.Advised);
Assert.Contains(AckedSubtag, source.Advised);
Assert.Contains(PrioritySubtag, source.Advised);
Assert.Contains(AckCommentSubtag, source.Advised);
}
/// <summary>Verifies an active=true value change raises a degraded, GUID-stamped UNACK_ALM transition.</summary>
[Fact]
public void ValueChange_RaisesDegradedSynthesizedTransition()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
MxAlarmTransitionEvent? emitted = null;
consumer.AlarmTransitionEmitted += (_, e) => emitted = e;
source.Raise(ActiveSubtag, true, new DateTime(2026, 6, 13, 10, 0, 0, DateTimeKind.Utc));
Assert.NotNull(emitted);
Assert.Equal(MxAlarmStateKind.UnackAlm, emitted!.Record.State);
Assert.True(emitted.Record.Degraded);
Assert.NotEqual(Guid.Empty, emitted.Record.AlarmGuid);
}
/// <summary>Verifies AcknowledgeByName writes the comment to the ack-comment subtag and returns success.</summary>
[Fact]
public void AcknowledgeByName_WritesCommentToAckCommentSubtag()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
int rc = consumer.AcknowledgeByName(
alarmName: "Tank01.Level.HiHi",
providerName: "Galaxy",
groupName: "TestArea",
ackComment: "operator ack",
ackOperatorName: "alice",
ackOperatorNode: "WS01",
ackOperatorDomain: "CORP",
ackOperatorFullName: "Alice Smith");
Assert.Equal(0, rc);
Assert.NotNull(source.LastWrite);
Assert.Equal(AckCommentSubtag, source.LastWrite!.Value.Address);
Assert.Equal("operator ack", source.LastWrite!.Value.Value);
}
/// <summary>Verifies AcknowledgeByName returns non-zero when no target matches the supplied name.</summary>
[Fact]
public void AcknowledgeByName_UnknownAlarm_ReturnsNonZero()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
int rc = consumer.AcknowledgeByName(
alarmName: "DoesNotExist.NoSuchAlarm",
providerName: "Galaxy",
groupName: "TestArea",
ackComment: "operator ack",
ackOperatorName: "alice",
ackOperatorNode: "WS01",
ackOperatorDomain: "CORP",
ackOperatorFullName: "Alice Smith");
Assert.NotEqual(0, rc);
Assert.Null(source.LastWrite);
}
/// <summary>Verifies a snapshot of an active alarm stamps Degraded and a non-empty synthetic GUID.</summary>
[Fact]
public void SnapshotActiveAlarms_StampsDegradedAndGuid()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
source.Raise(ActiveSubtag, true, new DateTime(2026, 6, 13, 10, 0, 0, DateTimeKind.Utc));
IReadOnlyList<MxAlarmSnapshotRecord> snapshot = consumer.SnapshotActiveAlarms();
Assert.Single(snapshot);
Assert.True(snapshot[0].Degraded);
Assert.NotEqual(Guid.Empty, snapshot[0].AlarmGuid);
}
/// <summary>Verifies the synthetic GUID on the emitted transition equals the GUID in the snapshot for the same alarm.</summary>
[Fact]
public void SameReference_SyntheticGuidStableAcrossTransitionAndSnapshot()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
MxAlarmTransitionEvent? emitted = null;
consumer.AlarmTransitionEmitted += (_, e) => emitted = e;
source.Raise(ActiveSubtag, true, new DateTime(2026, 6, 13, 10, 0, 0, DateTimeKind.Utc));
IReadOnlyList<MxAlarmSnapshotRecord> snapshot = consumer.SnapshotActiveAlarms();
Assert.NotNull(emitted);
Assert.Single(snapshot);
Assert.Equal(emitted!.Record.AlarmGuid, snapshot[0].AlarmGuid);
Assert.NotEqual(Guid.Empty, emitted.Record.AlarmGuid);
}
/// <summary>
/// Verifies that when two alarm targets share a prefix (e.g. Level.Hi vs Level.HiHi),
/// AcknowledgeByName routes each ack to its own ack-comment subtag and never
/// conflates the shorter name with the longer one.
/// </summary>
[Fact]
public void AcknowledgeByName_PrefixNameDoesNotFalseMatch()
{
const string ReferenceHi = "Galaxy!Area.Tank01.Level.Hi";
const string ReferenceHiHi = "Galaxy!Area.Tank01.Level.HiHi";
const string AckCommentHi = "Tank01.Level.Hi.AckComment";
const string AckCommentHiHi = "Tank01.Level.HiHi.AckComment";
AlarmSubtagTarget targetHi = new AlarmSubtagTarget
{
AlarmFullReference = ReferenceHi,
SourceObjectReference = "Tank01",
ActiveSubtag = "Tank01.Level.Hi.InAlarm",
AckedSubtag = "Tank01.Level.Hi.Acked",
AckCommentSubtag = AckCommentHi,
PrioritySubtag = "Tank01.Level.Hi.Priority",
};
AlarmSubtagTarget targetHiHi = new AlarmSubtagTarget
{
AlarmFullReference = ReferenceHiHi,
SourceObjectReference = "Tank01",
ActiveSubtag = "Tank01.Level.HiHi.InAlarm",
AckedSubtag = "Tank01.Level.HiHi.Acked",
AckCommentSubtag = AckCommentHiHi,
PrioritySubtag = "Tank01.Level.HiHi.Priority",
};
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = new SubtagAlarmConsumer(
source, new[] { targetHi, targetHiHi });
consumer.Subscribe(@"\\HOST\Galaxy!Area");
// Ack the shorter name — must write to the shorter target's subtag only.
int rcHi = consumer.AcknowledgeByName(
alarmName: "Tank01.Level.Hi",
providerName: "Galaxy",
groupName: "Area",
ackComment: "ack hi",
ackOperatorName: "op", ackOperatorNode: "WS01",
ackOperatorDomain: "CORP", ackOperatorFullName: "Operator");
Assert.Equal(0, rcHi);
Assert.NotNull(source.LastWrite);
Assert.Equal(AckCommentHi, source.LastWrite!.Value.Address);
Assert.Equal("ack hi", source.LastWrite.Value.Value);
// Ack the longer name — must write to the longer target's subtag.
int rcHiHi = consumer.AcknowledgeByName(
alarmName: "Tank01.Level.HiHi",
providerName: "Galaxy",
groupName: "Area",
ackComment: "ack hihi",
ackOperatorName: "op", ackOperatorNode: "WS01",
ackOperatorDomain: "CORP", ackOperatorFullName: "Operator");
Assert.Equal(0, rcHiHi);
Assert.NotNull(source.LastWrite);
Assert.Equal(AckCommentHiHi, source.LastWrite!.Value.Address);
Assert.Equal("ack hihi", source.LastWrite.Value.Value);
}
/// <summary>
/// Verifies AcknowledgeByGuid resolves the synthetic GUID (computed from
/// the alarm's full reference) to the correct target and writes the comment
/// to that target's ack-comment subtag.
/// </summary>
[Fact]
public void AcknowledgeByGuid_WritesCommentToAckCommentSubtag()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
// Raise a transition so the state machine sees the alarm, then capture
// the GUID stamped on the emitted event.
MxAlarmTransitionEvent? emitted = null;
consumer.AlarmTransitionEmitted += (_, e) => emitted = e;
source.Raise(ActiveSubtag, true, new DateTime(2026, 6, 13, 10, 0, 0, DateTimeKind.Utc));
Assert.NotNull(emitted);
Guid syntheticGuid = emitted!.Record.AlarmGuid;
Assert.NotEqual(Guid.Empty, syntheticGuid);
int rc = consumer.AcknowledgeByGuid(
alarmGuid: syntheticGuid,
ackComment: "guid ack",
ackOperatorName: "op",
ackOperatorNode: "WS01",
ackOperatorDomain: "CORP",
ackOperatorFullName: "Operator");
Assert.Equal(0, rc);
Assert.NotNull(source.LastWrite);
Assert.Equal(AckCommentSubtag, source.LastWrite!.Value.Address);
Assert.Equal("guid ack", source.LastWrite.Value.Value);
}
/// <summary>
/// Verifies AcknowledgeByGuid returns non-zero and performs no write when
/// the supplied GUID is not known to the consumer.
/// </summary>
[Fact]
public void AcknowledgeByGuid_UnknownGuid_ReturnsNonZero()
{
FakeSource source = new FakeSource();
using SubtagAlarmConsumer consumer = BuildConsumer(source);
consumer.Subscribe(@"\\HOST\Galaxy!TestArea");
int rc = consumer.AcknowledgeByGuid(
alarmGuid: Guid.NewGuid(),
ackComment: "should not write",
ackOperatorName: "op",
ackOperatorNode: "WS01",
ackOperatorDomain: "CORP",
ackOperatorFullName: "Operator");
Assert.NotEqual(0, rc);
Assert.Null(source.LastWrite);
}
private sealed class FakeSource : ISubtagAlarmSource
{
/// <summary>Raised when an advised subtag reports a new value.</summary>
public event EventHandler<SubtagValueChange>? ValueChanged;
/// <summary>Gets the subtag addresses passed to <see cref="Advise"/>.</summary>
public List<string> Advised { get; } = new List<string>();
/// <summary>Gets the most recent (address, value) pair passed to <see cref="Write"/>.</summary>
public (string Address, object? Value)? LastWrite { get; private set; }
/// <summary>Records the advised subtag addresses.</summary>
/// <param name="itemAddresses">The subtag references to advise.</param>
public void Advise(IReadOnlyCollection<string> itemAddresses)
{
Advised.AddRange(itemAddresses);
}
/// <summary>Records the most recent write.</summary>
/// <param name="itemAddress">The subtag reference to write.</param>
/// <param name="value">The value to write.</param>
public void Write(string itemAddress, object? value)
{
LastWrite = (itemAddress, value);
}
/// <summary>Raises a <see cref="SubtagValueChange"/> for the given subtag.</summary>
/// <param name="address">The subtag address whose value changed.</param>
/// <param name="value">The new value.</param>
/// <param name="timestampUtc">The UTC timestamp of the change.</param>
public void Raise(string address, object? value, DateTime timestampUtc)
{
ValueChanged?.Invoke(this, new SubtagValueChange
{
ItemAddress = address,
Value = value,
TimestampUtc = timestampUtc,
});
}
/// <inheritdoc />
public void Dispose()
{
}
}
}
@@ -0,0 +1,250 @@
using System;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
using Xunit;
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
/// <summary>
/// Tests for the subtag-fallback synthesis state machine. The machine is fed
/// normalized subtag value changes (active / acked / priority) and is expected
/// to emit <see cref="MxAlarmTransitionEvent"/> records that mirror the wnwrap
/// consumer's UNACK_ALM / ACK_ALM / UNACK_RTN / ACK_RTN transitions. Neither
/// COM nor an AVEVA install is needed.
/// </summary>
public sealed class SubtagAlarmStateMachineTests
{
    /// <summary>Active subtag address of the default target.</summary>
    private const string ActiveAddress = "Tank01.Level.HiHi.active";

    /// <summary>Acked subtag address of the default target.</summary>
    private const string AckedAddress = "Tank01.Level.HiHi.acked";

    /// <summary>Base UTC timestamp from which the tests derive their event times.</summary>
    private static readonly DateTime Start = new DateTime(2026, 6, 13, 9, 0, 0, DateTimeKind.Utc);

    /// <summary>Builds the default watch-list entry (no priority subtag).</summary>
    private static AlarmSubtagTarget Target()
    {
        return new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
            SourceObjectReference = "Tank01",
            ActiveSubtag = ActiveAddress,
            AckedSubtag = AckedAddress,
            AckCommentSubtag = "Tank01.Level.HiHi.ackmsg",
        };
    }

    /// <summary>Creates a state machine watching only the default target.</summary>
    private static SubtagAlarmStateMachine NewMachine() =>
        new SubtagAlarmStateMachine(new[] { Target() });

    /// <summary>First active=true must emit UNACK_ALM with provider, group and tag name split out of the full reference.</summary>
    [Fact]
    public void ActiveFalseToTrue_EmitsRaise()
    {
        var machine = NewMachine();

        var emitted = machine.Apply(ActiveAddress, true, Start);

        var raise = Assert.Single(emitted);
        Assert.Equal(MxAlarmStateKind.UnackAlm, raise.Record.State);
        Assert.Equal(MxAlarmStateKind.Unspecified, raise.PreviousState);
        Assert.Equal("Tank01.Level.HiHi", raise.Record.TagName);
        Assert.Equal("Galaxy", raise.Record.ProviderName);
        Assert.Equal("Area", raise.Record.Group);
    }

    /// <summary>
    /// Reference parity: a target composed from the object's real Galaxy area must
    /// yield exactly the native alarmmgr (wnwrap) record fields — provider "Galaxy",
    /// group "TestArea", and the object-rooted tag name "TestMachine_001.TestAlarm001".
    /// </summary>
    [Fact]
    public void ActiveFalseToTrue_AlarmMgrShape_EmitsNativeProviderGroupTagName()
    {
        const string inAlarmAddress = "TestMachine_001.TestAlarm001.InAlarm";
        var nativeShaped = new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!TestArea.TestMachine_001.TestAlarm001",
            SourceObjectReference = "TestMachine_001",
            ActiveSubtag = inAlarmAddress,
        };
        var machine = new SubtagAlarmStateMachine(new[] { nativeShaped });

        var emitted = machine.Apply(inAlarmAddress, true, Start);

        var raise = Assert.Single(emitted);
        Assert.Equal("Galaxy", raise.Record.ProviderName);
        Assert.Equal("TestArea", raise.Record.Group);
        Assert.Equal("TestMachine_001.TestAlarm001", raise.Record.TagName);
    }

    /// <summary>A reference without a provider '!' must be used whole as the tag name, with empty provider and group.</summary>
    [Fact]
    public void ActiveFalseToTrue_NoProviderBang_UsesWholeReferenceAsTagName()
    {
        var bare = new AlarmSubtagTarget
        {
            AlarmFullReference = "Tank01.Level.HiHi",
            SourceObjectReference = string.Empty,
            ActiveSubtag = ActiveAddress,
        };
        var machine = new SubtagAlarmStateMachine(new[] { bare });

        var emitted = machine.Apply(ActiveAddress, true, Start);

        var raise = Assert.Single(emitted);
        Assert.Equal("Tank01.Level.HiHi", raise.Record.TagName);
        Assert.Equal(string.Empty, raise.Record.ProviderName);
        Assert.Equal(string.Empty, raise.Record.Group);
    }

    /// <summary>An un-ack that arrives before the clear must not downgrade the clear: it is still ACK_RTN.</summary>
    [Fact]
    public void OutOfOrderAckThenClear_StillEmitsAckRtn()
    {
        var machine = NewMachine();
        machine.Apply(ActiveAddress, true, Start);
        machine.Apply(AckedAddress, true, Start.AddSeconds(2));
        // Out-of-order un-ack arrives before the active=false clear.
        machine.Apply(AckedAddress, false, Start.AddSeconds(3));

        var emitted = machine.Apply(ActiveAddress, false, Start.AddSeconds(10));

        var clear = Assert.Single(emitted);
        Assert.Equal(MxAlarmStateKind.AckRtn, clear.Record.State);
    }

    /// <summary>Two targets bound to the same active subtag address must be rejected at construction.</summary>
    [Fact]
    public void DuplicateActiveSubtag_Throws()
    {
        var tank01 = new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
            SourceObjectReference = "Tank01",
            ActiveSubtag = "Shared.active",
        };
        var tank02 = new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!Area.Tank02.Level.HiHi",
            SourceObjectReference = "Tank02",
            ActiveSubtag = "Shared.active",
        };
        var watchList = new[] { tank01, tank02 };

        Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(watchList));
    }

    /// <summary>
    /// Worker-028 regression: two watch-list entries sharing an
    /// <see cref="AlarmSubtagTarget.AlarmFullReference"/> (with distinct subtag
    /// addresses) must throw at construction, symmetric with the
    /// duplicate-address guard, instead of silently overwriting the earlier
    /// reference's state and orphaning its bound addresses.
    /// </summary>
    [Fact]
    public void DuplicateAlarmFullReference_Throws()
    {
        var original = new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
            SourceObjectReference = "Tank01",
            ActiveSubtag = ActiveAddress,
        };
        var duplicate = new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
            SourceObjectReference = "Tank01",
            ActiveSubtag = "Other.active",
        };
        var watchList = new[] { original, duplicate };

        Assert.Throws<ArgumentException>(() => new SubtagAlarmStateMachine(watchList));
    }

    /// <summary>acked=true while the alarm is active must emit ACK_ALM with UNACK_ALM as the previous state.</summary>
    [Fact]
    public void AckedTrueWhileActive_EmitsAck()
    {
        var machine = NewMachine();
        machine.Apply(ActiveAddress, true, Start);

        var emitted = machine.Apply(AckedAddress, true, Start.AddSeconds(5));

        var ack = Assert.Single(emitted);
        Assert.Equal(MxAlarmStateKind.AckAlm, ack.Record.State);
        Assert.Equal(MxAlarmStateKind.UnackAlm, ack.PreviousState);
    }

    /// <summary>A clear with no acknowledgment in the episode must emit UNACK_RTN.</summary>
    [Fact]
    public void ActiveTrueToFalse_WhileUnacked_EmitsUnackRtn()
    {
        var machine = NewMachine();
        machine.Apply(ActiveAddress, true, Start);

        var emitted = machine.Apply(ActiveAddress, false, Start.AddSeconds(10));

        var clear = Assert.Single(emitted);
        Assert.Equal(MxAlarmStateKind.UnackRtn, clear.Record.State);
    }

    /// <summary>A clear after an acknowledgment in the episode must emit ACK_RTN.</summary>
    [Fact]
    public void ActiveTrueToFalse_WhileAcked_EmitsAckRtn()
    {
        var machine = NewMachine();
        machine.Apply(ActiveAddress, true, Start);
        machine.Apply(AckedAddress, true, Start.AddSeconds(2));

        var emitted = machine.Apply(ActiveAddress, false, Start.AddSeconds(10));

        var clear = Assert.Single(emitted);
        Assert.Equal(MxAlarmStateKind.AckRtn, clear.Record.State);
    }

    /// <summary>The active snapshot must report ACK_ALM once the alarm is both active and acked.</summary>
    [Fact]
    public void Snapshot_ReflectsActiveAndAckedState()
    {
        var machine = NewMachine();
        machine.Apply(ActiveAddress, true, Start);
        machine.Apply(AckedAddress, true, Start);

        var entry = Assert.Single(machine.SnapshotActive());

        Assert.Equal(MxAlarmStateKind.AckAlm, entry.State);
    }

    /// <summary>A value change for an address no target is bound to must emit nothing.</summary>
    [Fact]
    public void UnknownAddress_NoEvents()
    {
        var machine = NewMachine();

        var emitted = machine.Apply("Some.Other.Tag.active", true, DateTime.UtcNow);

        Assert.Empty(emitted);
    }

    /// <summary>
    /// Worker.Tests-033 regression: an ack arriving while the alarm is NOT
    /// active must emit nothing and must NOT latch
    /// <c>AckedDuringEpisode</c> — otherwise a stale ack from a prior episode
    /// would mis-latch the next raise into a spurious ACK_RTN on clear. The
    /// following raise/clear must therefore still emit UNACK_RTN.
    /// </summary>
    [Fact]
    public void AckedTrueWhileInactive_EmitsNothingAndDoesNotLatch()
    {
        var machine = NewMachine();

        // Ack with no preceding active raise: must be a no-op.
        Assert.Empty(machine.Apply(AckedAddress, true, Start));

        // Fresh episode: raise then clear. The earlier ack must not have latched,
        // so the clear has to be UNACK_RTN.
        machine.Apply(ActiveAddress, true, Start.AddSeconds(5));
        var afterClear = machine.Apply(ActiveAddress, false, Start.AddSeconds(10));

        var clear = Assert.Single(afterClear);
        Assert.Equal(MxAlarmStateKind.UnackRtn, clear.Record.State);
    }

    /// <summary>
    /// Worker.Tests-033 regression: a priority-subtag value change must flow
    /// through <c>CoerceInt</c> into the emitted record's
    /// <see cref="MxAlarmSnapshotRecord.Priority"/>. A non-numeric value must
    /// leave the prior priority unchanged (the CoerceInt fallback path).
    /// </summary>
    [Fact]
    public void PriorityChange_FlowsIntoEmittedRecord()
    {
        const string priorityAddress = "Tank01.Level.HiHi.priority";
        var withPriority = new AlarmSubtagTarget
        {
            AlarmFullReference = "Galaxy!Area.Tank01.Level.HiHi",
            SourceObjectReference = "Tank01",
            ActiveSubtag = ActiveAddress,
            AckedSubtag = AckedAddress,
            PrioritySubtag = priorityAddress,
        };
        var machine = new SubtagAlarmStateMachine(new[] { withPriority });

        // A priority change on its own emits nothing but is remembered.
        Assert.Empty(machine.Apply(priorityAddress, 750, Start));

        // The raise carries the remembered priority.
        var raise = Assert.Single(machine.Apply(ActiveAddress, true, Start.AddSeconds(1)));
        Assert.Equal(750, raise.Record.Priority);

        // A non-numeric priority must keep the existing value rather than reset to zero.
        machine.Apply(priorityAddress, "not-a-number", Start.AddSeconds(2));
        var entry = Assert.Single(machine.SnapshotActive());
        Assert.Equal(750, entry.Priority);
    }
}
@@ -0,0 +1,66 @@
using System;
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
namespace ZB.MOM.WW.MxGateway.Worker.Tests.MxAccess;
/// <summary>
/// Tests for <see cref="SyntheticAlarmGuid"/>. The subtag-fallback path derives
/// a deterministic GUID from the alarm reference, so equal references must map
/// to equal GUIDs and different references to different GUIDs.
/// </summary>
public sealed class SyntheticAlarmGuidTests
{
    /// <summary>AppContext switch toggled by the FIPS regression test.</summary>
    private const string FipsThrowSwitch = "Switch.System.Security.Cryptography.UseLegacyFipsThrow";

    /// <summary>Two derivations from the same reference must be equal.</summary>
    [Fact]
    public void SameReference_SameGuid()
    {
        Guid first = SyntheticAlarmGuid.ForReference("A.B.C");
        Guid second = SyntheticAlarmGuid.ForReference("A.B.C");

        Assert.Equal(first, second);
    }

    /// <summary>Derivations from two different references must differ.</summary>
    [Fact]
    public void DifferentReference_DifferentGuid()
    {
        Guid abc = SyntheticAlarmGuid.ForReference("A.B.C");
        Guid abd = SyntheticAlarmGuid.ForReference("A.B.D");

        Assert.NotEqual(abc, abd);
    }

    /// <summary>An ordinary reference must not derive <see cref="Guid.Empty"/>.</summary>
    [Fact]
    public void Reference_ProducesNonEmptyGuid()
    {
        Guid derived = SyntheticAlarmGuid.ForReference("A.B.C");

        Assert.NotEqual(Guid.Empty, derived);
    }

    /// <summary>
    /// The empty string must still derive a non-empty GUID. The length fold in
    /// the derivation prevents a degenerate all-zero (Guid.Empty) result, which
    /// would collide with the unset-record default downstream.
    /// </summary>
    [Fact]
    public void EmptyReference_ProducesNonEmptyGuid()
    {
        Guid derived = SyntheticAlarmGuid.ForReference(string.Empty);

        Assert.NotEqual(Guid.Empty, derived);
    }

    /// <summary>
    /// Worker-027 regression: <see cref="SyntheticAlarmGuid.ForReference"/>
    /// must derive its GUID without routing through
    /// <see cref="System.Security.Cryptography"/>, because on net48
    /// <c>MD5.Create()</c> throws under the Windows FIPS-compliance policy.
    /// The test turns on the per-AppContext FIPS-enforcement switch (which the
    /// managed crypto factories honour) and asserts the derivation still
    /// succeeds deterministically — a regression that reintroduced a FIPS-gated
    /// provider would throw here instead of returning a stable GUID.
    /// </summary>
    [Fact]
    public void ForReference_UnderFipsEnforcement_DoesNotThrowAndStaysDeterministic()
    {
        // Remember the prior switch value (treated as false when unset) for the restore below.
        bool previous = false;
        if (AppContext.TryGetSwitch(FipsThrowSwitch, out bool current))
        {
            previous = current;
        }

        AppContext.SetSwitch(FipsThrowSwitch, true);
        try
        {
            const string reference = "Galaxy!Area.Tank01.Level.HiHi";
            Guid first = SyntheticAlarmGuid.ForReference(reference);
            Guid second = SyntheticAlarmGuid.ForReference(reference);

            Assert.NotEqual(Guid.Empty, first);
            Assert.Equal(first, second);
        }
        finally
        {
            // The switch is process-wide state; always put it back.
            AppContext.SetSwitch(FipsThrowSwitch, previous);
        }
    }
}
@@ -0,0 +1,574 @@
// AlarmSubtagLiveSmokeTests.cs
//
// Validates the subtag-fallback pipeline against a real Galaxy + MXAccess install:
// LmxSubtagAlarmSource (own LMXProxyServerClass) ->
// SubtagAlarmConsumer (state machine + AcknowledgeByName write) ->
// synthesized MxAlarmTransitionEvent (Raise / Clear, Degraded=true, SyntheticGuid)
//
// FIELD NAMES CONFIRMED: InAlarm/Acked/AckMsg/Priority are the confirmed AVEVA
// AlarmExtension primitive field names, verified by querying the live ZB Galaxy
// attribute_definition rows. The remaining open item for live validation is
// confirming the runtime item reference path — i.e. that
// "<Object>.<AlarmAttr>.InAlarm" is the correct MXAccess path with no
// intermediate alarm-condition segment.
//
// HOW TO RUN:
// 1. On the dev rig with AVEVA System Platform installed and Galaxy running:
// $env:MXGATEWAY_RUN_LIVE_MXACCESS_TESTS = "1"
// 2. Remove (or set to null) the Skip parameter on the [Fact] below.
// 3. Drive a TestMachine alarm so its Active/Acked subtags toggle — either an
// alarm flip script (same one used by AlarmsLiveSmokeTests, ~10 s cadence)
// or a manual operator/IDE toggle of the alarm attribute. The rig's
// TestAlarm attributes are object-driven, so an external MXAccess Write
// cannot toggle them (confirmed live 2026-06-14 by toggling TestAlarm002
// from the IDE).
//
// net48/x86 constraints:
// - No init-only properties, records, index/range operators, C# 8+ pattern
// matches beyond what the existing Worker.Tests files use.
// - All Booleans from MXAccess arrive as boxed int (0 / non-zero); coerce
// with Convert.ToBoolean or cast to int first.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Threading;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.MxGateway.Worker.MxAccess;
using ZB.MOM.WW.MxGateway.Worker.Tests.TestSupport;
using Xunit.Abstractions;
namespace ZB.MOM.WW.MxGateway.Worker.Tests.Probes;
/// <summary>
/// Live dev-rig smoke test for the subtag-fallback alarm pipeline.
/// Confirms that <see cref="LmxSubtagAlarmSource"/> wired to a real
/// <c>LMXProxyServerClass</c> + <see cref="SubtagAlarmConsumer"/> can
/// synthesize <see cref="MxAlarmTransitionEvent"/> records from Galaxy
/// alarm subtags using the confirmed AVEVA <c>AlarmExtension</c> field
/// names (InAlarm/Acked/AckMsg/Priority), and that
/// <see cref="SubtagAlarmConsumer.AcknowledgeByName"/> writes the
/// ack-comment subtag (AckMsg) successfully.
///
/// Skip-gated; flip <c>Skip=null</c> on the dev rig with an alarm being
/// driven (flip script or a manual operator/IDE toggle of the alarm
/// attribute). The remaining live-validation item is confirming that
/// the runtime MXAccess item reference path requires no intermediate
/// alarm-condition segment (i.e. <c>&lt;Object&gt;.&lt;AlarmAttr&gt;.InAlarm</c>
/// resolves as-is).
/// </summary>
public sealed class AlarmSubtagLiveSmokeTests
{
// -------------------------------------------------------------------------
// Subtag addresses for TestMachine_001.TestAlarm001.
//
// Field names (InAlarm/Acked/AckMsg/Priority) are CONFIRMED against the live
// ZB Galaxy AlarmExtension primitive attribute_definition rows. The subtag
// address format is <ObjectTagName>.<AlarmAttributeName>. The remaining
// live-validation item is confirming that the MXAccess runtime item
// reference resolves without an intermediate alarm-condition segment
// (i.e. "<Object>.<AlarmAttr>.InAlarm" resolves as-is).
// -------------------------------------------------------------------------
/// <summary>The Galaxy provider expression used by the existing live smoke tests.</summary>
/// <remarks>Built at type initialization from the local machine name: <c>\\&lt;machine&gt;\Galaxy</c>.</remarks>
private static readonly string Provider =
string.Format(@"\\{0}\Galaxy", Environment.MachineName);
/// <summary>The Galaxy group (provider sub-path) containing the test alarm.</summary>
private const string Group = "DEV";
/// <summary>The test alarm's tag name as the dispatcher composes it (Group.TagName).</summary>
private const string AlarmTagName = "TestMachine_001.TestAlarm001";
// NOTE(review): the four constants below keep a "Placeholder" prefix although
// their comments describe the field names as confirmed; the prefix appears to
// be historical.
// Confirmed AVEVA AlarmExtension field name (in-alarm boolean).
private const string PlaceholderActiveSubtag = "TestMachine_001.TestAlarm001.InAlarm";
// Confirmed AVEVA AlarmExtension field name (acknowledged boolean).
private const string PlaceholderAckedSubtag = "TestMachine_001.TestAlarm001.Acked";
// Confirmed AVEVA AlarmExtension field name (ack-comment write target).
// Writing this subtag performs the acknowledge in AVEVA.
private const string PlaceholderAckCommentSubtag = "TestMachine_001.TestAlarm001.AckMsg";
// Confirmed AVEVA AlarmExtension field name (alarm priority/severity).
private const string PlaceholderPrioritySubtag = "TestMachine_001.TestAlarm001.Priority";
// -------------------------------------------------------------------------
// 30-second wait budgets. Their consumers are not in the part of the file
// reviewed here — presumably the smoke run's waits for the raise and clear
// transitions; confirm against RunSmoke.
private static readonly TimeSpan RaiseWaitTimeout = TimeSpan.FromSeconds(30);
private static readonly TimeSpan ClearWaitTimeout = TimeSpan.FromSeconds(30);
// Composed from Provider, Group and AlarmTagName. Keep this declaration after
// Provider: static field initializers run in textual order, so moving it above
// Provider would compose the reference from a still-null provider.
private static readonly string AlarmFullReference =
AlarmRecordTransitionMapper.ComposeFullReference(Provider, Group, AlarmTagName);
// xUnit output sink, assigned in the constructor.
private readonly ITestOutputHelper output;
// Started when the test instance is constructed; its readers are outside the
// part of the file reviewed here.
private readonly Stopwatch elapsed = Stopwatch.StartNew();
// Thread-safe buffer of log lines. The test methods drain it into `output`
// after the STA worker thread has finished.
private readonly ConcurrentQueue<string> log =
new ConcurrentQueue<string>();
/// <summary>Creates the test class and stores the xUnit output helper used to publish the captured log.</summary>
/// <param name="output">Test output helper that receives the log lines.</param>
public AlarmSubtagLiveSmokeTests(ITestOutputHelper output) => this.output = output;
/// <summary>
/// Verifies the subtag-fallback pipeline: advises alarm subtags through a
/// real <c>LMXProxyServerClass</c>, collects a Raise then a Clear
/// transition synthesized by <see cref="SubtagAlarmConsumer"/>, confirms
/// the Degraded flag and synthetic GUID are stamped, then
/// AcknowledgeByName and verifies the ack-comment write returns 0.
/// </summary>
/// <remarks>
/// The smoke body runs on a dedicated STA thread. Any exception it throws is
/// captured and rethrown on the test thread with its original stack trace,
/// after the buffered log has been written to the test output.
/// </remarks>
[Fact(Skip = "Live dev-rig smoke test — flip Skip=null with AVEVA + an alarm being driven (flip script or manual operator/IDE toggle of the alarm attribute). Subtag fallback path. Field names confirmed (InAlarm/Acked/AckMsg/Priority); live-validate runtime path resolves without intermediate alarm-condition segment.")]
public void SubtagFallback_FullPipelineRoundTrip_SynthesizesRaiseAndAcknowledges()
{
    Exception? threadException = null;
    Thread thread = new Thread(() =>
    {
        try { RunSmoke(); }
        catch (Exception ex) { threadException = ex; }
    });
    thread.IsBackground = false;
    thread.SetApartmentState(ApartmentState.STA);
    thread.Start();

    // Join blocks until the worker has terminated, so no separate completion
    // event (and no undisposed ManualResetEventSlim) is needed.
    thread.Join();

    // Publish the buffered log before surfacing any failure so the evidence
    // is visible even when the smoke body threw.
    output.WriteLine(string.Format("Captured {0} log line(s):", log.Count));
    string? line;
    while (log.TryDequeue(out line))
    {
        output.WriteLine(line);
    }

    if (threadException != null)
    {
        // A plain `throw threadException;` would replace the stack trace with
        // this frame; ExceptionDispatchInfo keeps the trace from the STA thread.
        System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(threadException).Throw();
    }
}
/// <summary>
/// Runtime-path resolution probe. Unlike the full lifecycle test this
/// does NOT require an active alarm: it advises the four subtags and
/// observes the initial values MXAccess delivers on advise, then writes
/// the AckMsg subtag. It confirms the runtime item-reference path
/// (<c>&lt;Object&gt;.&lt;AlarmAttr&gt;.&lt;field&gt;</c>) resolves with
/// no intermediate alarm-condition segment — the last open item from the
/// design. Runs only when <c>MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1</c>.
/// Reports "inconclusive" (without failing) when no values are delivered,
/// e.g. the Galaxy engine for the test object is not running.
/// </summary>
/// <remarks>
/// The probe body runs on a dedicated STA thread. Its log is written to the
/// test output and, best-effort, to a file (path from the <c>PROBE_LOG</c>
/// environment variable, otherwise <c>alarm-subtag-probe.log</c> in the temp
/// directory). Any exception from the probe is then rethrown on the test
/// thread with its original stack trace.
/// </remarks>
[LiveMxAccessFact]
public void LiveProbe_AlarmSubtagsResolve_AndAckMsgWriteSucceeds()
{
    Exception? threadException = null;
    Thread thread = new Thread(() =>
    {
        try { RunProbe(); }
        catch (Exception ex) { threadException = ex; }
    });
    thread.IsBackground = false;
    thread.SetApartmentState(ApartmentState.STA);
    thread.Start();

    // Join blocks until the worker has terminated, so no separate completion
    // event (and no undisposed ManualResetEventSlim) is needed.
    thread.Join();

    List<string> lines = new List<string>();
    output.WriteLine(string.Format("Captured {0} log line(s):", log.Count));
    string? line;
    while (log.TryDequeue(out line))
    {
        output.WriteLine(line);
        lines.Add(line);
    }

    // Also persist to a file so the evidence survives shell-logger quirks
    // when run over SSH. Path overridable via PROBE_LOG. The write is
    // deliberately best-effort: a failure is reported, not thrown.
    string logPath = Environment.GetEnvironmentVariable("PROBE_LOG")
        ?? System.IO.Path.Combine(System.IO.Path.GetTempPath(), "alarm-subtag-probe.log");
    try { System.IO.File.WriteAllLines(logPath, lines); }
    catch (Exception writeEx) { output.WriteLine("probe-log write failed: " + writeEx.Message); }

    if (threadException != null)
    {
        // A plain `throw threadException;` would replace the stack trace with
        // this frame; ExceptionDispatchInfo keeps the trace from the STA thread.
        System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(threadException).Throw();
    }
}
/// <summary>
/// Body of the runtime-path resolution probe; the calling test runs it on an
/// STA thread. Builds a one-entry watch list for the test alarm, subscribes a
/// <see cref="SubtagAlarmConsumer"/> over an <see cref="LmxSubtagAlarmSource"/>,
/// pumps messages for about six seconds while recording every raw value change,
/// issues an AcknowledgeByName (the AckMsg write), pumps for two more seconds,
/// and then either logs INCONCLUSIVE (no values at all) or asserts that the
/// InAlarm subtag delivered a value and the acknowledge returned 0.
/// </summary>
private void RunProbe()
{
    Log("=== Subtag runtime-path resolution probe ===");
    Log("AlarmFullReference: " + AlarmFullReference);
    Log("Subtag addresses under test:");
    Log(" active = " + PlaceholderActiveSubtag);
    Log(" acked = " + PlaceholderAckedSubtag);
    Log(" ackMsg = " + PlaceholderAckCommentSubtag);
    Log(" priority= " + PlaceholderPrioritySubtag);

    AlarmSubtagTarget target = new AlarmSubtagTarget
    {
        AlarmFullReference = AlarmFullReference,
        SourceObjectReference = AlarmTagName,
        ActiveSubtag = PlaceholderActiveSubtag,
        AckedSubtag = PlaceholderAckedSubtag,
        AckCommentSubtag = PlaceholderAckCommentSubtag,
        PrioritySubtag = PlaceholderPrioritySubtag,
    };
    List<AlarmSubtagTarget> watchList = new List<AlarmSubtagTarget> { target };
    MxAccessComObjectFactory factory = new MxAccessComObjectFactory();
    LmxSubtagAlarmSource source = new LmxSubtagAlarmSource(factory, clientName: null);
    using SubtagAlarmConsumer consumer = new SubtagAlarmConsumer(source, watchList);

    // Hook the RAW source so every advised subtag's value-change is seen,
    // including the initial value MXAccess delivers on advise (the state
    // machine itself only emits on active/acked transitions).
    Dictionary<string, string> lastValues =
        new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    source.ValueChanged += (_, c) =>
    {
        string v = c.Value == null
            ? "<null>"
            : string.Format("{0} ({1})", c.Value, c.Value.GetType().Name);
        lastValues[c.ItemAddress] = v;
        Log(string.Format("ValueChanged: {0} = {1}", c.ItemAddress, v));
    };

    Log("Subscribe (advise InAlarm/Acked/Priority) ...");
    consumer.Subscribe(string.Format(@"\\{0}\Galaxy!{1}", Environment.MachineName, Group));
    Log("Subscribe returned OK.");

    // Pump the STA for ~6s to receive each advised subtag's initial value.
    PumpFor(TimeSpan.FromSeconds(6));
    Log(string.Format("Distinct subtags that delivered a value: {0}", lastValues.Count));
    bool inAlarmResolved = lastValues.ContainsKey(PlaceholderActiveSubtag);
    Log("InAlarm subtag delivered a value: " + inAlarmResolved);

    // Decisive write: AckMsg. Returns 0 only if the address resolves and is writable.
    Log("AcknowledgeByName (writes AckMsg subtag) ...");
    int rc = consumer.AcknowledgeByName(
        alarmName: AlarmTagName,
        providerName: Provider,
        groupName: Group,
        ackComment: "subtag-resolution-probe",
        ackOperatorName: Environment.UserName,
        ackOperatorNode: Environment.MachineName,
        ackOperatorDomain: Environment.UserDomainName ?? string.Empty,
        ackOperatorFullName: Environment.UserName);
    Log(string.Format("AcknowledgeByName(AckMsg write) -> rc={0}", rc));

    // Give late notifications a chance to arrive before judging the result.
    PumpFor(TimeSpan.FromSeconds(2));

    if (lastValues.Count == 0)
    {
        Log("INCONCLUSIVE: no subtag values delivered within the window. The Galaxy "
            + "engine hosting TestMachine_001 may be stopped, or the runtime path needs "
            + "an intermediate segment. Re-run with the engine running. (Field names "
            + "remain confirmed from the ZB AlarmExtension model.)");
        return;
    }

    // Re-sample after the settle pump. The gate above reads lastValues as it is
    // now, so the assertion must too: a value that first arrives during the
    // settle window would otherwise pass the gate yet fail on the stale flag.
    inAlarmResolved = lastValues.ContainsKey(PlaceholderActiveSubtag);

    // Live data arrived — assert the in-alarm subtag resolved and the ack write succeeded.
    Assert.True(inAlarmResolved,
        "InAlarm subtag delivered no value while other subtags did — the runtime path for InAlarm may differ.");
    Assert.Equal(0, rc);
    Log("CONFIRMED: runtime subtag path resolves and the AckMsg write succeeded.");

    // Pumps messages on the current thread in 100 ms steps until the duration elapses.
    void PumpFor(TimeSpan duration)
    {
        DateTime deadline = DateTime.UtcNow + duration;
        while (DateTime.UtcNow < deadline)
        {
            PumpMessages();
            Thread.Sleep(100);
        }
    }
}
/// <summary>
/// End-to-end smoke run of the subtag alarm path against the real COM-backed
/// source: subscribe, wait for a Raise, verify the active-alarm snapshot,
/// acknowledge by name, then observe any follow-up transition.
/// </summary>
/// <remarks>
/// A missing post-ack transition is logged rather than failed; the Raise,
/// snapshot and acknowledge return-code checks are hard assertions.
/// </remarks>
private void RunSmoke()
{
    Log("AlarmFullReference: " + AlarmFullReference);
    Log("VERIFY: PlaceholderActiveSubtag = " + PlaceholderActiveSubtag);
    Log("VERIFY: PlaceholderAckedSubtag = " + PlaceholderAckedSubtag);
    Log("VERIFY: PlaceholderAckCommentSubtag = " + PlaceholderAckCommentSubtag);
    Log("VERIFY: PlaceholderPrioritySubtag = " + PlaceholderPrioritySubtag);
    Log(string.Format(
        "RaiseWaitTimeout={0}s ClearWaitTimeout={1}s",
        RaiseWaitTimeout.TotalSeconds,
        ClearWaitTimeout.TotalSeconds));

    // Single-entry watch list built from the configured subtag names.
    List<AlarmSubtagTarget> targets = new List<AlarmSubtagTarget>
    {
        new AlarmSubtagTarget
        {
            AlarmFullReference = AlarmFullReference,
            SourceObjectReference = AlarmTagName,
            ActiveSubtag = PlaceholderActiveSubtag,
            AckedSubtag = PlaceholderAckedSubtag,
            AckCommentSubtag = PlaceholderAckCommentSubtag,
            PrioritySubtag = PlaceholderPrioritySubtag,
        },
    };

    // Real source over the production COM factory; no test double is used here.
    MxAccessComObjectFactory comObjectFactory = new MxAccessComObjectFactory();
    LmxSubtagAlarmSource subtagSource = new LmxSubtagAlarmSource(comObjectFactory, clientName: null);

    // The consumer wraps the source and drives the alarm state machine.
    using SubtagAlarmConsumer consumer = new SubtagAlarmConsumer(subtagSource, targets);

    // Every emitted transition is logged and then queued for the wait helpers.
    ConcurrentQueue<MxAlarmTransitionEvent> emitted = new ConcurrentQueue<MxAlarmTransitionEvent>();
    consumer.AlarmTransitionEmitted += (_, transition) =>
    {
        Log("Transition emitted: " + DescribeTransition(transition));
        emitted.Enqueue(transition);
    };

    // The expression is passed through for diagnostics only.
    string expression = string.Format(@"\\{0}\Galaxy!{1}", Environment.MachineName, Group);
    Log("Calling Subscribe(" + expression + ") ...");
    consumer.Subscribe(expression);
    Log("Subscribe returned OK.");

    // Step 1: wait for a Raise. The wait helper pumps messages while polling,
    // because callbacks are only delivered while this thread is pumping.
    Log(string.Format("Waiting up to {0}s for a Raise transition ...", RaiseWaitTimeout.TotalSeconds));
    MxAlarmTransitionEvent? raiseEvent = WaitForTransitionKind(
        emitted, AlarmTransitionKind.Raise, RaiseWaitTimeout, "Raise");
    Assert.NotNull(raiseEvent);
    MxAlarmTransitionEvent raised = raiseEvent!;
    Assert.True(raised.Record.Degraded,
        "Subtag-synthesized records must have Degraded=true.");
    // The transition must carry a non-empty GUID and land in UnackAlm.
    Assert.NotEqual(Guid.Empty, raised.Record.AlarmGuid);
    Assert.Equal(MxAlarmStateKind.UnackAlm, raised.Record.State);
    Log(string.Format(
        "Raise confirmed: AlarmGuid={0} Degraded={1} State={2}",
        raised.Record.AlarmGuid,
        raised.Record.Degraded,
        raised.Record.State));

    // Step 2: the active-alarm snapshot must list the alarm just raised.
    IReadOnlyList<MxAlarmSnapshotRecord> active = consumer.SnapshotActiveAlarms();
    Log(string.Format("SnapshotActiveAlarms count={0}", active.Count));
    for (int i = 0; i < active.Count; i++)
    {
        MxAlarmSnapshotRecord entry = active[i];
        Log(string.Format(
            " snapshot: TagName='{0}' Group='{1}' State={2} Degraded={3} Guid={4}",
            entry.TagName, entry.Group, entry.State, entry.Degraded, entry.AlarmGuid));
    }

    bool foundInSnapshot = false;
    for (int i = 0; i < active.Count && !foundInSnapshot; i++)
    {
        MxAlarmSnapshotRecord entry = active[i];
        var composed = AlarmRecordTransitionMapper.ComposeFullReference(
            entry.ProviderName, entry.Group, entry.TagName);
        foundInSnapshot = string.Equals(composed, AlarmFullReference, StringComparison.OrdinalIgnoreCase);
    }

    Assert.True(foundInSnapshot,
        string.Format("Active alarm snapshot must contain '{0}' after a Raise.", AlarmFullReference));

    // Step 3: acknowledge by name; a return code of 0 is required.
    Log("Calling AcknowledgeByName ...");
    int ackResult = consumer.AcknowledgeByName(
        alarmName: AlarmTagName,
        providerName: Provider,
        groupName: Group,
        ackComment: "subtag-fallback-smoke ack",
        ackOperatorName: Environment.UserName,
        ackOperatorNode: Environment.MachineName,
        ackOperatorDomain: Environment.UserDomainName ?? string.Empty,
        ackOperatorFullName: Environment.UserName);
    Log(string.Format("AcknowledgeByName -> rc={0}", ackResult));
    Assert.Equal(0, ackResult);

    // Step 4: look for any further transition. None within the window is
    // tolerated (the alarm may simply not have changed state yet); when one
    // does arrive it must map to a specified transition kind.
    Log(string.Format(
        "Waiting up to {0}s for a Clear or Acknowledge transition ...",
        ClearWaitTimeout.TotalSeconds));
    MxAlarmTransitionEvent? followUp = WaitForAnyTransition(emitted, ClearWaitTimeout, "post-ack");
    if (followUp == null)
    {
        Log("No post-ack transition within timeout — ack-comment write succeeded; state machine still live.");
    }
    else
    {
        AlarmTransitionKind followUpKind = AlarmRecordTransitionMapper.MapTransition(
            followUp.PreviousState, followUp.Record.State);
        Log("Post-ack transition: " + DescribeTransition(followUp));
        Assert.NotEqual(AlarmTransitionKind.Unspecified, followUpKind);
    }

    Log("Smoke test complete.");
}
/// <summary>
/// Pumps messages and polls <paramref name="queue"/> until a transition of
/// the requested <paramref name="kind"/> is dequeued or
/// <paramref name="timeout"/> elapses.
/// </summary>
/// <param name="queue">Queue the transition handler enqueues events onto.</param>
/// <param name="kind">The transition kind to wait for.</param>
/// <param name="timeout">Maximum time to wait.</param>
/// <param name="label">Human-readable name of the awaited transition, used in log lines.</param>
/// <returns>The first matching event, or <see langword="null"/> on timeout.</returns>
/// <remarks>
/// Non-matching events are dequeued, logged and discarded. Each pass drains
/// every queued event before sleeping, so a matching event that sits behind
/// skipped ones is not delayed by one sleep interval per skipped event. One
/// last pump-and-drain pass runs once the deadline is reached, so events
/// delivered during the final sleep are still examined.
/// </remarks>
private MxAlarmTransitionEvent? WaitForTransitionKind(
    ConcurrentQueue<MxAlarmTransitionEvent> queue,
    AlarmTransitionKind kind,
    TimeSpan timeout,
    string label)
{
    DateTime deadline = DateTime.UtcNow + timeout;
    while (true)
    {
        // Pump the Windows message queue so COM OnDataChange callbacks can be
        // delivered; a bare Thread.Sleep would stall the pump.
        PumpMessages();

        while (queue.TryDequeue(out MxAlarmTransitionEvent? evt))
        {
            if (evt == null)
            {
                continue;
            }

            AlarmTransitionKind evtKind = AlarmRecordTransitionMapper.MapTransition(
                evt.PreviousState, evt.Record.State);
            if (evtKind == kind)
            {
                return evt;
            }

            // Skipped events are intentionally dropped, not re-enqueued.
            Log(string.Format("Skipped transition kind={0} while waiting for {1}.", evtKind, label));
        }

        if (DateTime.UtcNow >= deadline)
        {
            break;
        }

        Thread.Sleep(250);
    }

    Log(string.Format("Timed out waiting for {0} transition after {1}s.", label, timeout.TotalSeconds));
    return null;
}
/// <summary>
/// Pumps messages and polls <paramref name="queue"/> every 250 ms until any
/// non-null transition event is dequeued or <paramref name="timeout"/> elapses.
/// </summary>
/// <param name="queue">Queue the transition handler enqueues events onto.</param>
/// <param name="timeout">Maximum time to wait.</param>
/// <param name="label">Tag included in the timeout log line.</param>
/// <returns>The first dequeued event, or <see langword="null"/> on timeout.</returns>
private MxAlarmTransitionEvent? WaitForAnyTransition(
    ConcurrentQueue<MxAlarmTransitionEvent> queue,
    TimeSpan timeout,
    string label)
{
    // The sleep sits in the iterator slot: it runs after each unsuccessful
    // pass and before the deadline is re-checked.
    for (DateTime giveUpAt = DateTime.UtcNow + timeout; DateTime.UtcNow < giveUpAt; Thread.Sleep(250))
    {
        PumpMessages();
        if (queue.TryDequeue(out MxAlarmTransitionEvent? next) && next != null)
        {
            return next;
        }
    }

    Log(string.Format("Timed out waiting for any transition ({0}) after {1}s.", label, timeout.TotalSeconds));
    return null;
}
/// <summary>
/// Drains the calling thread's Windows message queue once: each message that
/// <c>PeekMessage</c> removes is translated and dispatched, and the method
/// returns as soon as no message is pending. This lets COM
/// <c>OnDataChange</c> callbacks from <c>LMXProxyServerClass</c>
/// (ThreadingModel=Apartment) be delivered on this thread. The pump pattern
/// is documented in <c>docs/MxAccessWorkerInstanceDesign.md</c>.
/// </summary>
private static void PumpMessages()
{
    while (true)
    {
        // PeekMessage does not wait; a false result means the queue is empty.
        if (!NativeMethods.PeekMessage(out NativeMethods.MSG pending, IntPtr.Zero, 0, 0, NativeMethods.PM_REMOVE))
        {
            return;
        }

        NativeMethods.TranslateMessage(ref pending);
        NativeMethods.DispatchMessage(ref pending);
    }
}
/// <summary>
/// Formats a transition event as a single diagnostic line containing the
/// mapped transition kind, the previous and current states, and the record's
/// tag name, group, provider, degraded flag and alarm GUID.
/// </summary>
/// <param name="e">The transition event to describe.</param>
/// <returns>The formatted description.</returns>
private static string DescribeTransition(MxAlarmTransitionEvent e)
{
    var record = e.Record;
    AlarmTransitionKind mappedKind =
        AlarmRecordTransitionMapper.MapTransition(e.PreviousState, record.State);

    // Argument order matches the numbered placeholders in the template.
    object?[] parts =
    {
        mappedKind,
        e.PreviousState,
        record.State,
        record.TagName,
        record.Group,
        record.ProviderName,
        record.Degraded,
        record.AlarmGuid,
    };

    return string.Format(
        "Kind={0} PreviousState={1} State={2} TagName='{3}' Group='{4}' Provider='{5}' Degraded={6} Guid={7}",
        parts);
}
/// <summary>
/// Appends <paramref name="line"/> to the in-memory log queue, prefixed with
/// the elapsed time in seconds (three decimal places).
/// </summary>
/// <param name="line">The message text.</param>
private void Log(string line)
{
    double secondsSinceStart = elapsed.Elapsed.TotalSeconds;
    string stamped = string.Format("[t={0:F3}s] {1}", secondsSinceStart, line);
    log.Enqueue(stamped);
}
// -------------------------------------------------------------------------
// Minimal P/Invoke shim so the STA pump can be driven without pulling in
// the full StaRuntime machinery from the Worker project. The signatures
// mirror those in MxAccessStaRuntime and are well-known Win32.
// -------------------------------------------------------------------------
/// <summary>
/// user32.dll declarations used by <c>PumpMessages</c>: peek, translate and
/// dispatch of window messages on the calling thread.
/// </summary>
private static class NativeMethods
{
// PeekMessage flag: remove the retrieved message from the queue.
internal const uint PM_REMOVE = 0x0001;
// Managed mirror of the Win32 MSG structure. Field order and types define
// the marshalled layout (LayoutKind.Sequential), so they must not be reordered.
[StructLayout(LayoutKind.Sequential)]
internal struct MSG
{
// Handle of the window the message is addressed to.
internal IntPtr hwnd;
// Message identifier.
internal uint message;
// Message-specific parameters.
internal IntPtr wParam;
internal IntPtr lParam;
// Time at which the message was posted.
internal uint time;
// Cursor position (the Win32 POINT member, flattened into two ints).
internal int ptX;
internal int ptY;
}
// Checks the thread's message queue without waiting; returns true and fills
// lpMsg when a message is available. PumpMessages passes IntPtr.Zero and a
// 0..0 filter range.
[DllImport("user32.dll")]
[return: MarshalAs(UnmanagedType.Bool)]
internal static extern bool PeekMessage(
out MSG lpMsg,
IntPtr hWnd,
uint wMsgFilterMin,
uint wMsgFilterMax,
uint wRemoveMsg);
// Translates virtual-key messages into character messages.
[DllImport("user32.dll")]
[return: MarshalAs(UnmanagedType.Bool)]
internal static extern bool TranslateMessage(ref MSG lpMsg);
// Dispatches the message to the target window procedure.
[DllImport("user32.dll")]
internal static extern IntPtr DispatchMessage(ref MSG lpmsg);
}
}
@@ -51,7 +51,7 @@ public sealed class WorkerPipeSession
options,
() => Process.GetCurrentProcess().Id,
new WorkerPipeSessionOptions(),
() => new MxAccessStaSession((eq, affinity) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity)),
() => new MxAccessStaSession((eq, affinity, comFactory) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity, comFactory, standbyFactory: null)),
logger)
{
}
@@ -72,7 +72,7 @@ public sealed class WorkerPipeSession
options,
processIdProvider,
new WorkerPipeSessionOptions(),
() => new MxAccessStaSession((eq, affinity) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity)),
() => new MxAccessStaSession((eq, affinity, comFactory) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity, comFactory, standbyFactory: null)),
logger: null)
{
}
@@ -867,7 +867,7 @@ public sealed class WorkerPipeSession
// parameterless CompleteStartupHandshakeAsync is used without a
// prior factory call.
_runtimeSession ??= new MxAccessStaSession(
(eq, affinity) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity));
(eq, affinity, comFactory) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity, comFactory, standbyFactory: null));
IWorkerRuntimeSession session = _runtimeSession;
try
{
@@ -37,8 +37,14 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler
// Queue that the alarm event sink and provider-mode-change events are written to.
private readonly MxAccessEventQueue eventQueue;
// Builds the primary (alarmmgr) consumer.
private readonly Func<IMxAccessAlarmConsumer> consumerFactory;
// Optional guard invoked at the start of Subscribe before any state is touched.
private readonly Action? threadAffinityCheck;
// COM-object factory used to build the subtag source when no standby factory is supplied.
private readonly IMxAccessComObjectFactory? comFactory;
// Optional seam that builds the standby (subtag) consumer from a watch list.
private readonly Func<IReadOnlyList<AlarmSubtagTarget>, IMxAccessAlarmConsumer>? standbyFactory;
// Shared mapper used by the alarm event sink and for provider-mode-change events.
private readonly MxAccessEventMapper mapper = new MxAccessEventMapper();
// Lock object guarding the subscription state below.
private readonly object syncRoot = new object();
// Dispatcher for the active subscription; null when not subscribed.
private AlarmDispatcher? dispatcher;
// Set when the active consumer is a failover composite, so its event can be detached later.
private FailoverAlarmConsumer? failoverConsumer;
// The exact delegate attached to failoverConsumer.ProviderModeChanged.
private EventHandler<AlarmProviderModeChange>? providerModeChangedHandler;
// Session id captured at Subscribe; stamped onto provider-mode-change events.
private string subscribeSessionId = string.Empty;
// Checked by Subscribe, which throws ObjectDisposedException when set.
private bool disposed;
/// <summary>Initializes a new alarm command handler with the given event queue.</summary>
@@ -79,10 +85,49 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler
MxAccessEventQueue eventQueue,
Func<IMxAccessAlarmConsumer> consumerFactory,
Action? threadAffinityCheck)
: this(eventQueue, consumerFactory, threadAffinityCheck, comFactory: null, standbyFactory: null)
{
}
/// <summary>
/// Most complete constructor. In addition to the event queue, primary
/// consumer factory and thread-affinity guard, it accepts the MXAccess
/// COM-object factory and an optional standby-consumer seam, which the
/// subscribe path needs to build the subtag and failover consumers selected
/// by <see cref="SubscribeAlarmsCommand.ForcedMode"/> and
/// <see cref="SubscribeAlarmsCommand.WatchList"/>.
/// </summary>
/// <param name="eventQueue">Destination queue for emitted events; required.</param>
/// <param name="consumerFactory">
/// Creates the PRIMARY (alarmmgr) consumer; required. It is used on its own
/// in alarmmgr mode and as the primary half of the failover composite in
/// auto mode.
/// </param>
/// <param name="threadAffinityCheck">Optional STA thread-affinity guard.</param>
/// <param name="comFactory">
/// COM-object factory from which the <see cref="LmxSubtagAlarmSource"/>
/// behind the subtag consumer is built. May be <see langword="null"/> when
/// <paramref name="standbyFactory"/> is provided (tests) or when only the
/// alarmmgr path is used.
/// </param>
/// <param name="standbyFactory">
/// Optional seam that creates the STANDBY (subtag) consumer for a watch
/// list. When omitted, a <see cref="SubtagAlarmConsumer"/> over an
/// <see cref="LmxSubtagAlarmSource"/> built from
/// <paramref name="comFactory"/> is used. Tests supply a fake here so no
/// live COM factory is needed.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="eventQueue"/> or <paramref name="consumerFactory"/> is
/// <see langword="null"/>.
/// </exception>
public AlarmCommandHandler(
    MxAccessEventQueue eventQueue,
    Func<IMxAccessAlarmConsumer> consumerFactory,
    Action? threadAffinityCheck,
    IMxAccessComObjectFactory? comFactory,
    Func<IReadOnlyList<AlarmSubtagTarget>, IMxAccessAlarmConsumer>? standbyFactory)
{
    // Validate the two mandatory collaborators, in declaration order.
    if (eventQueue is null)
    {
        throw new ArgumentNullException(nameof(eventQueue));
    }

    if (consumerFactory is null)
    {
        throw new ArgumentNullException(nameof(consumerFactory));
    }

    this.eventQueue = eventQueue;
    this.consumerFactory = consumerFactory;

    // The remaining collaborators are optional and stored as given.
    this.threadAffinityCheck = threadAffinityCheck;
    this.comFactory = comFactory;
    this.standbyFactory = standbyFactory;
}
/// <summary>Gets a value indicating whether the handler is subscribed.</summary>
@@ -92,10 +137,10 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler
}
/// <inheritdoc />
public void Subscribe(string subscription, string sessionId)
public void Subscribe(SubscribeAlarmsCommand command, string sessionId)
{
if (disposed) throw new ObjectDisposedException(nameof(AlarmCommandHandler));
if (subscription is null) throw new ArgumentNullException(nameof(subscription));
if (command is null) throw new ArgumentNullException(nameof(command));
threadAffinityCheck?.Invoke();
lock (syncRoot)
@@ -106,17 +151,31 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler
"AlarmCommandHandler already has an active subscription; " +
"call Unsubscribe before issuing another SubscribeAlarms command.");
}
IMxAccessAlarmConsumer consumer = consumerFactory()
subscribeSessionId = sessionId ?? string.Empty;
IMxAccessAlarmConsumer consumer = BuildConsumer(command)
?? throw new InvalidOperationException("Alarm consumer factory returned null.");
MxAccessAlarmEventSink sink = new MxAccessAlarmEventSink(
eventQueue, new MxAccessEventMapper());
dispatcher = new AlarmDispatcher(consumer, sink, sessionId ?? string.Empty);
// When the selected consumer is a failover composite, surface its
// provider switches onto the worker's event queue so connected
// gateway clients can observe degraded/recovered state. The handler
// is unsubscribed/disposed on Unsubscribe/Dispose below.
if (consumer is FailoverAlarmConsumer failover)
{
failoverConsumer = failover;
providerModeChangedHandler = OnProviderModeChanged;
failover.ProviderModeChanged += providerModeChangedHandler;
}
MxAccessAlarmEventSink sink = new MxAccessAlarmEventSink(eventQueue, mapper);
dispatcher = new AlarmDispatcher(consumer, sink, subscribeSessionId);
try
{
dispatcher.Subscribe(subscription);
dispatcher.Subscribe(command.SubscriptionExpression ?? string.Empty);
}
catch
{
DetachProviderModeChanged();
try { dispatcher.Dispose(); } catch { /* swallow */ }
dispatcher = null;
throw;
@@ -124,6 +183,89 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler
}
}
/// <summary>
/// Selects and builds the alarm consumer from the command's
/// <see cref="SubscribeAlarmsCommand.ForcedMode"/> and
/// <see cref="SubscribeAlarmsCommand.WatchList"/>:
/// <list type="bullet">
/// <item><description>
/// <c>Alarmmgr</c>, or <c>Unspecified</c> with an empty watch list: the
/// primary (alarmmgr) consumer only.
/// </description></item>
/// <item><description>
/// <c>Subtag</c>: the standby (subtag) consumer only.
/// </description></item>
/// <item><description>
/// <c>Unspecified</c> with a non-empty watch list (auto): a
/// <see cref="FailoverAlarmConsumer"/> over the primary and a subtag
/// standby.
/// </description></item>
/// </list>
/// </summary>
/// <param name="command">The subscribe command carrying mode, watch list and failover settings.</param>
/// <returns>The consumer to hand to the dispatcher.</returns>
/// <exception cref="InvalidOperationException">
/// A factory returned <see langword="null"/>, or a subtag consumer was
/// required but could not be built.
/// </exception>
/// <remarks>
/// In auto mode, if building the standby or the failover composite fails,
/// any consumer already created is disposed (best effort) before the
/// exception propagates, so a failed subscribe does not leak it.
/// </remarks>
private IMxAccessAlarmConsumer BuildConsumer(SubscribeAlarmsCommand command)
{
    // Defaults applied when the command carries no failover configuration.
    const int DefaultConsecutiveFailureThreshold = 3;
    const int DefaultFailbackProbeIntervalSeconds = 30;
    const int DefaultFailbackStableProbes = 3;

    // Disposes a partially-built consumer without masking the original failure.
    static void DisposeQuietly(object? candidate)
    {
        try { (candidate as IDisposable)?.Dispose(); }
        catch { /* best-effort cleanup; the original exception is rethrown by the caller */ }
    }

    List<AlarmSubtagTarget> watchList = new List<AlarmSubtagTarget>(command.WatchList);

    if (command.ForcedMode == AlarmProviderMode.Subtag)
    {
        return BuildStandby(watchList);
    }

    if (command.ForcedMode == AlarmProviderMode.Unspecified && watchList.Count > 0)
    {
        IMxAccessAlarmConsumer primary = consumerFactory()
            ?? throw new InvalidOperationException("Alarm consumer factory returned null.");
        IMxAccessAlarmConsumer? standby = null;
        try
        {
            standby = BuildStandby(watchList);
            AlarmFailoverConfig? failoverConfig = command.Failover;
            FailoverSettings settings = new FailoverSettings(
                failoverConfig?.ConsecutiveFailureThreshold ?? DefaultConsecutiveFailureThreshold,
                failoverConfig?.FailbackProbeIntervalSeconds ?? DefaultFailbackProbeIntervalSeconds,
                failoverConfig?.FailbackStableProbes ?? DefaultFailbackStableProbes);
            return new FailoverAlarmConsumer(primary, standby, settings);
        }
        catch
        {
            // The composite never took ownership; release what was built so far.
            DisposeQuietly(standby);
            DisposeQuietly(primary);
            throw;
        }
    }

    // Alarmmgr, or Unspecified with an empty watch list — primary only.
    return consumerFactory()
        ?? throw new InvalidOperationException("Alarm consumer factory returned null.");
}
/// <summary>
/// Creates the standby (subtag) consumer for <paramref name="watchList"/>.
/// An injected standby factory takes precedence; otherwise a
/// <see cref="SubtagAlarmConsumer"/> over an
/// <see cref="LmxSubtagAlarmSource"/> is built from the COM-object factory.
/// </summary>
/// <param name="watchList">The alarm subtag targets to watch.</param>
/// <returns>The standby consumer.</returns>
/// <exception cref="InvalidOperationException">
/// The injected factory returned <see langword="null"/>, or neither a
/// standby factory nor a COM-object factory is available.
/// </exception>
private IMxAccessAlarmConsumer BuildStandby(IReadOnlyList<AlarmSubtagTarget> watchList)
{
    if (standbyFactory is null)
    {
        // No seam injected: fall back to the COM-backed subtag source.
        if (comFactory is null)
        {
            throw new InvalidOperationException(
                "Subtag alarm consumer requires an IMxAccessComObjectFactory; the alarm command "
                + "handler was constructed without one and no standby factory was supplied.");
        }

        LmxSubtagAlarmSource subtagSource = new LmxSubtagAlarmSource(comFactory);
        return new SubtagAlarmConsumer(subtagSource, watchList);
    }

    IMxAccessAlarmConsumer? injected = standbyFactory(watchList);
    if (injected is null)
    {
        throw new InvalidOperationException("Standby alarm consumer factory returned null.");
    }

    return injected;
}
/// <summary>
/// Handles a provider-mode change raised by the failover consumer by mapping
/// it, together with the session id captured at subscribe time, into an
/// event and placing that event on the event queue. A <see langword="null"/>
/// change is ignored.
/// </summary>
/// <param name="sender">The event source (unused).</param>
/// <param name="change">Details of the mode change.</param>
private void OnProviderModeChanged(object? sender, AlarmProviderModeChange change)
{
    if (change is not null)
    {
        var modeChangedEvent = mapper.CreateOnAlarmProviderModeChanged(
            subscribeSessionId,
            change.Mode,
            change.Reason,
            change.HResult,
            change.AtUtc);
        eventQueue.Enqueue(modeChangedEvent);
    }
}
/// <summary>
/// Removes the provider-mode-changed handler from the tracked failover
/// consumer, if both are present, and then clears both tracking fields.
/// Any exception thrown while detaching is swallowed.
/// </summary>
private void DetachProviderModeChanged()
{
    try
    {
        if (failoverConsumer is { } attached && providerModeChangedHandler is { } handler)
        {
            attached.ProviderModeChanged -= handler;
        }
    }
    catch
    {
        /* swallow */
    }
    finally
    {
        // Always forget the pair, whether or not the detach succeeded.
        failoverConsumer = null;
        providerModeChangedHandler = null;
    }
}
/// <inheritdoc />
public void Unsubscribe()
{
@@ -131,6 +273,7 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler
AlarmDispatcher? toDispose;
lock (syncRoot)
{
DetachProviderModeChanged();
toDispose = dispatcher;
dispatcher = null;
}

Some files were not shown because too many files have changed in this diff Show More