From d2cc4a12226ecc4f06884e4d7519fd314a4820b0 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Thu, 11 Jun 2026 11:12:49 -0400 Subject: [PATCH] test+docs(galaxy): verify alarm acknowledger recovers after transient fault; document alarm-client reconnect --- docs/drivers/Galaxy.md | 2 + .../GatewayGalaxyAlarmAcknowledgerTests.cs | 202 ++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GatewayGalaxyAlarmAcknowledgerTests.cs diff --git a/docs/drivers/Galaxy.md b/docs/drivers/Galaxy.md index b0cc8054..d9326532 100644 --- a/docs/drivers/Galaxy.md +++ b/docs/drivers/Galaxy.md @@ -92,6 +92,8 @@ Full per-field descriptions live in `Config/GalaxyDriverOptions.cs`. The full JS `ReconnectSupervisor` owns an exponential-backoff loop bounded by `Reconnect.InitialBackoffMs` / `MaxBackoffMs`. On session loss it calls `GalaxyDriver.ReopenAsync`, which invokes `GalaxyMxSession.RecreateAsync` to dispose the stale/faulted session and client before rebuilding (`OpenSessionAsync` + `RegisterAsync`). Previously `ConnectAsync` was a no-op when a stale session handle was still present, so the reopen supervisor looped forever without recovering. After a successful reopen — when `ReplayOnSessionLost = true` — the supervisor calls the gateway's `ReplaySubscriptions` RPC with the cached subscription set from `SubscriptionRegistry` instead of re-subscribing tag-by-tag. The gateway's worker then re-issues `AdviseSupervisory` server-side under the apartment lock. +The session-less alarm feed (`GatewayGalaxyAlarmFeed`) and alarm acknowledger (`GatewayGalaxyAlarmAcknowledger`) run on a separate always-on `_ownedMxClient` that is intentionally **not** recreated when `ReconnectSupervisor` rebuilds the worker session. The feed has its own re-invoke loop (~5 s backoff) that reopens `StreamAlarms` after any stream fault, and gRPC.NET's channel auto-reconnect recovers the underlying HTTP/2 connection after a gateway restart — so both the alarm stream and individual ack calls recover without client replacement. The acknowledger is completely stateless between calls: each `AcknowledgeAsync` issues a single unary RPC and returns; no dead-client state is latched on failure. Channel-level keepalive hardening (TCP keep-alive intervals, gRPC ping frames) would require exposing additional knobs on `MxGatewayClientOptions` in the sibling `mxaccessgw` repo — a future option if flaky long-lived connections are observed in production. + ## Testing - **Unit tests**: `tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/` — fakes the gateway gRPC surface; covers Browse, Runtime, Health, and Config in isolation. diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GatewayGalaxyAlarmAcknowledgerTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GatewayGalaxyAlarmAcknowledgerTests.cs new file mode 100644 index 00000000..dad23968 --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/Runtime/GatewayGalaxyAlarmAcknowledgerTests.cs @@ -0,0 +1,202 @@ +using Microsoft.Extensions.Logging.Abstractions; +using Shouldly; +using Xunit; +using ZB.MOM.WW.MxGateway.Client; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; +using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config; +using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime; + +/// +/// Pins — the session-less unary caller +/// that routes operator acknowledgements through +/// MxGatewayClient.AcknowledgeAlarmAsync. +/// +/// +/// The acknowledger is completely stateless w.r.t. transport faults: it holds an +/// but has no internal dead-client latch. Each +/// AcknowledgeAsync call issues exactly one unary RPC; if that RPC throws the +/// call site (the scripted-alarm engine) is responsible for retry. gRPC.NET channel +/// auto-reconnect ensures the underlying HTTP/2 connection recovers between calls +/// without the client being replaced — these tests verify that behaviour. +/// +public sealed class GatewayGalaxyAlarmAcknowledgerTests +{ + // ------------------------------------------------------------------ helpers + + /// + /// Fake that records every call and lets + /// tests control per-call outcomes. is + /// the internal test seam — production wires + /// behind it via the + /// internal constructor. + /// + private sealed class FakeAcknowledger : IGalaxyAlarmAcknowledger + { + private readonly Queue> _outcomes = new(); + + /// Calls recorded in the order received. + public List<(string AlarmRef, string Comment, string Operator)> Calls { get; } = []; + + /// Enqueues an outcome for the next call (completes normally or throws). + public void Enqueue(Func outcome) => _outcomes.Enqueue(outcome); + + public Task AcknowledgeAsync( + string alarmFullReference, + string comment, + string operatorUser, + CancellationToken cancellationToken) + { + Calls.Add((alarmFullReference, comment, operatorUser)); + return _outcomes.Count > 0 ? _outcomes.Dequeue()() : Task.CompletedTask; + } + } + + private static GalaxyDriverOptions Opts() => new( + new GalaxyGatewayOptions("https://mxgw.test:5001", "key"), + new GalaxyMxAccessOptions("OtOpcUa-A"), + new GalaxyRepositoryOptions(), + new GalaxyReconnectOptions()); + + private static AlarmAcknowledgeRequest Ack(string conditionId, string comment = "c", string source = "") + => new(SourceNodeId: source, ConditionId: conditionId, Comment: comment); + + // ------------------------------------------------------------------ tests + + /// + /// Verifies the acknowledger is stateless: a transient fault on the first call + /// does NOT latch it into a dead state — a second call after the fault still + /// reaches the acknowledger and succeeds. + /// + [Fact] + public async Task Acknowledge_after_transient_fault_succeeds_on_retry() + { + var fake = new FakeAcknowledger(); + // First call: simulates a transient transport blip (e.g. RpcException from gRPC). + fake.Enqueue(() => Task.FromException(new IOException("synthetic transport fault"))); + // Second call: succeeds — proves no dead-state latch. + fake.Enqueue(() => Task.CompletedTask); + + var driver = new GalaxyDriver("g", Opts(), + hierarchySource: null, alarmAcknowledger: fake); + + await Should.ThrowAsync( + () => driver.AcknowledgeAsync( + [Ack("Tank01.Level.HiHi", "investigating")], CancellationToken.None)); + + // After the first failure the fake (and by contract the production + // GatewayGalaxyAlarmAcknowledger) must still be reachable — it holds no + // dead-client latch. + await driver.AcknowledgeAsync( + [Ack("Tank01.Level.HiHi", "resolved")], CancellationToken.None); + + fake.Calls.Count.ShouldBe(2); + fake.Calls[0].AlarmRef.ShouldBe("Tank01.Level.HiHi"); + fake.Calls[0].Comment.ShouldBe("investigating"); + fake.Calls[1].Comment.ShouldBe("resolved"); + } + + /// + /// Verifies that a second acknowledge call after a protocol-level failure + /// still reaches the transport — confirming no state leaks across calls. + /// + [Fact] + public async Task Acknowledge_after_protocol_failure_still_issues_next_call() + { + var fake = new FakeAcknowledger(); + fake.Enqueue(() => Task.FromException(new InvalidOperationException("gateway overload"))); + fake.Enqueue(() => Task.CompletedTask); + + var driver = new GalaxyDriver("g", Opts(), + hierarchySource: null, alarmAcknowledger: fake); + + await Should.ThrowAsync( + () => driver.AcknowledgeAsync( + [Ack("Valve01.PV.Hi", "c1")], CancellationToken.None)); + + await driver.AcknowledgeAsync( + [Ack("Valve01.PV.Hi", "c2")], CancellationToken.None); + + fake.Calls.Count.ShouldBe(2); + } + + /// + /// Verifies that the driver's AcknowledgeAsync forwards the condition ID + /// (alarm full reference) and comment to the acknowledger without modification. + /// + [Fact] + public async Task Acknowledge_forwards_all_fields_to_acknowledger() + { + var fake = new FakeAcknowledger(); + + var driver = new GalaxyDriver("g", Opts(), + hierarchySource: null, alarmAcknowledger: fake); + + await driver.AcknowledgeAsync( + [Ack("Pump01.Flow.Low", "operator comment")], CancellationToken.None); + + fake.Calls.ShouldHaveSingleItem(); + fake.Calls[0].AlarmRef.ShouldBe("Pump01.Flow.Low"); + fake.Calls[0].Comment.ShouldBe("operator comment"); + } + + // ------------------------------------------------------------------ + // Production class smoke — exercises GatewayGalaxyAlarmAcknowledger directly. + // + // IMxGatewayClientTransport (the transport seam on MxGatewayClient) is internal + // to the NuGet package and cannot be implemented from outside the package. + // The lightest available smoke is: construct GatewayGalaxyAlarmAcknowledger with + // a real MxGatewayClient pointing at an unreachable endpoint, call it twice, and + // verify both calls fault with the same transport-level exception type — NOT an + // ObjectDisposedException or any "client is in a terminal dead state" variant. + // That proves the concrete class latches no dead-client state between calls. + // ------------------------------------------------------------------ + + /// + /// Verifies that is stateless: + /// two consecutive calls both throw the same transport fault rather than a + /// degraded "object-already-failed" state exception on the second call. + /// + [Fact] + public async Task GatewayAcknowledger_consecutive_faults_both_reach_grpc_layer() + { + // Point at a guaranteed-unreachable endpoint so every call fails at the + // gRPC transport layer. Both calls must throw the same *kind* of exception + // (transport failure), not a "client object is dead" exception on the second. + var client = MxGatewayClient.Create(new MxGatewayClientOptions + { + Endpoint = new Uri("http://127.0.0.1:1", UriKind.Absolute), // port 1 = unreachable + ApiKey = "test-key", + UseTls = false, + ConnectTimeout = TimeSpan.FromMilliseconds(200), + DefaultCallTimeout = TimeSpan.FromMilliseconds(200), + }); + + await using (client) + { + var acknowledger = new GatewayGalaxyAlarmAcknowledger(client, NullLogger.Instance); + + var ex1 = await Record.ExceptionAsync( + () => acknowledger.AcknowledgeAsync( + "Tank01.Level.HiHi", "c1", "alice", CancellationToken.None)); + var ex2 = await Record.ExceptionAsync( + () => acknowledger.AcknowledgeAsync( + "Tank01.Level.HiHi", "c2", "alice", CancellationToken.None)); + + // Both calls must fault — not a success. + ex1.ShouldNotBeNull("first call to unreachable endpoint should throw"); + ex2.ShouldNotBeNull("second call to unreachable endpoint should throw"); + + // The key invariant: neither call must throw ObjectDisposedException. + // If the acknowledger latched a dead-client state after the first failure, + // the second call would surface an "object is disposed/dead" error instead + // of another transport fault. Both must be transport-level failures. + ex1.ShouldNotBeOfType( + "first call must not get an 'object is disposed' error"); + ex2.ShouldNotBeOfType( + "second call must not get a 'client is dead/disposed' error — " + + "the acknowledger must not latch any dead state between calls"); + } + } +}