From 1164d423b6e1fb96fef4079dce6d4f59f077f16f Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Tue, 16 Jun 2026 07:32:59 -0400 Subject: [PATCH] =?UTF-8?q?fix(probe):=20Galaxy=20gRPC=20ping=20=E2=80=94?= =?UTF-8?q?=20drop=20invalid=20Retry,=20treat=20MxGatewayAuth=20exceptions?= =?UTF-8?q?=20as=20reachable=20(live=20/run)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs caught by live verification against the mxaccessgw at 10.100.0.48:5120: - MaxAttempts=1 produced an invalid Polly RetryStrategyOptions -> the probe failed on every real gateway. Removed the Retry override (matches GalaxyDriver); fail-fast is already guaranteed by the TCP preflight + the per-call deadline. - A rejected key surfaces as a typed MxGatewayAuthenticationException, not a raw RpcException, so 'auth-rejection = reachable' was bypassed. Catch the typed auth/ authorization exceptions -> Ok=true. Adds DriverProbeHandshakeE2eTests: direct-probe, skip-gated cross-protocol green/red discrimination (Modbus, OpcUaClient, Galaxy + a local real OPC UA server). --- docs/drivers/TestConnectProbes.md | 12 +- .../GalaxyDriverProbe.cs | 17 +- .../DriverProbeHandshakeE2eTests.cs | 180 ++++++++++++++++++ 3 files changed, 205 insertions(+), 4 deletions(-) create mode 100644 tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/DriverProbeHandshakeE2eTests.cs diff --git a/docs/drivers/TestConnectProbes.md b/docs/drivers/TestConnectProbes.md index c539f888..dcdba91d 100644 --- a/docs/drivers/TestConnectProbes.md +++ b/docs/drivers/TestConnectProbes.md @@ -107,10 +107,20 @@ key string in the transient config (possibly empty or unresolved) is used as-is. - `Unavailable` / transport failure → `Ok=false` (gateway is down or unreachable). 
- `Unauthenticated` / `PermissionDenied` → **`Ok=true`**, - `"gateway reachable & speaking gRPC; auth not checked"` — an auth rejection + `"gateway reachable & speaking gRPC (auth not checked)"` — an auth rejection proves a live mxaccessgw gRPC server. This is the correct result: the driver's own session-layer will handle auth; the probe is testing reachability only. +The mxaccessgw client surfaces a rejected key as a typed +`MxGatewayAuthenticationException` / `MxGatewayAuthorizationException`, **not** a +raw `RpcException` — the probe catches both typed exceptions and maps them to the +reachable result above. (Live verification on `10.100.0.48:5120` with no key returns +`MxGatewayAuthenticationException("Missing or invalid API key.")` → `Ok=true`.) + +> **Config note:** `UseTls` must match the endpoint scheme — `UseTls:false` for an +> `http://` (h2c) gateway, `UseTls:true` for `https://`. A mismatch fails the +> client's own validation (the same constraint the Galaxy driver enforces). + --- ## Live-verify scope diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs index 726b88a1..eb940508 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs @@ -101,6 +101,15 @@ public sealed class GalaxyDriverProbe : IDriverProbe var (ok, message) = ClassifyRpc(ex.StatusCode, host, port); return new(ok, message, ok ? sw.Elapsed : null); } + catch (Exception ex) when (ex is MxGatewayAuthenticationException or MxGatewayAuthorizationException) + { + // The gateway ran its authentication/authorization check on our call and rejected the + // (unresolved / placeholder) key — the mxaccessgw client surfaces this as a typed + // exception, NOT a raw RpcException. It still PROVES a live gateway gRPC server answered, + // so auth rejection counts as reachable (the probe never resolves the real secret). 
+ sw.Stop(); + return new(true, "gateway reachable & speaking gRPC (auth not checked)", sw.Elapsed); + } catch (OperationCanceledException) when (ct.IsCancellationRequested) { // The caller cancelled (their own timeout / shutdown) — surface a timeout message. @@ -169,9 +178,11 @@ public sealed class GalaxyDriverProbe : IDriverProbe CaCertificatePath = gw.CaCertificatePath, ConnectTimeout = budget, DefaultCallTimeout = budget, - // One shot — the probe must not spin on transient (Unavailable/DeadlineExceeded) - // retries; the linked deadline above bounds the whole call regardless. - Retry = new MxGatewayClientRetryOptions { MaxAttempts = 1 }, + // Leave Retry at the client default (as GalaxyDriver does) — an explicit + // MaxAttempts=1 maps to 0 Polly retries, which Polly rejects as an invalid + // RetryStrategyOptions. Fast-fail is already guaranteed: the TCP preflight rejects + // unreachable hosts before the gRPC call, and the linked deadline caps the call to + // the probe budget regardless of retries. }; } diff --git a/tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/DriverProbeHandshakeE2eTests.cs b/tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/DriverProbeHandshakeE2eTests.cs new file mode 100644 index 00000000..6dbb1367 --- /dev/null +++ b/tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests/DriverProbeHandshakeE2eTests.cs @@ -0,0 +1,180 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.AbCip; +using ZB.MOM.WW.OtOpcUa.Driver.Galaxy; +using ZB.MOM.WW.OtOpcUa.Driver.Modbus; +using ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient; +using ZB.MOM.WW.OtOpcUa.Driver.S7; + +namespace ZB.MOM.WW.OtOpcUa.Host.IntegrationTests; + +/// +/// Phase 5 live verification that the real protocol-handshake Test-Connect probes actually +/// discriminate a speaking device from a merely-TCP-reachable one. 
Each probe is exercised +/// DIRECTLY (no cluster harness / SQL needed) against the shared docker-host sims, skip-gated +/// on reachability so dotnet test stays clean on a machine without fixture access. +/// +/// The decisive assertions are the cross-protocol RED cases: pointing a probe at a +/// DIFFERENT protocol's open port (which accepts TCP but does not speak the probe's protocol) +/// must now read Ok = false — the exact false-green bug Phase 5 fixes. Before Phase 5 +/// every one of these read a false-healthy green. +/// +/// S7 (:1102) and AbCip (:44818) happy-path verification skips unless those +/// fixtures are up (lmxopcua-fix up s7 s7_1500 / up abcip controllogix); they are +/// unit-proven + code-reviewed. AbLegacy / TwinCAT / FOCAS have no rig target and are +/// unit-proven + degrade-guarded only (see docs/drivers/TestConnectProbes.md). +/// +[Trait("Category", "Integration")] +[Trait("Phase", "5-probes")] +public sealed class DriverProbeHandshakeE2eTests +{ + private const string DockerHost = "10.100.0.35"; + private const int ModbusPort = 5020; // pymodbus sim — speaks Modbus + private const int OpcUaPort = 50000; // opc-plc — speaks OPC UA + private const int S7Port = 1102; + private const int AbCipPort = 44818; + private const string GalaxyHost = "10.100.0.48"; + private const int GalaxyPort = 5120; // mxaccessgw — speaks gRPC + + // Local docker-dev rig (on the dev host): a REAL OPC UA server + a real non-OPC-UA server. 
+ private const int LocalOpcUaPort = 4840; // central-1 OtOpcUa OPC UA server — speaks OPC UA + private const int LocalSqlPort = 14330; // SQL Server — accepts TCP, speaks neither OPC UA nor gRPC + + private static readonly TimeSpan Timeout = TimeSpan.FromSeconds(10); + private static CancellationToken Ct => TestContext.Current.CancellationToken; + + private static void SkipUnless(string host, int port) + { + // Generous timeout: the first connect from a cold test process (JIT + DNS warmup) can + // exceed the 500 ms default, and these targets may be a VPN hop away. + if (!DockerFixtureAvailability.IsReachable(host, port, 3000)) + Assert.Skip($"Fixture {host}:{port} unreachable — skipping live handshake check."); + } + + // ---- Modbus : FC03 handshake ---- + + [Fact] + public async Task Modbus_Green_AgainstModbusSim() + { + SkipUnless(DockerHost, ModbusPort); + var result = await new ModbusDriverProbe().ProbeAsync( + $"{{\"Host\":\"{DockerHost}\",\"Port\":{ModbusPort}}}", Timeout, Ct); + result.Ok.ShouldBeTrue($"Probe message: {result.Message}"); + result.Message!.ShouldContain("Modbus FC03"); + result.Latency.ShouldNotBeNull(); + } + + [Fact] + public async Task Modbus_Red_AgainstNonModbusPort() + { + // The OPC UA port accepts TCP but does not speak Modbus — must NOT read green. 
+ SkipUnless(DockerHost, OpcUaPort); + var result = await new ModbusDriverProbe().ProbeAsync( + $"{{\"Host\":\"{DockerHost}\",\"Port\":{OpcUaPort}}}", Timeout, Ct); + result.Ok.ShouldBeFalse("A non-Modbus TCP server must not pass the FC03 handshake."); + } + + // ---- OpcUaClient : GetEndpoints handshake ---- + + [Fact] + public async Task OpcUaClient_Green_AgainstOpcPlc() + { + SkipUnless(DockerHost, OpcUaPort); + var result = await new OpcUaClientDriverProbe().ProbeAsync( + $"{{\"EndpointUrl\":\"opc.tcp://{DockerHost}:{OpcUaPort}\"}}", Timeout, Ct); + result.Ok.ShouldBeTrue($"Probe message: {result.Message}"); + result.Message!.ShouldContain("OPC UA"); + result.Latency.ShouldNotBeNull(); + } + + [Fact] + public async Task OpcUaClient_Red_AgainstNonOpcUaPort() + { + // The Modbus port accepts TCP but does not speak OPC UA — must NOT read green. + SkipUnless(DockerHost, ModbusPort); + var result = await new OpcUaClientDriverProbe().ProbeAsync( + $"{{\"EndpointUrl\":\"opc.tcp://{DockerHost}:{ModbusPort}\"}}", Timeout, Ct); + result.Ok.ShouldBeFalse("A non-OPC-UA TCP server must not pass the GetEndpoints handshake."); + } + + // ---- Galaxy : gRPC ping (auth-rejection = reachable) ---- + + [Fact] + public async Task Galaxy_Green_AgainstGateway() + { + SkipUnless(GalaxyHost, GalaxyPort); + // No API key supplied — an Unauthenticated reply still proves a live mxaccessgw gRPC server. + // UseTls:false matches the dev gateway's http2-cleartext endpoint (mirrors the dev config). + var result = await new GalaxyDriverProbe().ProbeAsync( + $"{{\"Gateway\":{{\"Endpoint\":\"http://{GalaxyHost}:{GalaxyPort}\",\"UseTls\":false}}}}", Timeout, Ct); + result.Ok.ShouldBeTrue($"Probe message: {result.Message}"); + result.Latency.ShouldNotBeNull(); + } + + [Fact] + public async Task Galaxy_Red_AgainstNonGrpcPort() + { + // The Modbus port accepts TCP but does not speak gRPC — must NOT read green. 
+ SkipUnless(DockerHost, ModbusPort); + var result = await new GalaxyDriverProbe().ProbeAsync( + $"{{\"Gateway\":{{\"Endpoint\":\"http://{DockerHost}:{ModbusPort}\",\"UseTls\":false}}}}", Timeout, Ct); + result.Ok.ShouldBeFalse("A non-gRPC TCP server must not pass the gateway gRPC handshake."); + } + + // ---- Local docker-dev rig: real OPC UA server (central-1) vs a real non-OPC-UA server ---- + + [Fact] + public async Task OpcUaClient_Green_AgainstLocalOtOpcUaServer() + { + SkipUnless("127.0.0.1", LocalOpcUaPort); + var result = await new OpcUaClientDriverProbe().ProbeAsync( + $"{{\"EndpointUrl\":\"opc.tcp://127.0.0.1:{LocalOpcUaPort}\"}}", Timeout, Ct); + result.Ok.ShouldBeTrue($"Probe message: {result.Message}"); + result.Message!.ShouldContain("OPC UA"); + result.Latency.ShouldNotBeNull(); + } + + [Fact] + public async Task OpcUaClient_Red_AgainstLocalNonOpcUaServer() + { + // SQL Server accepts TCP but does not speak OPC UA — the false-green bug Phase 5 fixes. + SkipUnless("127.0.0.1", LocalSqlPort); + var result = await new OpcUaClientDriverProbe().ProbeAsync( + $"{{\"EndpointUrl\":\"opc.tcp://127.0.0.1:{LocalSqlPort}\"}}", Timeout, Ct); + result.Ok.ShouldBeFalse("A SQL Server (non-OPC-UA) must not pass the GetEndpoints handshake."); + } + + [Fact] + public async Task Modbus_Red_AgainstLocalNonModbusServer() + { + // SQL Server accepts TCP but does not speak Modbus. 
+ SkipUnless("127.0.0.1", LocalSqlPort); + var result = await new ModbusDriverProbe().ProbeAsync( + $"{{\"Host\":\"127.0.0.1\",\"Port\":{LocalSqlPort}}}", Timeout, Ct); + result.Ok.ShouldBeFalse("A SQL Server (non-Modbus) must not pass the FC03 handshake."); + } + + // ---- S7 : Plc.OpenAsync handshake (skips unless the sim fixture is up) ---- + + [Fact] + public async Task S7_Green_AgainstSim() + { + SkipUnless(DockerHost, S7Port); + var result = await new S7DriverProbe().ProbeAsync( + $"{{\"Host\":\"{DockerHost}\",\"Port\":{S7Port},\"CpuType\":\"S71500\",\"Rack\":0,\"Slot\":1}}", Timeout, Ct); + result.Ok.ShouldBeTrue($"Probe message: {result.Message}"); + result.Message!.ShouldContain("S7 connected"); + } + + // ---- AbCip : libplctag CIP session handshake (skips unless the sim fixture is up) ---- + + [Fact] + public async Task AbCip_Green_AgainstSim() + { + SkipUnless(DockerHost, AbCipPort); + var result = await new AbCipDriverProbe().ProbeAsync( + $"{{\"Devices\":[{{\"HostAddress\":\"ab://{DockerHost}:{AbCipPort}/1,0\"}}]}}", Timeout, Ct); + result.Ok.ShouldBeTrue($"Probe message: {result.Message}"); + result.Message!.ShouldContain("CIP session OK"); + } +}