From b663ae6effd2f3f0c8aa5df16f9d96c89777631e Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Tue, 16 Jun 2026 06:48:22 -0400 Subject: [PATCH] feat(probe): TwinCAT Test-Connect does an ADS ReadState (degrade-guarded) --- .../TwinCATDriverProbe.cs | 158 +++++++++++-- .../TwinCATDriverProbeTests.cs | 222 ++++++++++++++++++ 2 files changed, 362 insertions(+), 18 deletions(-) create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT.Tests/TwinCATDriverProbeTests.cs diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT/TwinCATDriverProbe.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT/TwinCATDriverProbe.cs index e77d4dc2..fedf29cf 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT/TwinCATDriverProbe.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT/TwinCATDriverProbe.cs @@ -2,22 +2,65 @@ using System.Diagnostics; using System.Net.Sockets; using System.Text.Json; using System.Text.Json.Serialization; +using TwinCAT.Ads; using ZB.MOM.WW.OtOpcUa.Core.Abstractions; namespace ZB.MOM.WW.OtOpcUa.Driver.TwinCAT; /// -/// Cheap TCP-connect probe for the -shaped driver config. -/// Opens a socket to the first device's AMS router host (first four octets of the AMS Net ID) -/// on the AMS port from the address and closes immediately. Surfaces a green tick + latency -/// on success; red chip + SocketError on failure; "timed out" on the caller's cancellation. -/// Does NOT exchange any ADS bytes — a richer ADS-state probe is a documented follow-up. +/// Two-phase, degrade-guarded Test-Connect probe for the -shaped +/// driver config. Phase 1: a bare TCP connect to the first device's AMS router host (first four +/// octets of the AMS Net ID) on the AMS port — fast rejection of unreachable targets. Phase 2: +/// a real ADS handshake — AdsClient.Connect(netId, port) + ReadStateAsync — to +/// confirm the endpoint speaks ADS and report the controller's run-state, not just that a TCP +/// socket opened. +/// +/// Outcome classification — three cases: +/// +/// +/// ADS connected + ReadState OKOk=true, message "ADS state: {AdsState}" +/// (e.g. "Run" / "Config" / "Stop"), with latency. +/// +/// +/// Route/auth rejection from a reachable router — an +/// (or a non-success ReadStateAsync result) whose means the +/// router answered but won't let us in (e.g. , +/// , , +/// ) → Ok=false, message +/// "Reachable at {host}:{port} but ADS handshake failed: {code} — check the target's ADS +/// route table authorizes this host". This is a TRUE red: the driver itself also needs +/// the route, so a green tick here would be a false positive. +/// +/// +/// Handshake could not be ATTEMPTED on this host — the managed AMS router cannot run +/// headless (Beckhoff's AdsClient.Connect throws a server exception +/// "Check for a running TwinCAT router instance!"), or a +/// / / / +/// surfaces, or ReadStateAsync reports a client-side +/// port-not-open status → DEGRADE: Ok=true, message +/// "Reachable at {host}:{port} (ADS handshake unavailable on this host — TCP reachability +/// only)", with latency. The probe NEVER produces a result worse than the old TCP-only +/// probe. +/// +/// +/// /// /// -/// AMS Net ID format is six dot-separated octets (e.g. 192.168.1.10.1.1); the first -/// four are typically the host IPv4 address by Beckhoff convention, but the AMS router -/// resolves the real IP route server-side. The probe uses the first-four-octet heuristic +/// The line between case 2 (device-rejected → RED) and case 3 (can't-attempt → DEGRADE) is the +/// crux. Classification rests on the exception's identity: only an +/// (or a result ) carrying a route/target-port code is a RED — that is the +/// ADS router answering and refusing the route. Beckhoff's TwinCAT.Ads.Server.AdsServerException +/// ("running TwinCAT router instance!") derives from plain , NOT from +/// , so it is correctly classified as "can't attempt → DEGRADE". +/// When genuinely ambiguous, the probe DEGRADES (Ok=true, TCP-only note) rather than emit a false RED. +/// +/// AMS Net ID format is six dot-separated octets (e.g. 192.168.1.10.1.1); the first four +/// are typically the host IPv4 address by Beckhoff convention. The AMS router resolves the real IP +/// route server-side; the probe uses the first-four-octet heuristic for the TCP preflight target, /// which is reliable for the overwhelming majority of production deployments. +/// +/// The probe is read-only — ReadStateAsync never mutates PLC state — and always disposes the +/// . /// public sealed class TwinCATDriverProbe : IDriverProbe { @@ -27,6 +70,22 @@ public sealed class TwinCATDriverProbe : IDriverProbe UnmappedMemberHandling = JsonUnmappedMemberHandling.Skip, }; + /// + /// AMS error codes that mean "the router answered but refused the route / target port" — a + /// genuine RED. The driver itself would also be denied, so a green tick would be a false + /// positive. Everything NOT in this set (client-side port/connection errors, sync timeouts, + /// router-not-initialised, etc.) is treated as "couldn't attempt the handshake" → DEGRADE. + /// + private static readonly HashSet _routeRejectCodes = + [ + AdsErrorCode.TargetPortNotFound, + AdsErrorCode.TargetMachineNotFound, + AdsErrorCode.PortNotConnected, + AdsErrorCode.PortDisabled, + AdsErrorCode.AccessDenied, + AdsErrorCode.DeviceAccessDenied, + ]; + /// public string DriverType => "TwinCAT"; @@ -38,23 +97,23 @@ public sealed class TwinCATDriverProbe : IDriverProbe catch (Exception ex) { return new(false, $"Config JSON is invalid: {ex.Message}", null); } if (opts is null) return new(false, "Config JSON deserialized to null.", null); - var (host, port) = ExtractTarget(opts); - if (string.IsNullOrWhiteSpace(host) || port <= 0) + var (host, port, parsed) = ExtractTarget(opts); + if (parsed is null || string.IsNullOrWhiteSpace(host) || port <= 0) return new(false, "Config has no host/port to probe.", null); + // Phase 1: bare TCP preflight — fast rejection for unreachable hosts. Messages here are + // UNCHANGED from the original TCP-only probe. var sw = Stopwatch.StartNew(); try { using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); await socket.ConnectAsync(host, port, ct); - sw.Stop(); - return new(true, null, sw.Elapsed); } catch (SocketException ex) { return new(false, $"Connect failed: {ex.SocketErrorCode}", null); } - catch (OperationCanceledException) + catch (OperationCanceledException) when (ct.IsCancellationRequested) { return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null); } @@ -62,23 +121,86 @@ public sealed class TwinCATDriverProbe : IDriverProbe { return new(false, ex.Message, null); } + + // Phase 2: real ADS handshake. Connect + ReadStateAsync. The crux is the three-way + // classification of how this can fail — see the class-doc and ClassifyHandshakeFailure. + var degradeNote = $"Reachable at {host}:{port} (ADS handshake unavailable on this host — TCP reachability only)"; + try + { + using var client = new AdsClient(); + // Bound the ADS round-trip by the caller's timeout (clamped >=1s, mirrors AdsTwinCATClient). + client.Timeout = (int)Math.Max(1_000, timeout.TotalMilliseconds); + + // Connect can throw a server exception ("running TwinCAT router instance!") on a headless + // host with no AMS router — that is a can't-attempt DEGRADE, classified below. + client.Connect(AmsNetId.Parse(parsed.NetId), parsed.Port); + + var state = await client.ReadStateAsync(ct).ConfigureAwait(false); + sw.Stop(); + + if (state.Succeeded) + return new(true, $"ADS state: {state.State.AdsState}", sw.Elapsed); + + // Non-throwing failure carried in the result's error code. + return state.ErrorCode == AdsErrorCode.ClientPortNotOpen + ? new(true, degradeNote, sw.Elapsed) // client never opened — DEGRADE + : ClassifyHandshakeFailure(state.ErrorCode, host, port, sw, degradeNote); + } + catch (OperationCanceledException) when (ct.IsCancellationRequested) + { + // Caller timeout — keep the original timed-out message. + return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null); + } + catch (AdsErrorException ex) + { + // The router answered with an ADS-level error. Route/auth rejection → RED; anything + // else (sync timeout, client port issues, …) → DEGRADE. + return ClassifyHandshakeFailure(ex.ErrorCode, host, port, sw, degradeNote); + } + catch (Exception) + { + // Everything else — TwinCAT.Ads.Server.AdsServerException ("running TwinCAT router + // instance!"), PlatformNotSupportedException, TypeInitializationException, + // DllNotFoundException, NotSupportedException, etc. — means the handshake could not be + // ATTEMPTED on this host. DEGRADE: never worse than the TCP-only probe. + sw.Stop(); + return new(true, degradeNote, sw.Elapsed); + } } - private static (string host, int port) ExtractTarget(TwinCATDriverOptions opts) + /// + /// Classifies an ADS-level failure (from an or a non-success + /// ReadStateAsync result). A route/target-port/access code means the router answered + /// but refused the route → RED. Any other code is treated as "couldn't attempt" → DEGRADE, + /// so the probe never under-reports a host with no usable ADS runtime. + /// + private static DriverProbeResult ClassifyHandshakeFailure( + AdsErrorCode code, string host, int port, Stopwatch sw, string degradeNote) + { + if (_routeRejectCodes.Contains(code)) + return new(false, + $"Reachable at {host}:{port} but ADS handshake failed: {code} — check the target's ADS route table authorizes this host", + null); + + sw.Stop(); + return new(true, degradeNote, sw.Elapsed); + } + + private static (string host, int port, TwinCATAmsAddress? parsed) ExtractTarget(TwinCATDriverOptions opts) { // Parse the first device's ads:// address. AMS Net ID is six-octet; by Beckhoff // convention the first four octets are the host IPv4. Extract those as the TCP target. var firstDevice = opts.Devices.FirstOrDefault(); - if (firstDevice is null) return (string.Empty, 0); + if (firstDevice is null) return (string.Empty, 0, null); var parsed = TwinCATAmsAddress.TryParse(firstDevice.HostAddress); - if (parsed is null) return (string.Empty, 0); + if (parsed is null) return (string.Empty, 0, null); // NetId = "a.b.c.d.e.f" — take the first 4 octets as the host IP. var parts = parsed.NetId.Split('.'); - if (parts.Length < 4) return (string.Empty, 0); + if (parts.Length < 4) return (string.Empty, 0, null); var hostIp = string.Join('.', parts[0], parts[1], parts[2], parts[3]); - return (hostIp, parsed.Port); + return (hostIp, parsed.Port, parsed); } } diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT.Tests/TwinCATDriverProbeTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT.Tests/TwinCATDriverProbeTests.cs new file mode 100644 index 00000000..8d9a8c23 --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.TwinCAT.Tests/TwinCATDriverProbeTests.cs @@ -0,0 +1,222 @@ +// Unit tests for TwinCATDriverProbe — the degrade-guarded ADS ReadState Test-Connect probe. +// +// Coverage here is the set of outcomes that are *deterministic on a CI/dev box with no TwinCAT +// runtime*: invalid JSON, missing host/port, an unreachable TCP target, and — the crux — the +// DEGRADE path. On this box there is no AMS router, so once the TCP preflight succeeds against a +// reachable listener the managed ADS client cannot initialise (it throws a "Check for a running +// TwinCAT router instance!" server exception, NOT an ADS device rejection). The probe MUST treat +// that as "TCP reachability only" (Ok=true) so it never regresses below the old TCP-only probe. +// +// The two paths that REQUIRE a live ADS target are LIVE-VERIFY DEFERRED (no TwinCAT runtime on the +// rig): (a) the happy "ADS state: Run/Config/Stop" path (Ok=true with the AdsState name), and +// (b) the route/auth-reject RED path (a reachable router that refuses the route, surfaced as an +// AdsErrorException carrying a TargetPortNotFound / route AdsErrorCode → Ok=false with the +// "check the target's ADS route table" message). Those are exercised against a real TwinCAT XAR / +// remote runtime, out of scope for this unit suite. + +using System.Net; +using System.Net.Sockets; +using Shouldly; +using Xunit; + +namespace ZB.MOM.WW.OtOpcUa.Driver.TwinCAT.Tests; + +[Trait("Category", "Unit")] +public sealed class TwinCATDriverProbeTests +{ + private static readonly TwinCATDriverProbe Probe = new(); + + // ------------------------------------------------------------------------- + // 1. Invalid JSON + // ------------------------------------------------------------------------- + + /// Invalid JSON returns Ok=false with a message containing "invalid". + [Fact] + public async Task InvalidJson_Returns_OkFalse_WithInvalidMessage() + { + var result = await Probe.ProbeAsync( + "not-valid-json{{{", + TimeSpan.FromSeconds(3), + TestContext.Current.CancellationToken); + + result.Ok.ShouldBeFalse(); + result.Message.ShouldNotBeNull(); + result.Message!.ToLowerInvariant().ShouldContain("invalid"); + result.Latency.ShouldBeNull(); + } + + // ------------------------------------------------------------------------- + // 2. Config with no device / no host + // ------------------------------------------------------------------------- + + /// An empty Devices list returns Ok=false with a "no host/port" message. + [Fact] + public async Task NoDevices_Returns_OkFalse_WithNoHostPortMessage() + { + var result = await Probe.ProbeAsync( + "{\"devices\":[]}", + TimeSpan.FromSeconds(3), + TestContext.Current.CancellationToken); + + result.Ok.ShouldBeFalse(); + result.Message.ShouldNotBeNull(); + result.Message!.ShouldContain("no host/port"); + result.Latency.ShouldBeNull(); + } + + /// + /// A first device with a malformed (non-ads://) host address returns Ok=false with a + /// "no host/port" message because returns null. + /// + [Fact] + public async Task MalformedHostAddress_Returns_OkFalse_WithNoHostPortMessage() + { + var result = await Probe.ProbeAsync( + "{\"devices\":[{\"hostAddress\":\"not-an-ads-url\"}]}", + TimeSpan.FromSeconds(3), + TestContext.Current.CancellationToken); + + result.Ok.ShouldBeFalse(); + result.Message.ShouldNotBeNull(); + result.Message!.ShouldContain("no host/port"); + result.Latency.ShouldBeNull(); + } + + // ------------------------------------------------------------------------- + // 3. Unreachable target — TCP preflight fails first + // ------------------------------------------------------------------------- + + /// + /// A closed port causes the TCP preflight ConnectAsync to throw + /// ; the probe returns Ok=false with "Connect failed". + /// The port is reserved then released so the OS refuses the connection immediately. + /// + [Fact] + public async Task ClosedPort_Returns_OkFalse_WithConnectFailedMessage() + { + // Reserve an ephemeral port then release it so the port is definitely closed. + var reserved = new TcpListener(IPAddress.Loopback, 0); + reserved.Start(); + var port = ((IPEndPoint)reserved.LocalEndpoint).Port; + reserved.Stop(); + + // 127.0.0.1 as the first four AMS-Net-ID octets → the probe's TCP target is 127.0.0.1. + var configJson = $"{{\"devices\":[{{\"hostAddress\":\"ads://127.0.0.1.1.1:{port}\"}}]}}"; + + using var cts = CancellationTokenSource.CreateLinkedTokenSource( + TestContext.Current.CancellationToken); + cts.CancelAfter(TimeSpan.FromSeconds(5)); + + var result = await Probe.ProbeAsync( + configJson, + TimeSpan.FromSeconds(3), + cts.Token); + + result.Ok.ShouldBeFalse(); + result.Message.ShouldNotBeNull(); + // Either a fast SocketException ("Connect failed") or, if the OS stalls, a timeout. + var lower = result.Message!.ToLowerInvariant(); + (lower.Contains("connect failed") || lower.Contains("timed out")).ShouldBeTrue( + $"Expected 'Connect failed' or 'timed out' but got: {result.Message}"); + result.Latency.ShouldBeNull(); + } + + /// + /// A target on a non-routable IP (192.0.2.1 — TEST-NET-1, RFC 5737) with a short + /// cancellation window times out (or fails fast on EHOSTUNREACH) at the TCP preflight. + /// + [Fact] + public async Task BlackHoleTarget_Returns_OkFalse_WithTimedOutOrConnectFailed() + { + using var cts = CancellationTokenSource.CreateLinkedTokenSource( + TestContext.Current.CancellationToken); + cts.CancelAfter(TimeSpan.FromSeconds(1)); + + var configJson = "{\"devices\":[{\"hostAddress\":\"ads://192.0.2.1.1.1:851\"}]}"; + var result = await Probe.ProbeAsync( + configJson, + TimeSpan.FromSeconds(1), + cts.Token); + + result.Ok.ShouldBeFalse(); + result.Message.ShouldNotBeNull(); + var lower = result.Message!.ToLowerInvariant(); + (lower.Contains("timed out") || lower.Contains("connect failed")).ShouldBeTrue( + $"Expected 'timed out' or 'Connect failed' but got: {result.Message}"); + result.Latency.ShouldBeNull(); + } + + // ------------------------------------------------------------------------- + // 4. DEGRADE path — TCP reachable, but the ADS handshake can't be attempted here + // ------------------------------------------------------------------------- + + /// + /// The crux. A reachable TCP listener satisfies the preflight, but on this box (no TwinCAT + /// AMS router) the managed ADS client cannot establish a session — AdsClient.Connect + /// throws an environment/router exception (NOT an ADS device rejection). The probe MUST + /// DEGRADE: Ok=true with the "TCP reachability only" note, never a false RED. This + /// guarantees the new probe never reports worse than the old TCP-only probe on a host with + /// no ADS runtime. + /// + [Fact] + public async Task ReachableTcp_NoAdsRuntime_Degrades_To_OkTrue_WithTcpOnlyNote() + { + // Stand up a real TCP listener and accept (then drop) connections so the preflight + // ConnectAsync succeeds. We never speak ADS — the ADS handshake is what must degrade. + var listener = new TcpListener(IPAddress.Loopback, 0); + listener.Start(); + var port = ((IPEndPoint)listener.LocalEndpoint).Port; + + using var acceptCts = CancellationTokenSource.CreateLinkedTokenSource( + TestContext.Current.CancellationToken); + var acceptLoop = Task.Run(async () => + { + try + { + while (true) + { + var client = await listener.AcceptTcpClientAsync(acceptCts.Token); + // Hold the socket briefly so the preflight sees a clean accept, then close. + _ = Task.Run(async () => + { + await Task.Delay(200); + client.Dispose(); + }); + } + } + catch (OperationCanceledException) { /* accept cancelled — expected on teardown */ } + catch (ObjectDisposedException) { /* listener stopped — expected on teardown */ } + catch (SocketException) { /* listener stopped — expected on teardown */ } + }); + + try + { + var configJson = $"{{\"devices\":[{{\"hostAddress\":\"ads://127.0.0.1.1.1:{port}\"}}]}}"; + + using var cts = CancellationTokenSource.CreateLinkedTokenSource( + TestContext.Current.CancellationToken); + cts.CancelAfter(TimeSpan.FromSeconds(10)); + + var result = await Probe.ProbeAsync( + configJson, + TimeSpan.FromSeconds(3), + cts.Token); + + // DEGRADE guard: must be Ok=true (never worse than TCP-only), with the explicit note + // and a latency. If on some box AdsClient instead threw synchronously at construction, + // the probe still degrades the same way — this assertion holds either way. + result.Ok.ShouldBeTrue( + $"Degrade guard violated — probe returned a worse-than-TCP result: {result.Message}"); + result.Message.ShouldNotBeNull(); + result.Message!.ShouldContain("ADS handshake unavailable on this host"); + result.Message!.ShouldContain("TCP reachability only"); + result.Latency.ShouldNotBeNull(); + } + finally + { + await acceptCts.CancelAsync(); + listener.Stop(); + try { await acceptLoop; } catch { /* ignore teardown races */ } + } + } +}