feat(probe): TwinCAT Test-Connect does an ADS ReadState (degrade-guarded)

This commit is contained in:
Joseph Doherty
2026-06-16 06:48:22 -04:00
parent 5ed0276ffb
commit b663ae6eff
2 changed files with 362 additions and 18 deletions
@@ -2,22 +2,65 @@ using System.Diagnostics;
using System.Net.Sockets;
using System.Text.Json;
using System.Text.Json.Serialization;
using TwinCAT.Ads;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.TwinCAT;
/// <summary>
/// Cheap TCP-connect probe for the <see cref="TwinCATDriverOptions"/>-shaped driver config.
/// Opens a socket to the first device's AMS router host (first four octets of the AMS Net ID)
/// on the AMS port from the address and closes immediately. Surfaces a green tick + latency
/// on success; red chip + SocketError on failure; "timed out" on the caller's cancellation.
/// Does NOT exchange any ADS bytes — a richer ADS-state probe is a documented follow-up.
/// Two-phase, degrade-guarded Test-Connect probe for the <see cref="TwinCATDriverOptions"/>-shaped
/// driver config. Phase 1: a bare TCP connect to the first device's AMS router host (first four
/// octets of the AMS Net ID) on the AMS port — fast rejection of unreachable targets. Phase 2:
/// a real ADS handshake — <c>AdsClient.Connect(netId, port)</c> + <c>ReadStateAsync</c> — to
/// confirm the endpoint speaks ADS and report the controller's run-state, not just that a TCP
/// socket opened.
/// <para>
/// <b>Outcome classification</b> — three cases:
/// <list type="number">
/// <item>
/// <b>ADS connected + ReadState OK</b> → <c>Ok=true</c>, message <c>"ADS state: {AdsState}"</c>
/// (e.g. "Run" / "Config" / "Stop"), with latency.
/// </item>
/// <item>
/// <b>Route/auth rejection from a reachable router</b> — an <see cref="AdsErrorException"/>
/// (or a non-success <c>ReadStateAsync</c> result) whose <see cref="AdsErrorCode"/> means the
/// router answered but won't let us in (e.g. <see cref="AdsErrorCode.TargetPortNotFound"/>,
/// <see cref="AdsErrorCode.TargetMachineNotFound"/>, <see cref="AdsErrorCode.PortNotConnected"/>,
/// <see cref="AdsErrorCode.PortDisabled"/>) → <c>Ok=false</c>, message
/// <c>"Reachable at {host}:{port} but ADS handshake failed: {code} — check the target's ADS
/// route table authorizes this host"</c>. This is a TRUE red: the driver itself also needs
/// the route, so a green tick here would be a false positive.
/// </item>
/// <item>
/// <b>Handshake could not be ATTEMPTED on this host</b> — the managed AMS router cannot run
/// headless (Beckhoff's <c>AdsClient.Connect</c> throws a server exception
/// "Check for a running TwinCAT router instance!"), or a <see cref="PlatformNotSupportedException"/>
/// / <see cref="TypeInitializationException"/> / <see cref="DllNotFoundException"/> /
/// <see cref="NotSupportedException"/> surfaces, or <c>ReadStateAsync</c> reports a client-side
/// port-not-open status → <b>DEGRADE</b>: <c>Ok=true</c>, message
/// <c>"Reachable at {host}:{port} (ADS handshake unavailable on this host — TCP reachability
/// only)"</c>, with latency. The probe NEVER produces a result worse than the old TCP-only
/// probe.
/// </item>
/// </list>
/// </para>
/// </summary>
/// <remarks>
/// AMS Net ID format is six dot-separated octets (e.g. <c>192.168.1.10.1.1</c>); the first
/// four are typically the host IPv4 address by Beckhoff convention, but the AMS router
/// resolves the real IP route server-side. The probe uses the first-four-octet heuristic
/// The line between case 2 (device-rejected → RED) and case 3 (can't-attempt → DEGRADE) is the
/// crux. Classification rests on the exception's identity: only an <see cref="AdsErrorException"/>
/// (or a result <see cref="AdsErrorCode"/>) carrying a route/target-port code is a RED — that is the
/// ADS router answering and refusing the route. Beckhoff's <c>TwinCAT.Ads.Server.AdsServerException</c>
/// ("running TwinCAT router instance!") derives from plain <see cref="Exception"/>, NOT from
/// <see cref="AdsErrorException"/>, so it is correctly classified as "can't attempt → DEGRADE".
/// When genuinely ambiguous, the probe DEGRADES (Ok=true, TCP-only note) rather than emit a false RED.
/// <para>
/// AMS Net ID format is six dot-separated octets (e.g. <c>192.168.1.10.1.1</c>); the first four
/// are typically the host IPv4 address by Beckhoff convention. The AMS router resolves the real IP
/// route server-side; the probe uses the first-four-octet heuristic for the TCP preflight target,
/// which is reliable for the overwhelming majority of production deployments.
/// </para>
/// The probe is read-only — <c>ReadStateAsync</c> never mutates PLC state — and always disposes the
/// <see cref="AdsClient"/>.
/// </remarks>
public sealed class TwinCATDriverProbe : IDriverProbe
{
@@ -27,6 +70,22 @@ public sealed class TwinCATDriverProbe : IDriverProbe
UnmappedMemberHandling = JsonUnmappedMemberHandling.Skip,
};
/// <summary>
/// AMS error codes that mean "the router answered but refused the route / target port" — a
/// genuine RED. The driver itself would also be denied, so a green tick would be a false
/// positive. Everything NOT in this set (client-side port/connection errors, sync timeouts,
/// router-not-initialised, etc.) is treated as "couldn't attempt the handshake" → DEGRADE.
/// </summary>
private static readonly HashSet<AdsErrorCode> _routeRejectCodes =
[
AdsErrorCode.TargetPortNotFound,
AdsErrorCode.TargetMachineNotFound,
AdsErrorCode.PortNotConnected,
AdsErrorCode.PortDisabled,
AdsErrorCode.AccessDenied,
AdsErrorCode.DeviceAccessDenied,
];
/// <inheritdoc />
public string DriverType => "TwinCAT";
@@ -38,23 +97,23 @@ public sealed class TwinCATDriverProbe : IDriverProbe
catch (Exception ex) { return new(false, $"Config JSON is invalid: {ex.Message}", null); }
if (opts is null) return new(false, "Config JSON deserialized to null.", null);
var (host, port) = ExtractTarget(opts);
if (string.IsNullOrWhiteSpace(host) || port <= 0)
var (host, port, parsed) = ExtractTarget(opts);
if (parsed is null || string.IsNullOrWhiteSpace(host) || port <= 0)
return new(false, "Config has no host/port to probe.", null);
// Phase 1: bare TCP preflight — fast rejection for unreachable hosts. Messages here are
// UNCHANGED from the original TCP-only probe.
var sw = Stopwatch.StartNew();
try
{
using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
await socket.ConnectAsync(host, port, ct);
sw.Stop();
return new(true, null, sw.Elapsed);
}
catch (SocketException ex)
{
return new(false, $"Connect failed: {ex.SocketErrorCode}", null);
}
catch (OperationCanceledException)
catch (OperationCanceledException) when (ct.IsCancellationRequested)
{
return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
}
@@ -62,23 +121,86 @@ public sealed class TwinCATDriverProbe : IDriverProbe
{
return new(false, ex.Message, null);
}
// Phase 2: real ADS handshake. Connect + ReadStateAsync. The crux is the three-way
// classification of how this can fail — see the class-doc and ClassifyHandshakeFailure.
var degradeNote = $"Reachable at {host}:{port} (ADS handshake unavailable on this host — TCP reachability only)";
try
{
using var client = new AdsClient();
// Bound the ADS round-trip by the caller's timeout (clamped >=1s, mirrors AdsTwinCATClient).
client.Timeout = (int)Math.Max(1_000, timeout.TotalMilliseconds);
// Connect can throw a server exception ("running TwinCAT router instance!") on a headless
// host with no AMS router — that is a can't-attempt DEGRADE, classified below.
client.Connect(AmsNetId.Parse(parsed.NetId), parsed.Port);
var state = await client.ReadStateAsync(ct).ConfigureAwait(false);
sw.Stop();
if (state.Succeeded)
return new(true, $"ADS state: {state.State.AdsState}", sw.Elapsed);
// Non-throwing failure carried in the result's error code.
return state.ErrorCode == AdsErrorCode.ClientPortNotOpen
? new(true, degradeNote, sw.Elapsed) // client never opened — DEGRADE
: ClassifyHandshakeFailure(state.ErrorCode, host, port, sw, degradeNote);
}
catch (OperationCanceledException) when (ct.IsCancellationRequested)
{
// Caller timeout — keep the original timed-out message.
return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
}
catch (AdsErrorException ex)
{
// The router answered with an ADS-level error. Route/auth rejection → RED; anything
// else (sync timeout, client port issues, …) → DEGRADE.
return ClassifyHandshakeFailure(ex.ErrorCode, host, port, sw, degradeNote);
}
catch (Exception)
{
// Everything else — TwinCAT.Ads.Server.AdsServerException ("running TwinCAT router
// instance!"), PlatformNotSupportedException, TypeInitializationException,
// DllNotFoundException, NotSupportedException, etc. — means the handshake could not be
// ATTEMPTED on this host. DEGRADE: never worse than the TCP-only probe.
sw.Stop();
return new(true, degradeNote, sw.Elapsed);
}
}
private static (string host, int port) ExtractTarget(TwinCATDriverOptions opts)
/// <summary>
/// Classifies an ADS-level failure (from an <see cref="AdsErrorException"/> or a non-success
/// <c>ReadStateAsync</c> result). A route/target-port/access code means the router answered
/// but refused the route → RED. Any other code is treated as "couldn't attempt" → DEGRADE,
/// so the probe never under-reports a host with no usable ADS runtime.
/// </summary>
private static DriverProbeResult ClassifyHandshakeFailure(
AdsErrorCode code, string host, int port, Stopwatch sw, string degradeNote)
{
if (_routeRejectCodes.Contains(code))
return new(false,
$"Reachable at {host}:{port} but ADS handshake failed: {code} — check the target's ADS route table authorizes this host",
null);
sw.Stop();
return new(true, degradeNote, sw.Elapsed);
}
private static (string host, int port, TwinCATAmsAddress? parsed) ExtractTarget(TwinCATDriverOptions opts)
{
// Parse the first device's ads:// address. AMS Net ID is six-octet; by Beckhoff
// convention the first four octets are the host IPv4. Extract those as the TCP target.
var firstDevice = opts.Devices.FirstOrDefault();
if (firstDevice is null) return (string.Empty, 0);
if (firstDevice is null) return (string.Empty, 0, null);
var parsed = TwinCATAmsAddress.TryParse(firstDevice.HostAddress);
if (parsed is null) return (string.Empty, 0);
if (parsed is null) return (string.Empty, 0, null);
// NetId = "a.b.c.d.e.f" — take the first 4 octets as the host IP.
var parts = parsed.NetId.Split('.');
if (parts.Length < 4) return (string.Empty, 0);
if (parts.Length < 4) return (string.Empty, 0, null);
var hostIp = string.Join('.', parts[0], parts[1], parts[2], parts[3]);
return (hostIp, parsed.Port);
return (hostIp, parsed.Port, parsed);
}
}
@@ -0,0 +1,222 @@
// Unit tests for TwinCATDriverProbe — the degrade-guarded ADS ReadState Test-Connect probe.
//
// Coverage here is the set of outcomes that are *deterministic on a CI/dev box with no TwinCAT
// runtime*: invalid JSON, missing host/port, an unreachable TCP target, and — the crux — the
// DEGRADE path. On this box there is no AMS router, so once the TCP preflight succeeds against a
// reachable listener the managed ADS client cannot initialise (it throws a "Check for a running
// TwinCAT router instance!" server exception, NOT an ADS device rejection). The probe MUST treat
// that as "TCP reachability only" (Ok=true) so it never regresses below the old TCP-only probe.
//
// The two paths that REQUIRE a live ADS target are LIVE-VERIFY DEFERRED (no TwinCAT runtime on the
// rig): (a) the happy "ADS state: Run/Config/Stop" path (Ok=true with the AdsState name), and
// (b) the route/auth-reject RED path (a reachable router that refuses the route, surfaced as an
// AdsErrorException carrying a TargetPortNotFound / route AdsErrorCode → Ok=false with the
// "check the target's ADS route table" message). Those are exercised against a real TwinCAT XAR /
// remote runtime, out of scope for this unit suite.
using System.Net;
using System.Net.Sockets;
using Shouldly;
using Xunit;
namespace ZB.MOM.WW.OtOpcUa.Driver.TwinCAT.Tests;
[Trait("Category", "Unit")]
public sealed class TwinCATDriverProbeTests
{
private static readonly TwinCATDriverProbe Probe = new();
// -------------------------------------------------------------------------
// 1. Invalid JSON
// -------------------------------------------------------------------------
/// <summary>Invalid JSON returns Ok=false with a message containing "invalid".</summary>
[Fact]
public async Task InvalidJson_Returns_OkFalse_WithInvalidMessage()
{
var result = await Probe.ProbeAsync(
"not-valid-json{{{",
TimeSpan.FromSeconds(3),
TestContext.Current.CancellationToken);
result.Ok.ShouldBeFalse();
result.Message.ShouldNotBeNull();
result.Message!.ToLowerInvariant().ShouldContain("invalid");
result.Latency.ShouldBeNull();
}
// -------------------------------------------------------------------------
// 2. Config with no device / no host
// -------------------------------------------------------------------------
/// <summary>An empty Devices list returns Ok=false with a "no host/port" message.</summary>
[Fact]
public async Task NoDevices_Returns_OkFalse_WithNoHostPortMessage()
{
var result = await Probe.ProbeAsync(
"{\"devices\":[]}",
TimeSpan.FromSeconds(3),
TestContext.Current.CancellationToken);
result.Ok.ShouldBeFalse();
result.Message.ShouldNotBeNull();
result.Message!.ShouldContain("no host/port");
result.Latency.ShouldBeNull();
}
/// <summary>
/// A first device with a malformed (non-<c>ads://</c>) host address returns Ok=false with a
/// "no host/port" message because <see cref="TwinCATAmsAddress.TryParse"/> returns null.
/// </summary>
[Fact]
public async Task MalformedHostAddress_Returns_OkFalse_WithNoHostPortMessage()
{
var result = await Probe.ProbeAsync(
"{\"devices\":[{\"hostAddress\":\"not-an-ads-url\"}]}",
TimeSpan.FromSeconds(3),
TestContext.Current.CancellationToken);
result.Ok.ShouldBeFalse();
result.Message.ShouldNotBeNull();
result.Message!.ShouldContain("no host/port");
result.Latency.ShouldBeNull();
}
// -------------------------------------------------------------------------
// 3. Unreachable target — TCP preflight fails first
// -------------------------------------------------------------------------
/// <summary>
/// A closed port causes the TCP preflight <c>ConnectAsync</c> to throw
/// <see cref="SocketException"/>; the probe returns Ok=false with "Connect failed".
/// The port is reserved then released so the OS refuses the connection immediately.
/// </summary>
[Fact]
public async Task ClosedPort_Returns_OkFalse_WithConnectFailedMessage()
{
// Reserve an ephemeral port then release it so the port is definitely closed.
var reserved = new TcpListener(IPAddress.Loopback, 0);
reserved.Start();
var port = ((IPEndPoint)reserved.LocalEndpoint).Port;
reserved.Stop();
// 127.0.0.1 as the first four AMS-Net-ID octets → the probe's TCP target is 127.0.0.1.
var configJson = $"{{\"devices\":[{{\"hostAddress\":\"ads://127.0.0.1.1.1:{port}\"}}]}}";
using var cts = CancellationTokenSource.CreateLinkedTokenSource(
TestContext.Current.CancellationToken);
cts.CancelAfter(TimeSpan.FromSeconds(5));
var result = await Probe.ProbeAsync(
configJson,
TimeSpan.FromSeconds(3),
cts.Token);
result.Ok.ShouldBeFalse();
result.Message.ShouldNotBeNull();
// Either a fast SocketException ("Connect failed") or, if the OS stalls, a timeout.
var lower = result.Message!.ToLowerInvariant();
(lower.Contains("connect failed") || lower.Contains("timed out")).ShouldBeTrue(
$"Expected 'Connect failed' or 'timed out' but got: {result.Message}");
result.Latency.ShouldBeNull();
}
/// <summary>
/// A target on a non-routable IP (192.0.2.1 — TEST-NET-1, RFC 5737) with a short
/// cancellation window times out (or fails fast on EHOSTUNREACH) at the TCP preflight.
/// </summary>
[Fact]
public async Task BlackHoleTarget_Returns_OkFalse_WithTimedOutOrConnectFailed()
{
using var cts = CancellationTokenSource.CreateLinkedTokenSource(
TestContext.Current.CancellationToken);
cts.CancelAfter(TimeSpan.FromSeconds(1));
var configJson = "{\"devices\":[{\"hostAddress\":\"ads://192.0.2.1.1.1:851\"}]}";
var result = await Probe.ProbeAsync(
configJson,
TimeSpan.FromSeconds(1),
cts.Token);
result.Ok.ShouldBeFalse();
result.Message.ShouldNotBeNull();
var lower = result.Message!.ToLowerInvariant();
(lower.Contains("timed out") || lower.Contains("connect failed")).ShouldBeTrue(
$"Expected 'timed out' or 'Connect failed' but got: {result.Message}");
result.Latency.ShouldBeNull();
}
// -------------------------------------------------------------------------
// 4. DEGRADE path — TCP reachable, but the ADS handshake can't be attempted here
// -------------------------------------------------------------------------
/// <summary>
/// The crux. A reachable TCP listener satisfies the preflight, but on this box (no TwinCAT
/// AMS router) the managed ADS client cannot establish a session — <c>AdsClient.Connect</c>
/// throws an environment/router exception (NOT an ADS device rejection). The probe MUST
/// DEGRADE: <c>Ok=true</c> with the "TCP reachability only" note, never a false RED. This
/// guarantees the new probe never reports worse than the old TCP-only probe on a host with
/// no ADS runtime.
/// </summary>
[Fact]
public async Task ReachableTcp_NoAdsRuntime_Degrades_To_OkTrue_WithTcpOnlyNote()
{
// Stand up a real TCP listener and accept (then drop) connections so the preflight
// ConnectAsync succeeds. We never speak ADS — the ADS handshake is what must degrade.
var listener = new TcpListener(IPAddress.Loopback, 0);
listener.Start();
var port = ((IPEndPoint)listener.LocalEndpoint).Port;
using var acceptCts = CancellationTokenSource.CreateLinkedTokenSource(
TestContext.Current.CancellationToken);
var acceptLoop = Task.Run(async () =>
{
try
{
while (true)
{
var client = await listener.AcceptTcpClientAsync(acceptCts.Token);
// Hold the socket briefly so the preflight sees a clean accept, then close.
_ = Task.Run(async () =>
{
await Task.Delay(200);
client.Dispose();
});
}
}
catch (OperationCanceledException) { /* accept cancelled — expected on teardown */ }
catch (ObjectDisposedException) { /* listener stopped — expected on teardown */ }
catch (SocketException) { /* listener stopped — expected on teardown */ }
});
try
{
var configJson = $"{{\"devices\":[{{\"hostAddress\":\"ads://127.0.0.1.1.1:{port}\"}}]}}";
using var cts = CancellationTokenSource.CreateLinkedTokenSource(
TestContext.Current.CancellationToken);
cts.CancelAfter(TimeSpan.FromSeconds(10));
var result = await Probe.ProbeAsync(
configJson,
TimeSpan.FromSeconds(3),
cts.Token);
// DEGRADE guard: must be Ok=true (never worse than TCP-only), with the explicit note
// and a latency. If on some box AdsClient instead threw synchronously at construction,
// the probe still degrades the same way — this assertion holds either way.
result.Ok.ShouldBeTrue(
$"Degrade guard violated — probe returned a worse-than-TCP result: {result.Message}");
result.Message.ShouldNotBeNull();
result.Message!.ShouldContain("ADS handshake unavailable on this host");
result.Message!.ShouldContain("TCP reachability only");
result.Latency.ShouldNotBeNull();
}
finally
{
await acceptCts.CancelAsync();
listener.Stop();
try { await acceptLoop; } catch { /* ignore teardown races */ }
}
}
}