Phase 3 PR 53 -- Transport reconnect-on-drop + SO_KEEPALIVE for DL205 no-keepalive quirk. AutomationDirect H2-ECOM100 does NOT send TCP keepalives per docs/v2/dl205.md behavioral-oddities section -- any NAT/firewall device between the gateway and the PLC can silently close an idle socket after 2-5 minutes of inactivity. The PLC itself never notices and the first SendAsync after the drop would previously surface as IOException / EndOfStreamException / SocketException to the caller even though the PLC is perfectly healthy. PR 53 makes ModbusTcpTransport survive mid-session socket drops: SendAsync wraps the previous body as SendOnceAsync; on the first attempt, if the failure is a socket-layer error (IOException, SocketException, EndOfStreamException, ObjectDisposedException) AND autoReconnect is enabled (default true), the transport tears down the dead socket, calls ConnectAsync to re-establish, and resends the PDU exactly once. Deliberately single-retry -- further failures propagate so the driver health surface reflects the real state, no masking a dead PLC. Protocol-layer failures (e.g. ModbusException with exception code 02) are specifically NOT caught by the reconnect path -- they would just come back with the same exception code after the reconnect, so retrying is wasted wire time. Socket-level vs protocol-level is a discriminator inside IsSocketLevelFailure. Also enables SO_KEEPALIVE on the TcpClient with aggressive timing: TcpKeepAliveTime=30s, TcpKeepAliveInterval=10s, TcpKeepAliveRetryCount=3. Total time-to-detect-dead-socket = 30 + 10*3 = 60s, vs the Windows default 2-hour idle + 9 retries = 2h40min. Best-effort: older OSes that don't expose the fine-grained keepalive knobs silently skip them (catch {}). 
New ModbusDriverOptions.AutoReconnect bool (default true) threads through to the default transport factory in ModbusDriver -- callers wanting the old 'fail loud on drop' behavior can set AutoReconnect=false, or use a custom transportFactory that ignores the option. Unit tests: ModbusTcpReconnectTests boots a FlakeyModbusServer in-process (real TcpListener on loopback) that serves one valid FC03 response then forcibly shuts down the socket. Transport_recovers_from_mid_session_drop_and_retries_successfully issues two consecutive SendAsync calls and asserts both return valid PDUs -- the second must trigger the reconnect path transparently. Transport_without_AutoReconnect_propagates_drop_to_caller asserts the legacy behavior when the opt-out is taken. Validates real socket semantics rather than mocked exceptions. 142/142 Modbus.Tests pass (113 prior + 2 mapper + 2 reconnect + 25 accumulated across PRs 45-52); 11/11 DL205 integration tests still pass with MODBUS_SIM_PROFILE=dl205 -- no regression from the transport change.

This commit is contained in:
Joseph Doherty
2026-04-18 22:32:13 -04:00
parent cde018aec1
commit 793c787315
4 changed files with 271 additions and 37 deletions

View File

@@ -8,22 +8,40 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus;
/// support concurrent transactions, but the single-flight model keeps the wire trace
/// easy to diagnose and avoids interleaved-response correlation bugs.
/// </summary>
/// <remarks>
/// <para>
/// Survives mid-transaction socket drops: when auto-reconnect is enabled (the default)
/// and a send/read fails with a socket-level error (<see cref="IOException"/>,
/// <see cref="SocketException"/>, <see cref="EndOfStreamException"/>,
/// <see cref="ObjectDisposedException"/>), the transport disposes the dead socket,
/// reconnects, and retries the PDU exactly once. Deliberately limited to a single
/// retry — further failures bubble up so the driver's health surface reflects the
/// real state instead of masking a dead PLC.
/// </para>
/// <para>
/// Why this matters for DL205/DL260: the AutomationDirect H2-ECOM100 does NOT send
/// TCP keepalives per <c>docs/v2/dl205.md</c> §behavioral-oddities, so any NAT/firewall
/// between the gateway and PLC can silently close an idle socket after 2-5 minutes.
/// Also enables OS-level <c>SO_KEEPALIVE</c> so the driver's own side detects a stuck
/// socket in reasonable time even when the application is mostly idle.
/// </para>
/// </remarks>
public sealed class ModbusTcpTransport : IModbusTransport
{
private readonly string _host;
private readonly int _port;
private readonly TimeSpan _timeout;
private readonly bool _autoReconnect;
private readonly SemaphoreSlim _gate = new(1, 1);
private TcpClient? _client;
private NetworkStream? _stream;
private ushort _nextTx;
private bool _disposed;
public ModbusTcpTransport(string host, int port, TimeSpan timeout)
/// <summary>
/// Captures the connection settings. No socket is opened here.
/// </summary>
/// <param name="host">Host the transport is configured to reach.</param>
/// <param name="port">TCP port passed to the connect call.</param>
/// <param name="timeout">Upper bound applied to the connect attempt and to each request/response exchange.</param>
/// <param name="autoReconnect">
/// When <c>true</c> (the default), a socket-level failure during a send triggers one
/// reconnect-and-resend; when <c>false</c>, the failure goes straight to the caller.
/// </param>
public ModbusTcpTransport(string host, int port, TimeSpan timeout, bool autoReconnect = true)
{
    (_host, _port, _timeout, _autoReconnect) = (host, port, timeout, autoReconnect);
}
public async Task ConnectAsync(CancellationToken ct)
@@ -39,12 +57,34 @@ public sealed class ModbusTcpTransport : IModbusTransport
var target = ipv4 ?? (addresses.Length > 0 ? addresses[0] : System.Net.IPAddress.Loopback);
_client = new TcpClient(target.AddressFamily);
EnableKeepAlive(_client);
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
cts.CancelAfter(_timeout);
await _client.ConnectAsync(target, _port, cts.Token).ConfigureAwait(false);
_stream = _client.GetStream();
}
/// <summary>
/// Turns on <c>SO_KEEPALIVE</c> with aggressive timing: the first probe goes out after
/// 30s of idle, further probes follow 10s apart, and the connection is declared dead
/// after 3 unanswered probes (about 60s in total). DL205/DL260 doesn't send keepalives
/// itself, so this lets the driver notice a dead PLC or broken NAT path long before the
/// default 2-hour Windows idle timeout fires.
/// </summary>
/// <remarks>
/// Best-effort and non-fatal. Every option is applied on its own, so a platform that
/// rejects one of the fine-grained timing levers (some older OSes / container sandboxes)
/// still gets the remaining ones; the driver keeps working either way.
/// </remarks>
/// <param name="client">Client whose underlying socket receives the keepalive options.</param>
private static void EnableKeepAlive(TcpClient client)
{
    const int IdleSecondsBeforeFirstProbe = 30;
    const int SecondsBetweenProbes = 10;
    const int UnansweredProbesBeforeDrop = 3;

    TryApply(() => client.Client.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.KeepAlive, true));
    TryApply(() => client.Client.SetSocketOption(SocketOptionLevel.Tcp, SocketOptionName.TcpKeepAliveTime, IdleSecondsBeforeFirstProbe));
    TryApply(() => client.Client.SetSocketOption(SocketOptionLevel.Tcp, SocketOptionName.TcpKeepAliveInterval, SecondsBetweenProbes));
    TryApply(() => client.Client.SetSocketOption(SocketOptionLevel.Tcp, SocketOptionName.TcpKeepAliveRetryCount, UnansweredProbesBeforeDrop));

    // Isolates each option so one unsupported knob cannot prevent the others from being set.
    static void TryApply(Action apply)
    {
        try { apply(); }
        catch { /* best-effort; older OSes may not expose the granular knobs */ }
    }
}
public async Task<byte[]> SendAsync(byte unitId, byte[] pdu, CancellationToken ct)
{
if (_disposed) throw new ObjectDisposedException(nameof(ModbusTcpTransport));
@@ -53,43 +93,18 @@ public sealed class ModbusTcpTransport : IModbusTransport
await _gate.WaitAsync(ct).ConfigureAwait(false);
try
{
var txId = ++_nextTx;
// MBAP: [TxId(2)][Proto=0(2)][Length(2)][UnitId(1)] + PDU
var adu = new byte[7 + pdu.Length];
adu[0] = (byte)(txId >> 8);
adu[1] = (byte)(txId & 0xFF);
// protocol id already zero
var len = (ushort)(1 + pdu.Length); // unit id + pdu
adu[4] = (byte)(len >> 8);
adu[5] = (byte)(len & 0xFF);
adu[6] = unitId;
Buffer.BlockCopy(pdu, 0, adu, 7, pdu.Length);
using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
cts.CancelAfter(_timeout);
await _stream.WriteAsync(adu.AsMemory(), cts.Token).ConfigureAwait(false);
await _stream.FlushAsync(cts.Token).ConfigureAwait(false);
var header = new byte[7];
await ReadExactlyAsync(_stream, header, cts.Token).ConfigureAwait(false);
var respTxId = (ushort)((header[0] << 8) | header[1]);
if (respTxId != txId)
throw new InvalidDataException($"Modbus TxId mismatch: expected {txId} got {respTxId}");
var respLen = (ushort)((header[4] << 8) | header[5]);
if (respLen < 1) throw new InvalidDataException($"Modbus response length too small: {respLen}");
var respPdu = new byte[respLen - 1];
await ReadExactlyAsync(_stream, respPdu, cts.Token).ConfigureAwait(false);
// Exception PDU: function code has high bit set.
if ((respPdu[0] & 0x80) != 0)
try
{
var fc = (byte)(respPdu[0] & 0x7F);
var ex = respPdu[1];
throw new ModbusException(fc, ex, $"Modbus exception fc={fc} code={ex}");
return await SendOnceAsync(unitId, pdu, ct).ConfigureAwait(false);
}
catch (Exception ex) when (_autoReconnect && IsSocketLevelFailure(ex))
{
// Mid-transaction drop: tear down the dead socket, reconnect, resend. Single
// retry — if it fails again, let it propagate so health/status reflect reality.
await TearDownAsync().ConfigureAwait(false);
await ConnectAsync(ct).ConfigureAwait(false);
return await SendOnceAsync(unitId, pdu, ct).ConfigureAwait(false);
}
return respPdu;
}
finally
{
@@ -97,6 +112,68 @@ public sealed class ModbusTcpTransport : IModbusTransport
}
}
/// <summary>
/// Performs a single Modbus/TCP request/response exchange on the currently open stream:
/// prefixes <paramref name="pdu"/> with an MBAP header, writes it, then reads and
/// validates the reply. Makes no reconnect attempt of its own.
/// </summary>
/// <param name="unitId">Unit identifier written into byte 6 of the MBAP header.</param>
/// <param name="pdu">Request PDU to send after the header.</param>
/// <param name="ct">Caller cancellation; linked with the transport timeout for the whole exchange.</param>
/// <returns>The response PDU with the MBAP header stripped (first byte is the function code).</returns>
/// <exception cref="InvalidOperationException">No stream is open.</exception>
/// <exception cref="InvalidDataException">
/// The response transaction id does not match the request, the declared length cannot
/// hold a function code, or an exception response lacks its exception-code byte.
/// </exception>
/// <exception cref="ModbusException">The device answered with an exception PDU.</exception>
private async Task<byte[]> SendOnceAsync(byte unitId, byte[] pdu, CancellationToken ct)
{
    var stream = _stream;
    if (stream is null) throw new InvalidOperationException("Transport not connected");

    var txId = ++_nextTx;

    // MBAP: [TxId(2)][Proto=0(2)][Length(2)][UnitId(1)] + PDU
    var adu = new byte[7 + pdu.Length];
    adu[0] = (byte)(txId >> 8);
    adu[1] = (byte)(txId & 0xFF);
    // protocol id already zero
    var len = (ushort)(1 + pdu.Length); // unit id + pdu
    adu[4] = (byte)(len >> 8);
    adu[5] = (byte)(len & 0xFF);
    adu[6] = unitId;
    Buffer.BlockCopy(pdu, 0, adu, 7, pdu.Length);

    // One deadline covers the write and both reads of this exchange.
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    cts.CancelAfter(_timeout);

    await stream.WriteAsync(adu.AsMemory(), cts.Token).ConfigureAwait(false);
    await stream.FlushAsync(cts.Token).ConfigureAwait(false);

    var header = new byte[7];
    await ReadExactlyAsync(stream, header, cts.Token).ConfigureAwait(false);

    var respTxId = (ushort)((header[0] << 8) | header[1]);
    if (respTxId != txId)
        throw new InvalidDataException($"Modbus TxId mismatch: expected {txId} got {respTxId}");

    // Length counts the unit id (already consumed with the header) plus the PDU, and the
    // PDU must carry at least a function code. A length of 1 would yield an empty PDU and
    // an IndexOutOfRangeException below, so reject anything under 2 as a malformed frame.
    var respLen = (ushort)((header[4] << 8) | header[5]);
    if (respLen < 2) throw new InvalidDataException($"Modbus response length too small: {respLen}");

    var respPdu = new byte[respLen - 1];
    await ReadExactlyAsync(stream, respPdu, cts.Token).ConfigureAwait(false);

    // Exception PDU: function code has high bit set.
    if ((respPdu[0] & 0x80) != 0)
    {
        if (respPdu.Length < 2)
            throw new InvalidDataException("Modbus exception response is missing its exception code");

        var fc = (byte)(respPdu[0] & 0x7F);
        var exceptionCode = respPdu[1];
        throw new ModbusException(fc, exceptionCode, $"Modbus exception fc={fc} code={exceptionCode}");
    }

    return respPdu;
}
/// <summary>
/// Decides whether a failure came from the socket layer, which makes it eligible for
/// reconnect-and-retry. Anything else, such as a protocol-level error, returns
/// <c>false</c> and must propagate: resending the same PDU would not change the answer
/// (e.g. exception 02 Illegal Data Address).
/// </summary>
/// <param name="ex">The exception raised by the send attempt.</param>
/// <returns><c>true</c> for I/O, socket, and disposed-object failures; otherwise <c>false</c>.</returns>
private static bool IsSocketLevelFailure(Exception ex)
{
    // EndOfStreamException derives from IOException, so the IOException arm covers it.
    return ex is IOException or SocketException or ObjectDisposedException;
}
/// <summary>
/// Releases the current stream and client and clears both fields. Disposal errors are
/// swallowed: the socket is being discarded anyway.
/// </summary>
private async Task TearDownAsync()
{
    if (_stream is { } deadStream)
    {
        try
        {
            await deadStream.DisposeAsync().ConfigureAwait(false);
        }
        catch
        {
            // best-effort
        }
    }
    _stream = null;

    try
    {
        _client?.Dispose();
    }
    catch
    {
        // best-effort
    }
    _client = null;
}
private static async Task ReadExactlyAsync(Stream s, byte[] buf, CancellationToken ct)
{
var read = 0;