From 3b0e093002f1864bd7df8cfb7b2533ccc7dbf58d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 25 Apr 2026 01:01:42 -0400 Subject: [PATCH] =?UTF-8?q?Task=20#148=20=E2=80=94=20Modbus=20block-coales?= =?UTF-8?q?cing:=20auto-recover=20from=20protected=20register=20holes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-#148 behaviour: a coalesced FC03/FC04 read that crossed a write-only or PLC-fault register marked every member tag Bad until the operator manually flagged the offending tag with CoalesceProhibited. Healthy tags around the hole stayed broken indefinitely. Post-#148: two-stage recovery, no operator intervention needed. 1. Same-scan fallback: when a coalesced read fails with a Modbus exception (IllegalDataAddress, SlaveDeviceFailure, etc.), the planner does NOT mark members handled. The per-tag fallback in the same scan reads each member individually — non-protected members surface Good values immediately, and only the actual protected register stays Bad. 2. Cross-scan prohibition: the failed range (Unit, Region, Start, End) is recorded in a per-driver `_autoProhibited` set. On subsequent scans the planner checks each candidate merge against the set and refuses to re-form any block that overlaps a known-bad range. Net effect: after one scan with a failure, the protected range goes "per-tag mode" indefinitely while ranges around it keep coalescing normally. Communication failures (timeouts, socket drops) are NOT auto-prohibited — they're transport-level, not structural. The same coalesced read can succeed once the transport recovers; recording it as "permanently bad" would defeat coalescing for the whole driver instance. Auto-prohibition state lives for the driver lifetime and clears on ReinitializeAsync (operator restart). A periodic re-probe is a follow-up if deployments need it without a restart. Implementation: - Added `_autoProhibited` HashSet<(byte, ModbusRegion, ushort, ushort)> + `_autoProhibitedLock` on ModbusDriver. - `RangeIsAutoProhibited(unit, region, start, end)` overlap check called from the planner when forming blocks. - `RecordAutoProhibition(...)` called from the catch (ModbusException) branch. - The catch (Exception) branch (non-Modbus failures) keeps the pre-#148 "mark all Bad in this scan, don't auto-prohibit" behaviour. - Internal `AutoProhibitedRangeCount` accessor for tests. Tests (3 new ModbusCoalescingAutoRecoveryTests): - First_Failure_Falls_Back_To_PerTag_Same_Scan — three tags around a protected register at 102: T100 + T104 surface Good values via the per-tag fallback in the SAME scan; T102 surfaces the exception. - Second_Scan_Skips_Coalesced_Read_Of_Prohibited_Range — confirms scan 2 doesn't re-attempt the failed merge (no FC03 with quantity > 1 at the prohibited start). - Tags_Outside_Prohibited_Range_Still_Coalesce — separate cluster at HR 200..202 keeps coalescing normally even after the 100..104 cluster is prohibited. 234/234 unit tests green. Follow-ups intentionally NOT shipped (smaller, independent changes): - Bisection-style range narrowing — currently the prohibition range is the full failed block; the planner doesn't try to find the exact protected register. Operator-visible diagnostic + prohibition stays correct. - Periodic re-probe to clear stale prohibitions. - Surface auto-prohibited ranges through GetHostStatuses or a new diagnostic so the Admin UI can show what's been auto-isolated. --- .../ModbusDriver.cs | 60 +++++++- .../ModbusCoalescingAutoRecoveryTests.cs | 136 ++++++++++++++++++ 2 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests/ModbusCoalescingAutoRecoveryTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/ModbusDriver.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/ModbusDriver.cs index 7da8eac..c33d911 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/ModbusDriver.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/ModbusDriver.cs @@ -386,6 +386,43 @@ public sealed class ModbusDriver /// Resolve the UnitId for a tag — per-tag override (#142) or driver-level fallback. private byte ResolveUnitId(ModbusTagDefinition tag) => tag.UnitId ?? _options.UnitId; + /// + /// #148 — runtime-discovered ranges where coalesced reads have failed (typically because + /// the PLC has a write-only or protected register mid-block). Subsequent scans skip + /// coalescing across these ranges and let the per-tag fallback handle the members. + /// Cleared by ReinitializeAsync (operator restart) or by an explicit re-probe API + /// (not yet shipped). + /// + private readonly HashSet<(byte Unit, ModbusRegion Region, ushort Start, ushort End)> _autoProhibited = new(); + private readonly object _autoProhibitedLock = new(); + + private bool RangeIsAutoProhibited(byte unit, ModbusRegion region, ushort start, ushort end) + { + lock (_autoProhibitedLock) + { + foreach (var p in _autoProhibited) + { + // A candidate (start..end) range is prohibited if it overlaps any recorded + // failure. Overlap rule: max-start ≤ min-end. We don't try to be smart about + // partial overlap — once a range fails, any superset of it is also untrusted. + if (p.Unit != unit || p.Region != region) continue; + if (Math.Max(start, p.Start) <= Math.Min(end, p.End)) return true; + } + return false; + } + } + + private void RecordAutoProhibition(byte unit, ModbusRegion region, ushort start, ushort end) + { + lock (_autoProhibitedLock) _autoProhibited.Add((unit, region, start, end)); + } + + /// Test/diagnostic accessor — returns the current auto-prohibited range count. + internal int AutoProhibitedRangeCount + { + get { lock (_autoProhibitedLock) return _autoProhibited.Count; } + } + /// /// #143 block-read coalescing planner. Groups eligible tags by (UnitId, Region), sorts /// by start address, and merges adjacent / near-adjacent (gap ≤ MaxReadGap) into single @@ -438,7 +475,10 @@ public sealed class ModbusDriver var gap = tagStart - last.End - 1; var newEnd = Math.Max(tagEnd, last.End); var newSpan = newEnd - last.Start + 1; - if (gap <= _options.MaxReadGap && newSpan <= cap) + // #148 — skip merges that would re-attempt a known-bad range. The + // per-tag fallback will read each member individually instead. + var crossesProhibition = RangeIsAutoProhibited(group.Key.Unit, group.Key.Region, last.Start, (ushort)newEnd); + if (gap <= _options.MaxReadGap && newSpan <= cap && !crossesProhibition) { last.Members.Add((idx, tag)); blocks[^1] = (last.Start, (ushort)newEnd, last.Members); @@ -477,16 +517,30 @@ public sealed class ModbusDriver } catch (ModbusException mex) { + // #148 — record the failed range so the planner stops re-coalescing across + // it on subsequent scans. Per-tag fallback reads each member individually + // next time, so healthy tags around the protected hole keep working without + // operator intervention. + RecordAutoProhibition(group.Key.Unit, group.Key.Region, block.Start, block.End); + var status = MapModbusExceptionToStatus(mex.ExceptionCode); foreach (var (idx, _) in block.Members) { - results[idx] = new DataValueSnapshot(null, status, null, timestamp); - handled.Add(idx); + // Don't mark members handled — leave them for the per-tag fallback in + // the same scan so single-register reads can succeed for any non- + // protected member. (Pre-#148 behaviour was to mark all Bad and skip.) + // Members that ARE the protected register will fail again at single-tag + // granularity and surface the per-tag exception code naturally. } _health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, mex.Message); } catch (Exception ex) { + // Communication failures (timeout, socket drop) aren't a structural reason + // to prohibit the range — the same coalesced read might succeed once the + // transport recovers. Mark members Bad for this scan but don't auto-prohibit + // and don't deflect to per-tag fallback (which would just hit the same dead + // socket). foreach (var (idx, _) in block.Members) { results[idx] = new DataValueSnapshot(null, StatusBadCommunicationError, null, timestamp); diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests/ModbusCoalescingAutoRecoveryTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests/ModbusCoalescingAutoRecoveryTests.cs new file mode 100644 index 0000000..5e03a8a --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests/ModbusCoalescingAutoRecoveryTests.cs @@ -0,0 +1,136 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests; + +/// +/// #148 — block-coalescing auto-recovery from protected register holes. When a coalesced +/// FC03 fails with a Modbus exception, the planner records the failed range and stops +/// re-coalescing across it on subsequent scans. Healthy tags around the protected hole +/// keep working without operator intervention. +/// +[Trait("Category", "Unit")] +public sealed class ModbusCoalescingAutoRecoveryTests +{ + /// + /// Programmable transport that returns IllegalDataAddress (Modbus exception code 0x02) + /// when a read covers a configured "protected" register address. Otherwise responds + /// normally with zero-filled data of the requested size. + /// + private sealed class ProtectedHoleTransport : IModbusTransport + { + public ushort ProtectedAddress { get; set; } = ushort.MaxValue; + public readonly List<(byte Fc, ushort Address, ushort Quantity)> Reads = new(); + public Task ConnectAsync(CancellationToken ct) => Task.CompletedTask; + public Task SendAsync(byte unitId, byte[] pdu, CancellationToken ct) + { + var addr = (ushort)((pdu[1] << 8) | pdu[2]); + var qty = (ushort)((pdu[3] << 8) | pdu[4]); + if (pdu[0] is 0x03 or 0x04) Reads.Add((pdu[0], addr, qty)); + + // If the protected address falls within the request span, return a Modbus exception + // PDU. The driver's transport layer detects exceptions by the high bit on the FC. + if (pdu[0] is 0x03 or 0x04 && ProtectedAddress >= addr && ProtectedAddress < addr + qty) + return Task.FromException(new ModbusException(pdu[0], 0x02, "IllegalDataAddress")); + + switch (pdu[0]) + { + case 0x03: case 0x04: + { + var resp = new byte[2 + qty * 2]; + resp[0] = pdu[0]; resp[1] = (byte)(qty * 2); + return Task.FromResult(resp); + } + default: return Task.FromResult(new byte[] { pdu[0], 0, 0 }); + } + } + public ValueTask DisposeAsync() => ValueTask.CompletedTask; + } + + [Fact] + public async Task First_Failure_Falls_Back_To_PerTag_Same_Scan() + { + var fake = new ProtectedHoleTransport { ProtectedAddress = 102 }; + // Three tags: 100, 102 (protected), 104. With MaxReadGap=5, the coalesced block is + // 100..104 — covers the protected register, so FC03 quantity=5 fails. Pre-#148 marked + // ALL three Bad. Post-#148, the failure auto-falls back to per-tag in the same scan + // so 100 and 104 still surface Good values. + var t100 = new ModbusTagDefinition("T100", ModbusRegion.HoldingRegisters, 100, ModbusDataType.Int16); + var t102 = new ModbusTagDefinition("T102", ModbusRegion.HoldingRegisters, 102, ModbusDataType.Int16); + var t104 = new ModbusTagDefinition("T104", ModbusRegion.HoldingRegisters, 104, ModbusDataType.Int16); + var opts = new ModbusDriverOptions { Host = "f", Tags = [t100, t102, t104], MaxReadGap = 5, + Probe = new ModbusProbeOptions { Enabled = false } }; + var drv = new ModbusDriver(opts, "m1", _ => fake); + await drv.InitializeAsync("{}", CancellationToken.None); + + var values = await drv.ReadAsync(["T100", "T102", "T104"], CancellationToken.None); + + // T100 + T104 should fall through per-tag and succeed; T102 is the protected register + // and surfaces the exception status code at single-tag granularity. + values[0].StatusCode.ShouldBe(0u, "T100 should succeed via per-tag fallback"); + values[2].StatusCode.ShouldBe(0u, "T104 should succeed via per-tag fallback"); + values[1].StatusCode.ShouldNotBe(0u, "T102 is the protected address — single-tag read still surfaces the exception"); + + await drv.ShutdownAsync(CancellationToken.None); + } + + [Fact] + public async Task Second_Scan_Skips_Coalesced_Read_Of_Prohibited_Range() + { + var fake = new ProtectedHoleTransport { ProtectedAddress = 102 }; + var t100 = new ModbusTagDefinition("T100", ModbusRegion.HoldingRegisters, 100, ModbusDataType.Int16); + var t102 = new ModbusTagDefinition("T102", ModbusRegion.HoldingRegisters, 102, ModbusDataType.Int16); + var t104 = new ModbusTagDefinition("T104", ModbusRegion.HoldingRegisters, 104, ModbusDataType.Int16); + var opts = new ModbusDriverOptions { Host = "f", Tags = [t100, t102, t104], MaxReadGap = 5, + Probe = new ModbusProbeOptions { Enabled = false } }; + var drv = new ModbusDriver(opts, "m1", _ => fake); + await drv.InitializeAsync("{}", CancellationToken.None); + + // Scan 1: planner forms 100..104 block, fails, records the prohibition. + await drv.ReadAsync(["T100", "T102", "T104"], CancellationToken.None); + drv.AutoProhibitedRangeCount.ShouldBe(1); + + var scan1Reads = fake.Reads.Count; + // Scan 2: planner sees the prohibition, doesn't form the 100..104 block, falls back to + // per-tag for everyone. Total scan-2 PDUs: 3 (one per tag) — vs 1 failed coalesced + // read + 3 per-tag fallbacks if we re-tried the merge. + fake.Reads.Clear(); + await drv.ReadAsync(["T100", "T102", "T104"], CancellationToken.None); + + var coalescedAttemptedAgain = fake.Reads.Any(r => r.Address == 100 && r.Quantity > 1); + coalescedAttemptedAgain.ShouldBeFalse("planner must NOT re-attempt the prohibited block"); + + await drv.ShutdownAsync(CancellationToken.None); + } + + [Fact] + public async Task Tags_Outside_Prohibited_Range_Still_Coalesce() + { + var fake = new ProtectedHoleTransport { ProtectedAddress = 102 }; + // Tags split across the protected boundary: cluster 100..104 (will fail) and cluster + // 200..204 (well clear of the protected register). The 200-cluster should keep + // coalescing on subsequent scans even after the 100-cluster is prohibited. + var t100 = new ModbusTagDefinition("T100", ModbusRegion.HoldingRegisters, 100, ModbusDataType.Int16); + var t102 = new ModbusTagDefinition("T102", ModbusRegion.HoldingRegisters, 102, ModbusDataType.Int16); + var t104 = new ModbusTagDefinition("T104", ModbusRegion.HoldingRegisters, 104, ModbusDataType.Int16); + var t200 = new ModbusTagDefinition("T200", ModbusRegion.HoldingRegisters, 200, ModbusDataType.Int16); + var t202 = new ModbusTagDefinition("T202", ModbusRegion.HoldingRegisters, 202, ModbusDataType.Int16); + var opts = new ModbusDriverOptions { Host = "f", Tags = [t100, t102, t104, t200, t202], MaxReadGap = 5, + Probe = new ModbusProbeOptions { Enabled = false } }; + var drv = new ModbusDriver(opts, "m1", _ => fake); + await drv.InitializeAsync("{}", CancellationToken.None); + + await drv.ReadAsync(["T100", "T102", "T104", "T200", "T202"], CancellationToken.None); + + fake.Reads.Clear(); + await drv.ReadAsync(["T100", "T102", "T104", "T200", "T202"], CancellationToken.None); + + // The 200..202 block should still coalesce — its range doesn't overlap the + // 100..104 prohibition. + var coalesced200Block = fake.Reads.Any(r => r.Address == 200 && r.Quantity == 3); + coalesced200Block.ShouldBeTrue("the 200..202 block must keep coalescing — it's outside the prohibited range"); + + await drv.ShutdownAsync(CancellationToken.None); + } +}