diff --git a/docs/Driver.AbLegacy.Cli.md b/docs/Driver.AbLegacy.Cli.md index 33fd68a..0ad3b31 100644 --- a/docs/Driver.AbLegacy.Cli.md +++ b/docs/Driver.AbLegacy.Cli.md @@ -21,6 +21,9 @@ dotnet run --project src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli -- --help | `-P` / `--plc-type` | `Slc500` | Slc500 / MicroLogix / Plc5 / LogixPccc | | `--timeout-ms` | `5000` | Per-operation timeout — see precedence note below | | `--retries` | `0` | Retry count on transient `BadCommunicationError` (PR 9 / #252) | +| `--demote-failure-threshold` | `3` | **PR ablegacy-12 / #255** — consecutive comm failures before the device is auto-demoted | +| `--demote-for-ms` | `30000` | **PR ablegacy-12 / #255** — auto-demote cool-down window in ms | +| `--no-demote` | off | **PR ablegacy-12 / #255** — disable auto-demote entirely (counters still tick) | | `--verbose` | off | Serilog debug output | Family ↔ CIP-path cheat sheet: @@ -84,6 +87,37 @@ otopcua-ablegacy-cli probe -g ab://192.168.1.20/1,0 otopcua-ablegacy-cli probe -g ab://192.168.1.30/ -P MicroLogix -a S:0 ``` +`probe` output (PR ablegacy-12 / #255) reports both `Health` (driver health +state) and `Host state`. The latter is sourced from `IHostConnectivityProbe` +and surfaces `Demoted` when the auto-demote threshold has tripped — a fast +visual signal that the CLI is short-circuiting future reads against this +device until the cool-down expires: + +```text +Gateway: ab://192.168.1.20/1,0 +PLC type: Slc500 +Health: Degraded +Host state: Demoted +Last error: libplctag status -33 reading N7:0 +``` + +### Auto-demote knobs + +```powershell +# Trip after just one comm failure, hold for 60s. +otopcua-ablegacy-cli read -g ab://192.168.1.20/1,0 -a N7:0 -t Int ` + --demote-failure-threshold 1 --demote-for-ms 60000 + +# Opt out of auto-demote — stresses the link without short-circuiting. 
+otopcua-ablegacy-cli read -g ab://192.168.1.20/1,0 -a N7:0 -t Int --no-demote +``` + +The CLI is a one-shot test client — auto-demote primarily matters in the +server-side multi-device deployment, where a single demoted PLC can no +longer block reads against its healthy peers. Use the CLI flags to +reproduce a flapping-link scenario locally before tuning the server-side +`appsettings.json` `Demote` block. + ### `read` ```powershell diff --git a/docs/drivers/AbLegacy-Diagnostics.md b/docs/drivers/AbLegacy-Diagnostics.md index e5307e3..c94e351 100644 --- a/docs/drivers/AbLegacy-Diagnostics.md +++ b/docs/drivers/AbLegacy-Diagnostics.md @@ -7,10 +7,12 @@ directly without going through a separate diagnostics RPC. Mirrors the AB CIP Closes #253 (PR ablegacy-10). -## The seven counters +## The nine counters -Each device managed by the `AbLegacyDriver` exposes seven read-only nodes under -`AbLegacy//_Diagnostics/`: +Each device managed by the `AbLegacyDriver` exposes nine read-only nodes under +`AbLegacy//_Diagnostics/`. The first seven shipped in PR ablegacy-10; +`DemoteCount` + `LastDemotedUtc` arrived with PR ablegacy-12 / #255 (auto-demote +on comm failure). | Name | Type | Semantics | |---|---|---| @@ -21,6 +23,8 @@ Each device managed by the `AbLegacyDriver` exposes seven read-only nodes under | `LastErrorCode` | Int32 | Most recent libplctag status code on a failed read; `0` when no error has been seen since the last reset. | | `LastErrorMessage` | String | Most recent libplctag error message on a failed read; empty when no error has been seen since the last reset. | | `CommFailures` | Int64 | Count of read failures mapped to `BadCommunicationError`. Spans transient libplctag throws + retried-out chains so operators see a single "wire fell off" counter. | +| `DemoteCount` | Int64 | **PR ablegacy-12** — cumulative auto-demote events for this device. Bumps every time the driver crosses the consecutive-failure threshold and arms a fresh cool-down window. 
Cumulative across `ReinitializeAsync` (preserved through redeploys) so a flapping link surfaces as a steadily climbing counter. | +| `LastDemotedUtc` | String | **PR ablegacy-12** — ISO-8601 UTC timestamp of the most recent auto-demotion. Empty string when this device has never been demoted. | **Address shape**: `_Diagnostics//` — e.g. `_Diagnostics/ab://10.0.0.5/1,0/RequestCount`. @@ -34,10 +38,11 @@ user-config tag node, just under a reserved sibling folder. | Trigger | Effect | |---|---| -| `ReinitializeAsync` | Every counter for every device resets to zero, plus `LastErrorMessage` clears to empty. | -| `ShutdownAsync` | Same as Reinitialize — counters drop with the device map. | +| `ReinitializeAsync` | Every counter for every device resets to zero, plus `LastErrorMessage` clears to empty. **PR ablegacy-12 exception:** `DemoteCount` + `LastDemotedUtc` survive the reinit so an operator redeploying mid-incident doesn't lose the flapping-link history. | +| `ShutdownAsync` | All counters drop with the device map (including `DemoteCount`). | | Driver process restart | Counters start at zero. | | Probe transition Stopped→Running | **No automatic reset** — counters are cumulative across reconnect events so operators can spot intermittent links by watching `CommFailures` keep climbing. | +| Probe transition Demoted→Running | **PR ablegacy-12** — early-clear of the active demote window, but the cumulative `DemoteCount` stays put. | There is no in-process "reset" RPC at the time of writing. If you need to clear counters without a redeploy, kick a `ReinitializeAsync` from the Admin @@ -99,14 +104,85 @@ overview dashboard, plus a faster rate (1 s) on `LastErrorMessage` / short-circuit makes every read O(1) — there's no penalty for fast polling of the counter itself, only the OPC UA subscription bookkeeping. 
+## Auto-demote on comm failure (PR ablegacy-12 / #255) + +When a device fails N consecutive reads or probes the driver marks it +**Demoted** for a configurable cool-down window. Reads against a demoted +device short-circuit with `BadCommunicationError` *without invoking +libplctag* — that's the whole point of the feature: one slow PLC sharing +the driver thread can't starve faster peers reading from healthy hosts on +the same `AbLegacyDriver` instance. + +### Configuration + +Per-device, optional. `null` keeps the documented defaults (auto-demote +**enabled** with 3 failures / 30 s). + +```jsonc +{ + "Devices": [ + { + "HostAddress": "ab://10.0.0.5/1,0", + "PlcFamily": "Slc500", + "Demote": { + "FailureThreshold": 3, // default 3 + "DemoteForMs": 30000, // default 30s + "Enabled": true // default true + } + } + ] +} +``` + +| Knob | Default | Notes | +|---|---|---| +| `FailureThreshold` | `3` | Consecutive comm failures before the device is demoted. A successful read or probe resets the tally. Terminal failures (`BadNodeIdUnknown`, `BadTypeMismatch`, …) **do not count** — they're config / decoder mismatches, not field outages. | +| `DemoteForMs` | `30000` (30s) | Cool-down window. Reads while this is active short-circuit; a successful probe clears it early. | +| `Enabled` | `true` | Set to `false` to keep the diagnostic counters but skip the auto-throttle. The failure tally still ticks but never arms the cool-down. | + +### Recovery + +Three ways out of Demoted, in order of likelihood: + +1. **Probe success** — the per-device probe loop (`Probe.Enabled = true`, + default address `S:0`) is the fast path. The next probe iteration after + demotion will exercise the wire; on success it clears + `DemotedUntilUtc` immediately and transitions the host to `Running`. +2. **Window expiry** — once `DemoteForMs` elapses the demote marker + clears on the next read attempt. 
The read goes through; if it fails, + the failure tally keeps counting from where it left off (so a + permanently-down device re-arms the window after one more consecutive + failure rather than having to repeat the full threshold). +3. **`ReinitializeAsync`** — clears `ConsecutiveFailures` + + `DemotedUntilUtc` outright. Cumulative `DemoteCount` survives. + +### Observability + +`DemoteCount` is the headline counter — it bumps once per demotion event, +not per short-circuited read. A device that flaps every hour for a week +shows `DemoteCount = ~168` on Friday afternoon, which is the operator +signal you actually want. + +`LastDemotedUtc` is the ISO-8601 UTC timestamp of the most recent +demotion. Bind it on a per-device tile alongside `DemoteCount` for +"flapping link" alerting. + +### Host-state surface + +A demoted device reports `HostState.Demoted` (new in PR ablegacy-12 +on `Core.Abstractions/IHostConnectivityProbe.cs`). Consumers that +predate the new value (the central `HostStatusPublisher`) safely treat +it as `Stopped` — no schema migration needed. + ## Cross-references - [`AbLegacyDiagnosticTags.cs`](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDiagnosticTags.cs) — counter store + read short-circuit - [`AbLegacyDriver.cs`](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriver.cs) - — increment sites in `ReadAsync`, discovery emission in `DiscoverAsync` + — increment sites in `ReadAsync`, discovery emission in `DiscoverAsync`, + auto-demote bookkeeping in `RecordFailureAndMaybeDemote` + `ProbeLoopAsync` - [`AbLegacy-Test-Fixture.md`](AbLegacy-Test-Fixture.md) — `AbLegacyDiagnosticsTests` - + collision-rejection contract + + `AbLegacyAutoDemoteTests` + collision-rejection contract - [AB CIP `_System/` parallel](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbCip/AbCipSystemTagSource.cs) — same pattern with the CIP-specific six entries (incl. 
writeable `_RefreshTagDb` trigger) diff --git a/docs/drivers/AbLegacy-Test-Fixture.md b/docs/drivers/AbLegacy-Test-Fixture.md index 70c4dee..54d3af8 100644 --- a/docs/drivers/AbLegacy-Test-Fixture.md +++ b/docs/drivers/AbLegacy-Test-Fixture.md @@ -53,12 +53,31 @@ supplies a `FakeAbLegacyTag`. counters: 5 reads (3 ok / 2 fail) → `RequestCount=5`, `ResponseCount=3`, `ErrorCount=2`; `LastErrorCode` reflects the most recent libplctag status; `RetryCount` increments per retry attempt beyond the first; counters reset - on `ReinitializeAsync`; discovery emits exactly 7 diagnostic variables per - device under `_Diagnostics/`; collision rejection at `InitializeAsync` for - user tags shadowing reserved names or `_Diagnostics/` addresses; the - `_Diagnostics//` short-circuit returns the live snapshot through - `ReadAsync` without bumping `RequestCount`; two devices keep counters - independent. + on `ReinitializeAsync`; discovery emits the canonical diagnostic variables + per device under `_Diagnostics/` (now 9 with PR ablegacy-12); collision + rejection at `InitializeAsync` for user tags shadowing reserved names or + `_Diagnostics/` addresses; the `_Diagnostics//` short-circuit + returns the live snapshot through `ReadAsync` without bumping + `RequestCount`; two devices keep counters independent. 
+- `AbLegacyAutoDemoteTests` — **PR ablegacy-12 / #255** auto-demote on comm + failure: 3 consecutive failures arm the demote window and surface + `HostState.Demoted`; subsequent reads short-circuit with + `BadCommunicationError` *without invoking libplctag* (verified via + `factory.Tags["N7:0"].ReadCount` not advancing); successful read resets + the consecutive-failure counter; failure-success-failure pattern doesn't + cross the threshold; `DemoteCount` + `LastDemotedUtc` surface via + `_Diagnostics/`; `Enabled=false` opts out (failures still count, demotion + never fires); `ReinitializeAsync` clears the active window but preserves + cumulative `DemoteCount`; cool-down expiry allows the next read through; + two devices in one driver — one faulty, one healthy — proves the faulty + side's demotion doesn't starve the healthy side; `BadNodeIdUnknown` + (terminal) does not count toward the comm-failure tally; DTO JSON + round-trip preserves `FailureThreshold` / `DemoteForMs` / `Enabled` at + the per-device level; `HostState.Demoted` enum value is wired through + `Core.Abstractions`. Companion integration test in + `tests/.../IntegrationTests/AbLegacyAutoDemoteTests.cs` runs the + two-device-one-unreachable scenario against a live ab_server fixture + using `127.0.0.1:1` as the unreachable peer. - `RsLogixSymbolImportTests` — ablegacy-11 / #254 RSLogix CSV symbol-import parser: canonical 8-row CSV (one row per N/F/B/L/ST/T/C/R) → 8 typed `AbLegacyTagDefinition`s with the right `DataType`; header + comment-line diff --git a/scripts/e2e/test-ablegacy.ps1 b/scripts/e2e/test-ablegacy.ps1 index 4593764..0537784 100644 --- a/scripts/e2e/test-ablegacy.ps1 +++ b/scripts/e2e/test-ablegacy.ps1 @@ -39,6 +39,21 @@ client may have bumped it by more, so the comparison is `>=`). NodeId form: ns=;s=AbLegacy//_Diagnostics/RequestCount. Mirrors the -SystemConnectionStatusNodeId knob on test-abcip.ps1. 
+ +.PARAMETER DiagnosticsDemoteCountNodeId + Optional NodeId for the synthetic _Diagnostics//DemoteCount variable + emitted by AB Legacy discovery (PR ablegacy-12 / #255). When supplied, the + script runs the auto-demote assertion: kills the simulator container so + reads start failing, hammers the user-tag BridgeNodeId at least + FailureThreshold times to trip the demotion, then reads the diagnostic + counter and asserts the value increased by >= 1. NodeId form: + ns=;s=AbLegacy//_Diagnostics/DemoteCount. The simulator + must support `docker stop otopcua-ab-server-slc500` for the kill stage. + +.PARAMETER FailureThresholdForDemote + Failure threshold the server is configured with (default 3). The + demote assertion reads BridgeNodeId N+2 times against the killed simulator + to guarantee the threshold trips even if some reads beat the kill. #> param( @@ -47,7 +62,9 @@ param( [string]$Address = "N7:5", [string]$OpcUaUrl = "opc.tcp://localhost:4840", [Parameter(Mandatory)] [string]$BridgeNodeId, - [string]$DiagnosticsRequestCountNodeId + [string]$DiagnosticsRequestCountNodeId, + [string]$DiagnosticsDemoteCountNodeId, + [int]$FailureThresholdForDemote = 3 ) $ErrorActionPreference = "Stop" @@ -245,5 +262,67 @@ finally { Remove-Item -Path $importJsonPath -ErrorAction SilentlyContinue } +# PR ablegacy-12 / #255 — auto-demote round-trip. Kill the simulator container, +# hammer the bridge NodeId past the failure threshold, then assert the +# DemoteCount diagnostic incremented. Restart the simulator at the end so the +# next run gets a clean baseline. Gated on -DiagnosticsDemoteCountNodeId so +# environments without docker-side control of the simulator can opt out. 
+if ($DiagnosticsDemoteCountNodeId) { + Write-Header "AutoDemote (kill simulator + observe DemoteCount from $DiagnosticsDemoteCountNodeId)" + $baselineDemoteOut = & $opcUaCli.File @($opcUaCli.PrefixArgs) ` + @("read", "-u", $OpcUaUrl, "-n", $DiagnosticsDemoteCountNodeId) 2>&1 + $baselineDemote = 0 + if (($baselineDemoteOut -join "`n") -match '(\d+)') { $baselineDemote = [int64]$Matches[1] } + + # Best-effort container kill — prefer the slc500 profile name; fall back to + # micrologix / plc5 in case the operator pointed the e2e at a different family. + $simContainers = @("otopcua-ab-server-slc500", "otopcua-ab-server-micrologix", "otopcua-ab-server-plc5") + $killed = $false + foreach ($c in $simContainers) { + $stop = docker stop $c 2>$null + if ($LASTEXITCODE -eq 0 -and $stop) { + Write-Host "Stopped $c" + $killed = $true + break + } + } + if (-not $killed) { + Write-Fail "AutoDemote: no ab_server container found via 'docker stop' — skipping demote assertion" + $results += @{ Passed = $false; Reason = "no simulator container to kill" } + } + else { + # Hammer past the threshold. Each read against a now-unreachable simulator + # surfaces BadCommunicationError; FailureThreshold consecutive ones trip + # the demotion. We add 2 extra to absorb timing slack (one read may be + # in-flight when the kill lands). 
+ $hammerCount = $FailureThresholdForDemote + 2 + for ($i = 0; $i -lt $hammerCount; $i++) { + & $opcUaCli.File @($opcUaCli.PrefixArgs) ` + @("read", "-u", $OpcUaUrl, "-n", $BridgeNodeId) 2>&1 | Out-Null + } + + Start-Sleep -Seconds 1 + + $afterDemoteOut = & $opcUaCli.File @($opcUaCli.PrefixArgs) ` + @("read", "-u", $OpcUaUrl, "-n", $DiagnosticsDemoteCountNodeId) 2>&1 + $afterDemote = 0 + if (($afterDemoteOut -join "`n") -match '(\d+)') { $afterDemote = [int64]$Matches[1] } + + $deltaDemote = $afterDemote - $baselineDemote + if ($deltaDemote -ge 1) { + Write-Pass "AutoDemote DemoteCount delta $deltaDemote >= 1 after $hammerCount failed reads" + $results += @{ Passed = $true } + } else { + Write-Fail "AutoDemote DemoteCount delta $deltaDemote < 1 (baseline=$baselineDemote after=$afterDemote)" + $results += @{ Passed = $false; Reason = "demote delta $deltaDemote" } + } + + # Restart the simulator so subsequent test runs have a clean baseline. + # Best-effort — if docker-compose isn't on the path the operator can + # bring it back manually via the Docker/docker-compose.yml profile. + try { docker start (docker ps -aq -f "name=otopcua-ab-server-") | Out-Null } catch { } + } +} + Write-Summary -Title "AB Legacy e2e" -Results $results if ($results | Where-Object { -not $_.Passed }) { exit 1 } diff --git a/scripts/smoke/seed-ablegacy-smoke.sql b/scripts/smoke/seed-ablegacy-smoke.sql index 4915c11..8a60586 100644 --- a/scripts/smoke/seed-ablegacy-smoke.sql +++ b/scripts/smoke/seed-ablegacy-smoke.sql @@ -96,7 +96,12 @@ VALUES (@Gen, @DrvId, @ClusterId, @NsId, 'ablegacy-smoke', 'AbLegacy', N'{ "PlcFamily": "Slc500", "DeviceName": "slc-500", "TimeoutMs": 500, - "Retries": 1 + "Retries": 1, + "Demote": { + "FailureThreshold": 3, + "DemoteForMs": 30000, + "Enabled": true + } } ], "Probe": { "Enabled": true, "IntervalMs": 5000, "TimeoutMs": 2000, "ProbeAddress": "S:0" }, @@ -155,7 +160,15 @@ PRINT ' e.g. 
"ab://:44818/" and re-run this seed.'; PRINT ''; PRINT 'PR ablegacy-10 / #253 — diagnostic counters auto-emit per device under'; PRINT ' AbLegacy//_Diagnostics/. No dbo.Tag rows needed — the'; -PRINT ' driver registers them at DiscoverAsync time. Seven counters per device:'; +PRINT ' driver registers them at DiscoverAsync time. Nine counters per device:'; PRINT ' RequestCount, ResponseCount, ErrorCount, RetryCount, LastErrorCode,'; -PRINT ' LastErrorMessage, CommFailures. See docs/drivers/AbLegacy-Diagnostics.md'; -PRINT ' for the full surface + reset semantics.'; +PRINT ' LastErrorMessage, CommFailures, DemoteCount, LastDemotedUtc. See'; +PRINT ' docs/drivers/AbLegacy-Diagnostics.md for the full surface + reset'; +PRINT ' semantics.'; +PRINT ''; +PRINT 'PR ablegacy-12 / #255 — auto-demote on comm failure: 3 consecutive'; +PRINT ' failed reads / probes mark the device Demoted for DemoteFor=PT30S'; +PRINT ' (30 s); reads against a demoted device short-circuit with'; +PRINT ' BadCommunicationError so one slow PLC can''t starve the driver.'; +PRINT ' Tune via the Demote block on each Devices[] row. DemoteCount +'; +PRINT ' LastDemotedUtc on the _Diagnostics folder surface flapping links.'; diff --git a/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IHostConnectivityProbe.cs b/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IHostConnectivityProbe.cs index 3a446b1..b3c9d6e 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IHostConnectivityProbe.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IHostConnectivityProbe.cs @@ -38,4 +38,16 @@ public sealed record HostStatusChangedEventArgs( HostState NewState); /// Host lifecycle state. Generalization of Galaxy's Platform/Engine ScanState. -public enum HostState { Unknown, Running, Stopped, Faulted } +/// +/// +/// (PR ablegacy-12 / #255) is a soft-stopped state used by drivers +/// that auto-throttle a host after N consecutive comm failures. 
Reads are short-circuited +/// with BadCommunicationError for a configurable cool-down window so one slow PLC +/// doesn't starve faster peers sharing the same driver. Demoted is *not* the same as +/// <see cref="Stopped"/> (which means "probe says it's down") nor +/// <see cref="Faulted"/> (which means "the driver itself is broken"); it's a deliberate driver-side back-off. +/// Consumers that don't recognize Demoted can safely treat it as Stopped +/// (see HostStatusPublisher.MapState). +/// +/// +public enum HostState { Unknown, Running, Stopped, Faulted, Demoted } diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/AbLegacyCommandBase.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/AbLegacyCommandBase.cs index 8e53728..b02a346 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/AbLegacyCommandBase.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/AbLegacyCommandBase.cs @@ -25,6 +25,34 @@ public abstract class AbLegacyCommandBase : DriverCommandBase [CommandOption("timeout-ms", Description = "Per-operation timeout in ms (default 5000).")] public int TimeoutMs { get; init; } = 5000; + /// + /// PR ablegacy-12 / #255 — consecutive comm failures before this device is + /// auto-demoted. Reads against a demoted device short-circuit with + /// BadCommunicationError for <see cref="DemoteForMs"/> ms so one + /// unreachable PLC can't starve faster peers sharing the driver thread. + /// + [CommandOption("demote-failure-threshold", Description = + "Consecutive comm failures before the device is auto-demoted (PR ablegacy-12). Default 3.")] + public int DemoteFailureThreshold { get; init; } = 3; + + /// + /// PR ablegacy-12 / #255 — auto-demote cool-down window in ms. Reads while + /// this window is active short-circuit with BadCommunicationError; + /// a successful probe clears it early. + /// + [CommandOption("demote-for-ms", Description = + "Auto-demote cool-down window in ms (PR ablegacy-12). 
Default 30000 (30s).")] + public int DemoteForMs { get; init; } = 30_000; + + /// + /// PR ablegacy-12 / #255 — opt out of the auto-demote behaviour. The + /// consecutive-failure tally still ticks, but the cool-down is never armed: + /// DemoteCount/LastDemotedUtc stay zero and reads never short-circuit. + /// + [CommandOption("no-demote", Description = + "Disable auto-demote on consecutive comm failures (PR ablegacy-12). Default off (auto-demote enabled).")] + public bool NoDemote { get; init; } + /// public override TimeSpan Timeout { @@ -41,7 +69,11 @@ public abstract class AbLegacyCommandBase : DriverCommandBase Devices = [new AbLegacyDeviceOptions( HostAddress: Gateway, PlcFamily: PlcType, - DeviceName: $"cli-{PlcType}")], + DeviceName: $"cli-{PlcType}", + Demote: new AbLegacyDemoteOptions( + FailureThreshold: DemoteFailureThreshold, + DemoteFor: TimeSpan.FromMilliseconds(DemoteForMs), + Enabled: !NoDemote))], Tags = tags, Timeout = Timeout, Probe = new AbLegacyProbeOptions { Enabled = false }, diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/Commands/ProbeCommand.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/Commands/ProbeCommand.cs index 4c1d76c..0e9c925 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/Commands/ProbeCommand.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli/Commands/ProbeCommand.cs @@ -40,10 +40,19 @@ public sealed class ProbeCommand : AbLegacyCommandBase await driver.InitializeAsync("{}", ct); var snapshot = await driver.ReadAsync(["__probe"], ct); var health = driver.GetHealth(); + // PR ablegacy-12 / #255 — surface Demoted alongside the probe-driven + // HostState. After a one-shot probe the host hasn't been observed + // (no probe loop runs in CLI mode), so HostState is typically Unknown + // unless the read above tripped the demote threshold. 
+ var hostStatus = driver.GetHostStatuses().FirstOrDefault(); await console.Output.WriteLineAsync($"Gateway: {Gateway}"); await console.Output.WriteLineAsync($"PLC type: {PlcType}"); await console.Output.WriteLineAsync($"Health: {health.State}"); + if (hostStatus is not null) + { + await console.Output.WriteLineAsync($"Host state: {hostStatus.State}"); + } if (health.LastError is { } err) await console.Output.WriteLineAsync($"Last error: {err}"); await console.Output.WriteLineAsync(); diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDiagnosticTags.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDiagnosticTags.cs index 4df009f..d6d2aa4 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDiagnosticTags.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDiagnosticTags.cs @@ -40,6 +40,11 @@ public sealed class AbLegacyDiagnosticTags public const string DiagnosticsFolderPrefix = "_Diagnostics/"; /// Canonical names the diagnostics folder exposes. Keep in lockstep with discovery. + /// + /// PR ablegacy-12 / #255 — DemoteCount + LastDemotedUtc ride + /// alongside the original seven so HMIs can spot a flapping device by + /// watching DemoteCount climb without scraping logs. + /// public static readonly IReadOnlyList DiagnosticTagNames = [ "RequestCount", @@ -49,6 +54,9 @@ public sealed class AbLegacyDiagnosticTags "LastErrorCode", "LastErrorMessage", "CommFailures", + // PR ablegacy-12 / #255 — auto-demote on comm failure surface. + "DemoteCount", + "LastDemotedUtc", ]; private static readonly HashSet DiagnosticTagNameSet = @@ -130,6 +138,39 @@ public sealed class AbLegacyDiagnosticTags Interlocked.Increment(ref c.Retry); } + /// + /// PR ablegacy-12 / #255 — record an auto-demotion event: bumps cumulative + /// DemoteCount and stamps LastDemotedUtc. 
Fires every time the + /// driver crosses the failure threshold and arms a fresh cool-down window — + /// a single flapping link that demotes hourly will surface as a steadily + /// climbing counter, which is the operator-facing signal we want. + /// + public void RecordDemote(string deviceHostAddress, DateTime nowUtc) + { + ArgumentNullException.ThrowIfNull(deviceHostAddress); + var c = GetOrCreate(deviceHostAddress); + Interlocked.Increment(ref c.DemoteCount); + // DateTime is 64 bits — use Interlocked.Exchange on the Ticks field so a + // concurrent reader sees a torn-free snapshot. On x86 a 64-bit non-aligned + // write isn't atomic; on x64 it is, but routing through Interlocked is + // platform-independent + costs almost nothing. + Interlocked.Exchange(ref c.LastDemotedUtcTicks, nowUtc.Ticks); + } + + /// + /// PR ablegacy-12 / #255 — restore cumulative demote bookkeeping after a + /// cycle so an operator + /// redeploying config mid-incident doesn't lose flapping-link history. + /// Sets the counters to absolute values rather than incrementing. + /// + public void RestoreDemote(string deviceHostAddress, long demoteCount, DateTime? lastDemotedUtc) + { + ArgumentNullException.ThrowIfNull(deviceHostAddress); + var c = GetOrCreate(deviceHostAddress); + Interlocked.Exchange(ref c.DemoteCount, demoteCount); + Interlocked.Exchange(ref c.LastDemotedUtcTicks, lastDemotedUtc?.Ticks ?? 0); + } + /// Snapshot the current counters for a device. Returns zeros for unknown hosts. 
public DiagnosticsSnapshot Snapshot(string deviceHostAddress) { @@ -139,7 +180,8 @@ public sealed class AbLegacyDiagnosticTags { _counters.TryGetValue(deviceHostAddress, out c); } - if (c is null) return new DiagnosticsSnapshot(0, 0, 0, 0, 0, string.Empty, 0); + if (c is null) return new DiagnosticsSnapshot(0, 0, 0, 0, 0, string.Empty, 0, 0, null); + var ticks = Interlocked.Read(ref c.LastDemotedUtcTicks); return new DiagnosticsSnapshot( Request: Interlocked.Read(ref c.Request), Response: Interlocked.Read(ref c.Response), @@ -147,7 +189,10 @@ public sealed class AbLegacyDiagnosticTags Retry: Interlocked.Read(ref c.Retry), LastErrorCode: Volatile.Read(ref c.LastErrorCode), LastErrorMessage: c.LastErrorMessage ?? string.Empty, - CommFailures: Interlocked.Read(ref c.CommFailures)); + CommFailures: Interlocked.Read(ref c.CommFailures), + // PR ablegacy-12 / #255 — auto-demote surface. + DemoteCount: Interlocked.Read(ref c.DemoteCount), + LastDemotedUtc: ticks == 0 ? null : new DateTime(ticks, DateTimeKind.Utc)); } /// @@ -155,7 +200,14 @@ public sealed class AbLegacyDiagnosticTags /// from so a config redeploy starts /// with a clean diagnostic surface. /// - public void Reset(string deviceHostAddress) + /// + /// PR ablegacy-12 / #255 — when is true the + /// cumulative DemoteCount + LastDemotedUtc survive the reset. + /// uses that mode so an operator + /// redeploying a config doesn't lose their flapping-link history; a fresh process + /// start clears them naturally because the dictionary is rebuilt from scratch. 
+ /// + public void Reset(string deviceHostAddress, bool preserveDemote = false) { ArgumentNullException.ThrowIfNull(deviceHostAddress); var c = GetOrCreate(deviceHostAddress); @@ -166,14 +218,40 @@ public sealed class AbLegacyDiagnosticTags Interlocked.Exchange(ref c.LastErrorCode, 0); c.LastErrorMessage = string.Empty; Interlocked.Exchange(ref c.CommFailures, 0); + if (!preserveDemote) + { + Interlocked.Exchange(ref c.DemoteCount, 0); + Interlocked.Exchange(ref c.LastDemotedUtcTicks, 0); + } } /// Reset every tracked device. Called on full ShutdownAsync. - public void ResetAll() + /// + /// PR ablegacy-12 / #255 — when is true the + /// cumulative demote counters survive a per-device reset of every other field. + /// The default (false) clears the dictionary outright, which is what + /// wants. + /// + public void ResetAll(bool preserveDemote = false) { + if (!preserveDemote) + { + lock (_lock) + { + _counters.Clear(); + } + return; + } + + // Preserve mode: keep the dictionary keys + cumulative demote fields, but + // zero everything else. Used by Reinitialize to span a config redeploy + // without losing flapping-link history. lock (_lock) { - _counters.Clear(); + foreach (var key in _counters.Keys.ToList()) + { + Reset(key, preserveDemote: true); + } } } @@ -205,6 +283,11 @@ public sealed class AbLegacyDiagnosticTags "LastErrorCode" => snapshot.LastErrorCode, "LastErrorMessage" => snapshot.LastErrorMessage, "CommFailures" => snapshot.CommFailures, + // PR ablegacy-12 / #255 — auto-demote surface. LastDemotedUtc returns + // the empty string when no demotion has happened yet, mirroring the + // LastErrorMessage convention so HMIs can bind directly to a string. + "DemoteCount" => snapshot.DemoteCount, + "LastDemotedUtc" => snapshot.LastDemotedUtc?.ToString("o") ?? string.Empty, _ => null, }; return true; @@ -236,6 +319,11 @@ public sealed class AbLegacyDiagnosticTags public int LastErrorCode; public string? 
LastErrorMessage = string.Empty; public long CommFailures; + // PR ablegacy-12 / #255 — cumulative across config redeploys. ShutdownAsync's + // ResetAll() and a process restart clear them; ReinitializeAsync snapshots + // them before teardown and writes them back via RestoreDemote. + public long DemoteCount; + public long LastDemotedUtcTicks; } } @@ -251,6 +339,9 @@ public sealed class AbLegacyDiagnosticTags /// Most recent libplctag status code on a failed read. /// Most recent libplctag error message on a failed read. /// Count of read failures mapped to BadCommunicationError. +/// PR ablegacy-12 / #255 — cumulative auto-demote events. +/// PR ablegacy-12 / #255 — UTC timestamp of the most +/// recent demotion, or null if the device has never been demoted. public sealed record DiagnosticsSnapshot( long Request, long Response, @@ -258,4 +349,6 @@ public sealed record DiagnosticsSnapshot( long Retry, int LastErrorCode, string LastErrorMessage, - long CommFailures); + long CommFailures, + long DemoteCount, + DateTime? LastDemotedUtc); diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriver.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriver.cs index a7c2ceb..6496c9e 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriver.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriver.cs @@ -217,6 +217,20 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover public async Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken) { + // PR ablegacy-12 / #255 — capture the cumulative DemoteCount + LastDemotedUtc + // for every currently-tracked device before we tear down. The Shutdown below + // calls ResetAll() which clears the dictionary; the per-host InitializeAsync + // below re-EnsureDevice's the slots; we restore the cumulative demote + // history so an operator who redeploys mid-incident doesn't lose the trail + // of how often this device was flapping. 
+ var preservedDemote = new Dictionary( + StringComparer.OrdinalIgnoreCase); + foreach (var (host, _) in _devices) + { + var snap = _diagnosticTags.Snapshot(host); + preservedDemote[host] = (snap.DemoteCount, snap.LastDemotedUtc); + } + await ShutdownAsync(cancellationToken).ConfigureAwait(false); // PR ablegacy-10 / #253 — counters were dropped along with the device map when // ShutdownAsync called ResetAll; the InitializeAsync below re-EnsureDevice's each @@ -224,6 +238,16 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover // here in case a downstream override of either method skips the cycle. _diagnosticTags.ResetAll(); await InitializeAsync(driverConfigJson, cancellationToken).ConfigureAwait(false); + + // PR ablegacy-12 / #255 — restore the cumulative demote history. Only hosts + // that survive the redeploy get their counters back; a device removed from + // config legitimately drops its history (it isn't being tracked any more). + foreach (var (host, (count, lastUtc)) in preservedDemote) + { + if (count == 0 && lastUtc is null) continue; + if (!_devices.ContainsKey(host)) continue; + _diagnosticTags.RestoreDemote(host, count, lastUtc); + } } public async Task ShutdownAsync(CancellationToken cancellationToken) @@ -275,6 +299,38 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover internal int ResolveRetries(DeviceState device) => device.Options.Retries ?? _options.Retries ?? 0; + /// + /// PR ablegacy-12 / #255 — resolve the active for + /// a device. Per-device options win; otherwise the documented defaults (3 failures / + /// 30 s / enabled). Returns a non-null record so callers can assume a usable value. + /// + internal AbLegacyDemoteOptions ResolveDemote(DeviceState device) => + device.Options.Demote ?? 
new AbLegacyDemoteOptions(); + + /// + /// PR ablegacy-12 / #255 — common bookkeeping for one comm failure: bump the + /// consecutive-failure counter and arm the demote window once the threshold is + /// crossed. Returns true when this call tipped the device into Demoted (so + /// the caller can fire ); false when the + /// device was already demoted or stayed below the threshold. + /// + private bool RecordFailureAndMaybeDemote(DeviceState state, DateTime nowUtc) + { + var demote = ResolveDemote(state); + var consecutive = Interlocked.Increment(ref state.ConsecutiveFailures); + + if (!demote.Enabled || consecutive < demote.FailureThreshold) return false; + // Already demoted? Don't re-arm — the original window's expiry is the + // operator-facing recovery clock and re-arming on every subsequent failed + // read would suppress reads forever on a fully-down device. The probe + // loop is what eventually clears the demotion (or the window expiring). + if (state.DemotedUntilUtc is { } until && until > nowUtc) return false; + + state.DemotedUntilUtc = nowUtc + demote.EffectiveDemoteFor; + _diagnosticTags.RecordDemote(state.Options.HostAddress, nowUtc); + return true; + } + // ---- IReadable ---- public async Task> ReadAsync( @@ -323,6 +379,45 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover // double-counting the original attempt as a retry. _diagnosticTags.RecordRequest(def.DeviceHostAddress); + // PR ablegacy-12 / #255 — auto-demote short-circuit. When the device's demote + // window is still active we return BadCommunicationError immediately, without + // touching libplctag or its retry loop. That's the whole point of the feature: + // one slow PLC sharing the driver thread can't drag down healthy peers. We + // don't bump ErrorCount/CommFailures here because this isn't a fresh field + // failure — it's the cool-down on a previously-counted one. 
+ if (device.DemotedUntilUtc is { } demotedUntil) + { + if (demotedUntil > now) + { + results[i] = new DataValueSnapshot(null, + AbLegacyStatusMapper.BadCommunicationError, null, now); + continue; + } + // Window expired without an early-clear from a probe success — drop the + // marker but don't reset ConsecutiveFailures yet. If this read also + // fails the failure tally keeps counting from where it left off, so a + // permanently-down device re-arms the window after one more + // consecutive failure (vs. having to repeat the full threshold). + lock (device.ProbeLock) + { + if (device.DemotedUntilUtc is { } stillUntil && stillUntil <= now) + { + device.DemotedUntilUtc = null; + // Mirror Stopped→Running on a probe-driven recovery: leave the + // HostState transition to the probe loop (or the upcoming success + // below); we just clear the cool-down marker so the next read + // dispatches normally. + if (device.HostState == HostState.Demoted) + { + // Surface a transition out of Demoted. The probe loop will + // bring it Running once a probe succeeds; until then leave + // it in Stopped to reflect "we don't actually know it's up". + TransitionDeviceState(device, HostState.Stopped); + } + } + } + } + // PR 9 — per-device retry loop: on transient BadCommunicationError (libplctag throw // OR a non-zero status that maps to BadCommunicationError) retry up to N times. A // terminal mapped status (e.g. BadNodeIdUnknown for a missing PLC tag, BadTypeMismatch @@ -360,6 +455,15 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover status, $"libplctag status {status} reading {reference}", commFailure: mappedStatus == AbLegacyStatusMapper.BadCommunicationError); + // PR ablegacy-12 / #255 — only comm failures count toward the + // demote tally. A BadNodeIdUnknown / BadTypeMismatch is a config + // / decoder mismatch, not a sign the host is unreachable, so + // demoting on it would punish the operator for a typo. 
+ if (mappedStatus == AbLegacyStatusMapper.BadCommunicationError + && RecordFailureAndMaybeDemote(device, now)) + { + TransitionDeviceState(device, HostState.Demoted); + } snapshot = new DataValueSnapshot(null, mappedStatus, null, now); _health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, $"libplctag status {status} reading {reference}"); @@ -385,6 +489,13 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover _health = new DriverHealth(DriverState.Healthy, now, null); // PR ablegacy-10 / #253 — successful array read. _diagnosticTags.RecordResponse(def.DeviceHostAddress); + // PR ablegacy-12 / #255 — successful read clears the + // consecutive-failure tally. We do NOT auto-clear DemotedUntilUtc + // here — the demote window is honoured to its full duration so an + // intermittent link that just happened to answer once doesn't + // immediately re-flood the channel. Probe success is the early + // recovery path. + Interlocked.Exchange(ref device.ConsecutiveFailures, 0); break; } @@ -398,6 +509,10 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover _health = new DriverHealth(DriverState.Healthy, now, null); // PR ablegacy-10 / #253 — successful scalar / sub-element / bit read. _diagnosticTags.RecordResponse(def.DeviceHostAddress); + // PR ablegacy-12 / #255 — successful read clears the + // consecutive-failure tally; demote window keeps running + // until a probe success or natural expiry. + Interlocked.Exchange(ref device.ConsecutiveFailures, 0); break; } catch (OperationCanceledException) { throw; } @@ -414,6 +529,12 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover libplctagStatus: 0, errorMessage: ex.Message, commFailure: true); + // PR ablegacy-12 / #255 — exception-driven comm failure counts + // toward the demote tally just like a status-mapped one. 
+ if (RecordFailureAndMaybeDemote(device, now)) + { + TransitionDeviceState(device, HostState.Demoted); + } snapshot = new DataValueSnapshot(null, AbLegacyStatusMapper.BadCommunicationError, null, now); _health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, ex.Message); @@ -591,6 +712,14 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover "Most recent libplctag error message on a failed read; empty when no error has been seen since the last reset."); EmitDiagnosticVariable(diag, deviceHostAddress, "CommFailures", DriverDataType.Int64, "Count of read failures mapped to BadCommunicationError. Spans transient libplctag throws + retried-out chains so operators see a single 'wire fell off' counter."); + // PR ablegacy-12 / #255 — auto-demote surface. DemoteCount is cumulative + // across reinit (preserved in ReinitializeAsync); LastDemotedUtc is a + // string (ISO-8601 UTC) so HMIs can bind directly without a separate + // DateTime decoder. Empty string means "never demoted". + EmitDiagnosticVariable(diag, deviceHostAddress, "DemoteCount", DriverDataType.Int64, + "Cumulative auto-demote events for this device — bumps every time the driver crosses the consecutive-failure threshold and arms a fresh cool-down window. Survives ReinitializeAsync."); + EmitDiagnosticVariable(diag, deviceHostAddress, "LastDemotedUtc", DriverDataType.String, + "ISO-8601 UTC timestamp of the most recent auto-demotion; empty when this device has never been demoted."); } private static void EmitDiagnosticVariable( @@ -665,7 +794,39 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover state.ProbeInitialized = false; } - TransitionDeviceState(state, success ? HostState.Running : HostState.Stopped); + // PR ablegacy-12 / #255 — probe success is the early-recovery path: clear + // any active demote window + reset the failure tally so the next read + // dispatches normally. 
Probe failure participates in the same shared + // failure-tally as ReadAsync so a device with no live read traffic still + // demotes on a sustained outage. + if (success) + { + bool wasDemoted; + lock (state.ProbeLock) + { + wasDemoted = state.DemotedUntilUtc is not null; + state.DemotedUntilUtc = null; + } + Interlocked.Exchange(ref state.ConsecutiveFailures, 0); + TransitionDeviceState(state, HostState.Running); + _ = wasDemoted; // intentionally observed for future telemetry hooks + } + else + { + if (RecordFailureAndMaybeDemote(state, DateTime.UtcNow)) + { + TransitionDeviceState(state, HostState.Demoted); + } + else + { + // Mid-tally probe failure: surface as Stopped if not already + // Demoted. This preserves pre-PR-12 behaviour for the common + // case (FailureThreshold=3 + a single hiccup ends up Stopped, + // not Demoted). + if (state.HostState != HostState.Demoted) + TransitionDeviceState(state, HostState.Stopped); + } + } try { await Task.Delay(_options.Probe.Interval, ct).ConfigureAwait(false); } catch (OperationCanceledException) { break; } @@ -890,6 +1051,25 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover public CancellationTokenSource? ProbeCts { get; set; } public bool ProbeInitialized { get; set; } + /// + /// PR ablegacy-12 / #255 — running tally of consecutive read / probe failures. + /// Reset on every successful read or probe; tripping AbLegacyDemoteOptions.FailureThreshold + /// arms the demote window. + /// Read + written via Interlocked because read + probe loops can + /// touch it concurrently. + /// + public int ConsecutiveFailures; + + /// + /// PR ablegacy-12 / #255 — when set, reads against this device short-circuit + /// with BadCommunicationError until the timestamp passes; cleared early + /// by a successful probe. The probe-success and window-expiry clears take + /// ProbeLock; RecordFailureAndMaybeDemote assigns it without that lock. Reads grab + /// the property without locking — a torn DateTimeOffset?
read is harmless here + /// because the worst case is one extra dispatched read on an x86 boundary. + /// + public DateTimeOffset? DemotedUntilUtc { get; set; } + public void DisposeRuntimes() { foreach (var r in Runtimes.Values) r.Dispose(); diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverFactoryExtensions.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverFactoryExtensions.cs index 1e465ca..4e161b3 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverFactoryExtensions.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverFactoryExtensions.cs @@ -45,7 +45,14 @@ public static class AbLegacyDriverFactoryExtensions DeviceName: d.DeviceName, // PR 9 — per-device timeout / retry overrides. Device-level wins over driver-wide. Timeout: d.TimeoutMs is int devMs ? TimeSpan.FromMilliseconds(devMs) : null, - Retries: d.Retries))] + Retries: d.Retries, + // PR ablegacy-12 / #255 — auto-demote knobs. + Demote: d.Demote is null ? null : new AbLegacyDemoteOptions( + FailureThreshold: d.Demote.FailureThreshold ?? 3, + DemoteFor: d.Demote.DemoteForMs is int demMs + ? TimeSpan.FromMilliseconds(demMs) + : null, + Enabled: d.Demote.Enabled ?? true)))] : [], Tags = dto.Tags is { Count: > 0 } ? [.. dto.Tags.Select(t => new AbLegacyTagDefinition( @@ -209,6 +216,26 @@ public static class AbLegacyDriverFactoryExtensions /// null at both levels = single attempt. /// public int? Retries { get; init; } + + /// + /// PR ablegacy-12 / #255 — optional per-device auto-demote knobs. null + /// means "use the documented defaults" (FailureThreshold=3, + /// DemoteFor=30s, Enabled=true) — the driver still demotes by + /// default. Set Enabled=false in the JSON to opt out entirely. + /// + public AbLegacyDemoteDto? Demote { get; init; } + } + + /// + /// PR ablegacy-12 / #255 — JSON DTO for the auto-demote knobs. Times are + /// ms-suffixed for consistency with the rest of the driver config (TimeoutMs, + /// IntervalMs). 
+ /// + internal sealed class AbLegacyDemoteDto + { + public int? FailureThreshold { get; init; } + public int? DemoteForMs { get; init; } + public bool? Enabled { get; init; } } internal sealed class AbLegacyTagDto diff --git a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverOptions.cs b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverOptions.cs index 5aef5ba..76fa89a 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverOptions.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriverOptions.cs @@ -41,7 +41,39 @@ public sealed record AbLegacyDeviceOptions( AbLegacyPlcFamily PlcFamily = AbLegacyPlcFamily.Slc500, string? DeviceName = null, TimeSpan? Timeout = null, - int? Retries = null); + int? Retries = null, + AbLegacyDemoteOptions? Demote = null); + +/// +/// PR ablegacy-12 / #255 — auto-demote knobs. After +/// consecutive read / probe failures the driver +/// marks the device Demoted for ; reads against +/// a demoted device short-circuit with BadCommunicationError instead +/// of dispatching through libplctag, so one slow PLC can't starve faster +/// peers sharing the same driver. A successful probe clears the demotion +/// early; a successful read just resets the consecutive-failure counter +/// without leaving the demoted window. +/// +/// Consecutive read or probe failures that trip +/// the demotion. Default 3. +/// Cool-down window before reads are dispatched again +/// without a successful probe in between. Default 30s. +/// When false the failure tally still ticks but the +/// driver never sets the demoted window — useful when an operator wants the +/// diagnostic counters without the throttling behaviour. +public sealed record AbLegacyDemoteOptions( + int FailureThreshold = 3, + TimeSpan? DemoteFor = null, + bool Enabled = true) +{ + /// + /// Effective demote window. 
Records can't have TimeSpan defaults + /// because TimeSpan.FromSeconds(30) isn't a compile-time constant; + /// callers that pass null get the documented 30-second default + /// here. + /// + public TimeSpan EffectiveDemoteFor => DemoteFor ?? TimeSpan.FromSeconds(30); +} /// /// One PCCC-backed OPC UA variable. Address is the canonical PCCC file-address diff --git a/src/ZB.MOM.WW.OtOpcUa.Server/HostStatusPublisher.cs b/src/ZB.MOM.WW.OtOpcUa.Server/HostStatusPublisher.cs index 69153ce..c5f40d0 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Server/HostStatusPublisher.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Server/HostStatusPublisher.cs @@ -138,6 +138,11 @@ public sealed class HostStatusPublisher( HostState.Running => DriverHostState.Running, HostState.Stopped => DriverHostState.Stopped, HostState.Faulted => DriverHostState.Faulted, + // PR ablegacy-12 / #255 — Demoted is a driver-side back-off (skipped reads while + // we wait for a flaky host to recover). The Configuration enum doesn't have a + // dedicated value; surface it as Stopped so the Admin UI lights it up red-ish + // without the publisher needing a schema migration to differentiate. + HostState.Demoted => DriverHostState.Stopped, _ => DriverHostState.Unknown, }; } diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests/AbLegacyAutoDemoteTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests/AbLegacyAutoDemoteTests.cs new file mode 100644 index 0000000..01fa3e9 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests/AbLegacyAutoDemoteTests.cs @@ -0,0 +1,102 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; +using ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.PlcFamilies; + +namespace ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests; + +/// +/// PR ablegacy-12 / #255 — wire-level smoke for auto-demote on comm failure. +/// Runs only when ab_server is reachable. 
Two devices: one healthy (the live +/// ab_server slc500 simulator), one pointed at 127.0.0.1:1 which +/// refuses every connection. After three consecutive failures the faulty +/// device's reads must short-circuit with BadCommunicationError +/// while the healthy device keeps returning Good — the whole point +/// of the feature: one slow / unreachable PLC sharing the driver thread +/// can't starve faster peers. +/// +/// +/// +/// Build-only by default — the assertion that demotion latency is +/// bounded depends on the ab_server simulator timing out on the faulty +/// port within the per-device timeout. We pin the faulty endpoint at +/// 127.0.0.1:1 (the bogus-port standard) which RST's the +/// connection immediately on most stacks; environments that whitelist +/// outbound to localhost:1 will see different timing but still trip +/// the threshold within the test budget. +/// +/// +/// The Docker fixture extension (slc500-faulty) noted in the PR +/// plan is a documentation-only placeholder for now — implementing a +/// refusing-proxy container is non-trivial and the localhost:1 trick +/// covers the same surface deterministically. +/// +/// +[Collection(AbLegacyServerCollection.Name)] +[Trait("Category", "Integration")] +[Trait("Simulator", "ab_server-PCCC")] +public sealed class AbLegacyAutoDemoteTests(AbLegacyServerFixture sim) +{ + [AbLegacyFact] + public async Task Two_devices_one_unreachable_does_not_starve_healthy_reads() + { + if (sim.SkipReason is not null) Assert.Skip(sim.SkipReason); + + var healthy = $"ab://{sim.Host}:{sim.Port}/{sim.CipPath}"; + // 127.0.0.1:1 is the bogus-port standard — typical Linux/Windows TCP + // stacks RST immediately. The driver still reports it as a comm + // failure (libplctag wraps the failure as a transient throw). 
+ var faulty = "ab://127.0.0.1:1/1,0"; + + await using var drv = new AbLegacyDriver(new AbLegacyDriverOptions + { + Devices = + [ + new AbLegacyDeviceOptions(healthy, AbLegacyPlcFamily.Slc500, + Timeout: TimeSpan.FromSeconds(5)), + new AbLegacyDeviceOptions(faulty, AbLegacyPlcFamily.Slc500, + // Snappy timeout so the test budget stays short. + Timeout: TimeSpan.FromMilliseconds(500), + Demote: new AbLegacyDemoteOptions( + FailureThreshold: 3, + DemoteFor: TimeSpan.FromSeconds(30))), + ], + Tags = + [ + new AbLegacyTagDefinition("Healthy", healthy, "N7:0", AbLegacyDataType.Int), + new AbLegacyTagDefinition("Faulty", faulty, "N7:0", AbLegacyDataType.Int), + ], + Probe = new AbLegacyProbeOptions { Enabled = false }, + }, driverInstanceId: "ablegacy-auto-demote-it"); + + await drv.InitializeAsync("{}", TestContext.Current.CancellationToken); + + // Trip the demote on the faulty device. + for (var i = 0; i < 3; i++) + { + await drv.ReadAsync(["Faulty"], TestContext.Current.CancellationToken); + } + + // Healthy host MUST keep returning Good even though the sibling is demoted. + var healthyResult = await drv.ReadAsync(["Healthy"], TestContext.Current.CancellationToken); + healthyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good); + + // Faulty host now short-circuits without waiting on libplctag's timeout. + var sw = System.Diagnostics.Stopwatch.StartNew(); + var faultyResult = await drv.ReadAsync(["Faulty"], TestContext.Current.CancellationToken); + sw.Stop(); + faultyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError); + // Short-circuit should be ~1 ms; pad generously for CI noise. The pre-PR-12 + // path would have waited the full 500 ms timeout. + sw.ElapsedMilliseconds.ShouldBeLessThan(200); + + // Counter access via the public diagnostic short-circuit path — the + // internal Snapshot() seam isn't visible from this assembly. 
+ var demoteCountRef = $"_Diagnostics/{faulty}/DemoteCount"; + var lastDemotedRef = $"_Diagnostics/{faulty}/LastDemotedUtc"; + var diag = await drv.ReadAsync( + [demoteCountRef, lastDemotedRef], TestContext.Current.CancellationToken); + ((long)diag[0].Value!).ShouldBeGreaterThan(0); + ((string)diag[1].Value!).Length.ShouldBeGreaterThan(0); + } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests/Docker/docker-compose.yml b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests/Docker/docker-compose.yml index 941e16e..88cb00d 100644 --- a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests/Docker/docker-compose.yml +++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests/Docker/docker-compose.yml @@ -72,3 +72,30 @@ services: "--tag=F8[120]", "--tag=B3[10]" ] + + # PR ablegacy-12 / #255 — faulty-PLC fixture for the auto-demote contract. + # FIXTURE-TIER FOLLOW-UP: implementing a refusing-proxy container that + # round-trips libplctag's CIP framing far enough to trigger comm failures + # (vs. just RST'ing the TCP handshake) is non-trivial — the integration + # test currently uses 127.0.0.1:1 (the bogus-port standard) which RST's + # immediately on most TCP stacks. That gets us deterministic comm-failure + # coverage without standing up a second container; if the localhost:1 + # trick stops working on a future test runner (e.g. a sandbox that + # blocks port 1) re-enable this stub: + # + # slc500-faulty: + # profiles: ["slc500-faulty"] + # image: otopcua-ab-server:libplctag-release + # build: + # context: ../../ZB.MOM.WW.OtOpcUa.Driver.AbCip.IntegrationTests/Docker + # dockerfile: Dockerfile + # container_name: otopcua-ab-server-slc500-faulty + # restart: "no" + # ports: + # - "44819:44819" + # # Hostile entrypoint: bind the port but exit immediately so subsequent + # # connection attempts get RST'd. 
Future iteration: a libplctag-aware + # # proxy that accepts the CIP open and then drops the wire halfway + # # through, exercising the read-timeout path rather than the + # # connection-refused path. + # entrypoint: ["sh", "-c", "exit 1"] diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/AbLegacyAutoDemoteTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/AbLegacyAutoDemoteTests.cs new file mode 100644 index 0000000..29e180e --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/AbLegacyAutoDemoteTests.cs @@ -0,0 +1,380 @@ +using System.Text.Json; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; +using ZB.MOM.WW.OtOpcUa.Driver.AbLegacy; +using ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.PlcFamilies; + +namespace ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests; + +/// +/// PR ablegacy-12 / #255 — auto-demote on consecutive comm failure. After +/// FailureThreshold consecutive read or probe failures the driver +/// marks the device Demoted for DemoteFor; subsequent reads +/// short-circuit with BadCommunicationError without invoking +/// libplctag, so one slow PLC sharing the driver thread can't starve faster +/// peers. Probe success clears the demote early; read success resets the +/// consecutive-failure tally without leaving the demote window. +/// +[Trait("Category", "Unit")] +public sealed class AbLegacyAutoDemoteTests +{ + private const string Host = "ab://10.0.0.5/1,0"; + private const string SecondHost = "ab://10.0.0.6/1,0"; + + /// + /// Disable the probe by default — every test wants deterministic + /// control over the failure tally without a background loop racing + /// against the read path. + /// + private static AbLegacyDriverOptions BaseOptions( + AbLegacyDemoteOptions? demote = null, + IReadOnlyList? devices = null, + IReadOnlyList? tags = null) => new() + { + Devices = devices ?? [new AbLegacyDeviceOptions(Host, AbLegacyPlcFamily.Slc500, Demote: demote)], + Tags = tags ?? 
[new AbLegacyTagDefinition("X", Host, "N7:0", AbLegacyDataType.Int)], + Probe = new AbLegacyProbeOptions { Enabled = false }, + }; + + private static (AbLegacyDriver drv, FakeAbLegacyTagFactory factory) NewDriver( + AbLegacyDemoteOptions? demote = null, + IReadOnlyList? devices = null, + IReadOnlyList? tags = null) + { + var factory = new FakeAbLegacyTagFactory(); + var drv = new AbLegacyDriver(BaseOptions(demote, devices, tags), "drv-demote", factory); + return (drv, factory); + } + + private static FakeAbLegacyTag SeedFailingTag(FakeAbLegacyTagFactory factory) + { + // Cause every read to throw — exception-driven failures count as + // BadCommunicationError per RecordError(commFailure:true). + factory.Customise = p => new FakeAbLegacyTag(p) + { + ThrowOnRead = true, + Exception = new TimeoutException("simulated comm failure"), + }; + // Always returns null — no tag instance is handed back. A caller that wants + // to flip the failure off later does so via factory.Tags["N7:0"]. + return null!; + } + + [Fact] + public async Task Three_consecutive_failures_demote_the_device() + { + var (drv, factory) = NewDriver(); + SeedFailingTag(factory); + await drv.InitializeAsync("{}", CancellationToken.None); + + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + + var state = drv.GetDeviceState(Host).ShouldNotBeNull(); + state.DemotedUntilUtc.ShouldNotBeNull(); + var snap = drv.DiagnosticTags.Snapshot(Host); + snap.DemoteCount.ShouldBe(1); + snap.LastDemotedUtc.ShouldNotBeNull(); + drv.GetHostStatuses().Single().State.ShouldBe(HostState.Demoted); + } + + [Fact] + public async Task Reads_while_demoted_short_circuit_without_invoking_libplctag() + { + var (drv, factory) = NewDriver( + new AbLegacyDemoteOptions(FailureThreshold: 3, DemoteFor: TimeSpan.FromMinutes(5))); + SeedFailingTag(factory); + await drv.InitializeAsync("{}", CancellationToken.None); + + // Trip the demotion.
+ await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + var readsBeforeDemote = factory.Tags["N7:0"].ReadCount; + + // Subsequent reads MUST NOT call into libplctag — the short-circuit + // returns BadCommunicationError before EnsureTagRuntimeAsync. + var result = await drv.ReadAsync(["X"], CancellationToken.None); + result[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError); + factory.Tags["N7:0"].ReadCount.ShouldBe(readsBeforeDemote); + + var result2 = await drv.ReadAsync(["X"], CancellationToken.None); + result2[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError); + factory.Tags["N7:0"].ReadCount.ShouldBe(readsBeforeDemote); + } + + [Fact] + public async Task After_DemoteFor_expires_next_read_dispatches_through() + { + // Tiny window so the cool-down expires within the test. + var (drv, factory) = NewDriver( + new AbLegacyDemoteOptions(FailureThreshold: 2, DemoteFor: TimeSpan.FromMilliseconds(50))); + SeedFailingTag(factory); + await drv.InitializeAsync("{}", CancellationToken.None); + + // Trip with two failures. + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + + var state = drv.GetDeviceState(Host).ShouldNotBeNull(); + state.DemotedUntilUtc.ShouldNotBeNull(); + var readsBeforeWait = factory.Tags["N7:0"].ReadCount; + + // Flip the fake to succeed and wait past the demote window. + factory.Tags["N7:0"].ThrowOnRead = false; + factory.Tags["N7:0"].Value = 42; + factory.Tags["N7:0"].Status = 0; + await Task.Delay(TimeSpan.FromMilliseconds(120)); + + var result = await drv.ReadAsync(["X"], CancellationToken.None); + result[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good); + result[0].Value.ShouldBe(42); + // The window expiry path dispatched through to libplctag. 
+ factory.Tags["N7:0"].ReadCount.ShouldBeGreaterThan(readsBeforeWait); + } + + [Fact] + public async Task Successful_read_resets_consecutive_failure_counter() + { + var (drv, factory) = NewDriver(); + // Initial state — every read fails. + SeedFailingTag(factory); + await drv.InitializeAsync("{}", CancellationToken.None); + + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + var state = drv.GetDeviceState(Host).ShouldNotBeNull(); + state.ConsecutiveFailures.ShouldBe(2); + + // One successful read — flip the existing fake. + factory.Tags["N7:0"].ThrowOnRead = false; + factory.Tags["N7:0"].Value = 99; + factory.Tags["N7:0"].Status = 0; + await drv.ReadAsync(["X"], CancellationToken.None); + + state.ConsecutiveFailures.ShouldBe(0); + state.DemotedUntilUtc.ShouldBeNull(); + } + + [Fact] + public async Task Failure_success_failure_does_not_demote_at_threshold_three() + { + var (drv, factory) = NewDriver( + new AbLegacyDemoteOptions(FailureThreshold: 3)); + SeedFailingTag(factory); + await drv.InitializeAsync("{}", CancellationToken.None); + + // 2 failures. + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + + // 1 success — counter resets. + factory.Tags["N7:0"].ThrowOnRead = false; + factory.Tags["N7:0"].Status = 0; + await drv.ReadAsync(["X"], CancellationToken.None); + + // 2 more failures — should still be below the threshold. 
+ factory.Tags["N7:0"].ThrowOnRead = true; + factory.Tags["N7:0"].Exception = new TimeoutException("flap"); + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + + var state = drv.GetDeviceState(Host).ShouldNotBeNull(); + state.DemotedUntilUtc.ShouldBeNull(); + drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(0); + } + + [Fact] + public async Task DemoteCount_and_LastDemotedUtc_surface_via_diagnostic_short_circuit() + { + var (drv, factory) = NewDriver(); + SeedFailingTag(factory); + await drv.InitializeAsync("{}", CancellationToken.None); + + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + await drv.ReadAsync(["X"], CancellationToken.None); + + // Read the synthetic _Diagnostics counters. + var demoteCountRef = $"{AbLegacyDiagnosticTags.DiagnosticsFolderPrefix}{Host}/DemoteCount"; + var lastDemotedRef = $"{AbLegacyDiagnosticTags.DiagnosticsFolderPrefix}{Host}/LastDemotedUtc"; + var counts = await drv.ReadAsync([demoteCountRef, lastDemotedRef], CancellationToken.None); + + counts[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good); + counts[0].Value.ShouldBe(1L); + counts[1].StatusCode.ShouldBe(AbLegacyStatusMapper.Good); + counts[1].Value.ShouldBeOfType(); + ((string)counts[1].Value!).Length.ShouldBeGreaterThan(0); // ISO-8601 stamp + } + + [Fact] + public async Task Demote_disabled_never_short_circuits_reads() + { + var (drv, factory) = NewDriver( + new AbLegacyDemoteOptions(FailureThreshold: 1, Enabled: false)); + SeedFailingTag(factory); + await drv.InitializeAsync("{}", CancellationToken.None); + + // 5 failures — would normally trip a single-fail threshold, but Enabled=false. 
+        for (var i = 0; i < 5; i++) await drv.ReadAsync(["X"], CancellationToken.None);
+
+        var state = drv.GetDeviceState(Host).ShouldNotBeNull();
+        state.DemotedUntilUtc.ShouldBeNull();
+        var snap = drv.DiagnosticTags.Snapshot(Host);
+        snap.DemoteCount.ShouldBe(0);
+        // Failures still get recorded as comm errors though — the diagnostic
+        // surface is honest about what happened, just no auto-throttle.
+        snap.CommFailures.ShouldBe(5);
+        // libplctag was invoked every time — that's the whole point of opting out.
+        factory.Tags["N7:0"].ReadCount.ShouldBe(5);
+    }
+
+    [Fact]
+    public async Task Reinit_preserves_DemoteCount_but_clears_active_demotion()
+    {
+        var (drv, factory) = NewDriver();
+        SeedFailingTag(factory);
+        await drv.InitializeAsync("{}", CancellationToken.None);
+        await drv.ReadAsync(["X"], CancellationToken.None);
+        await drv.ReadAsync(["X"], CancellationToken.None);
+        await drv.ReadAsync(["X"], CancellationToken.None);
+
+        drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(1);
+        drv.GetDeviceState(Host)!.DemotedUntilUtc.ShouldNotBeNull();
+
+        await drv.ReinitializeAsync("{}", CancellationToken.None);
+
+        // Active demotion cleared (the device is freshly tracked); cumulative count survives.
+        drv.GetDeviceState(Host)!.DemotedUntilUtc.ShouldBeNull();
+        drv.GetDeviceState(Host)!.ConsecutiveFailures.ShouldBe(0);
+        drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(1);
+    }
+
+    [Fact]
+    public async Task Disposing_driver_after_demotion_does_not_throw()
+    {
+        var (drv, factory) = NewDriver();
+        SeedFailingTag(factory);
+        await drv.InitializeAsync("{}", CancellationToken.None);
+        await drv.ReadAsync(["X"], CancellationToken.None);
+        await drv.ReadAsync(["X"], CancellationToken.None);
+        await drv.ReadAsync(["X"], CancellationToken.None);
+
+        await drv.DisposeAsync();
+    }
+
+    [Fact]
+    public async Task Demote_options_dto_round_trips_through_factory_extensions()
+    {
+        const string json = """
+            {
+              "Devices": [
+                {
+                  "HostAddress": "ab://10.0.0.5/1,0",
+                  "PlcFamily": "Slc500",
+                  "Demote": {
+                    "FailureThreshold": 5,
+                    "DemoteForMs": 60000,
+                    "Enabled": true
+                  }
+                }
+              ],
+              "Probe": { "Enabled": false },
+              "Tags": [
+                { "Name": "X", "DeviceHostAddress": "ab://10.0.0.5/1,0", "Address": "N7:0", "DataType": "Int" }
+              ]
+            }
+            """;
+
+        var drv = AbLegacyDriverFactoryExtensions.CreateInstance("drv-demote-roundtrip", json);
+        await drv.InitializeAsync(json, CancellationToken.None);
+
+        var state = drv.GetDeviceState(Host).ShouldNotBeNull();
+        state.Options.Demote.ShouldNotBeNull();
+        state.Options.Demote!.FailureThreshold.ShouldBe(5);
+        state.Options.Demote.EffectiveDemoteFor.ShouldBe(TimeSpan.FromMinutes(1));
+        state.Options.Demote.Enabled.ShouldBeTrue();
+
+        await drv.ShutdownAsync(CancellationToken.None);
+    }
+
+    [Fact]
+    public async Task Two_devices_one_faulty_does_not_starve_the_healthy_one()
+    {
+        // Mixed factory — one host's tag throws, the other's reads cleanly.
+        var factory = new FakeAbLegacyTagFactory();
+        factory.Customise = p =>
+        {
+            // Identify by the Gateway portion of the create params.
+            var fail = p.Gateway == "10.0.0.6";
+            return new FakeAbLegacyTag(p)
+            {
+                ThrowOnRead = fail,
+                Exception = fail ? new TimeoutException("faulty") : null,
+                Value = 42,
+                Status = 0,
+            };
+        };
+        var drv = new AbLegacyDriver(new AbLegacyDriverOptions
+        {
+            Devices =
+            [
+                new AbLegacyDeviceOptions(Host, AbLegacyPlcFamily.Slc500),
+                new AbLegacyDeviceOptions(SecondHost, AbLegacyPlcFamily.Slc500),
+            ],
+            Tags =
+            [
+                new AbLegacyTagDefinition("Healthy", Host, "N7:0", AbLegacyDataType.Int),
+                new AbLegacyTagDefinition("Faulty", SecondHost, "N7:0", AbLegacyDataType.Int),
+            ],
+            Probe = new AbLegacyProbeOptions { Enabled = false },
+        }, "drv-mix", factory);
+        await drv.InitializeAsync("{}", CancellationToken.None);
+
+        // Trip the faulty side.
+        for (var i = 0; i < 3; i++)
+            await drv.ReadAsync(["Faulty"], CancellationToken.None);
+
+        // Healthy host MUST keep returning Good even though the sibling is demoted.
+        var healthyResult = await drv.ReadAsync(["Healthy"], CancellationToken.None);
+        healthyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good);
+        healthyResult[0].Value.ShouldBe(42);
+
+        // Reads against the faulty host short-circuit.
+        var faultyResult = await drv.ReadAsync(["Faulty"], CancellationToken.None);
+        faultyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError);
+
+        drv.GetDeviceState(Host)!.DemotedUntilUtc.ShouldBeNull();
+        drv.GetDeviceState(SecondHost)!.DemotedUntilUtc.ShouldNotBeNull();
+    }
+
+    [Fact]
+    public async Task BadNodeIdUnknown_does_not_count_toward_demote_tally()
+    {
+        // -14 maps to BadNodeIdUnknown — terminal, not a comm failure.
+        var (drv, factory) = NewDriver();
+        factory.Customise = p => new FakeAbLegacyTag(p) { Status = -14 };
+        await drv.InitializeAsync("{}", CancellationToken.None);
+
+        for (var i = 0; i < 5; i++)
+            await drv.ReadAsync(["X"], CancellationToken.None);
+
+        var state = drv.GetDeviceState(Host).ShouldNotBeNull();
+        // Five terminal failures shouldn't trip the demote threshold — they're
+        // a config / decoder mismatch, not a sign of a flapping link.
+        state.DemotedUntilUtc.ShouldBeNull();
+        drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(0);
+    }
+
+    [Fact]
+    public void HostState_enum_has_Demoted_value()
+    {
+        // Belt-and-braces: the abstraction surface must carry the new value
+        // for downstream consumers (HostStatusPublisher, Admin UI, …) to
+        // see and route it.
+        Enum.IsDefined(typeof(HostState), HostState.Demoted).ShouldBeTrue();
+        ((int)HostState.Demoted).ShouldBeGreaterThan((int)HostState.Faulted);
+    }
+}
diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/AbLegacyDiagnosticsTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/AbLegacyDiagnosticsTests.cs
index 3bae820..889d09b 100644
--- a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/AbLegacyDiagnosticsTests.cs
+++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/AbLegacyDiagnosticsTests.cs
@@ -173,7 +173,9 @@ public sealed class AbLegacyDiagnosticsTests
         var diagVars = builder.Variables
             .Where(v => v.Info.FullName.StartsWith(AbLegacyDiagnosticTags.DiagnosticsFolderPrefix))
             .ToList();
-        diagVars.Count.ShouldBe(14); // 7 names × 2 devices
+        // PR ablegacy-12 / #255 — DemoteCount + LastDemotedUtc bring the canonical
+        // count to 9 names per device (was 7 in PR ablegacy-10).
+        diagVars.Count.ShouldBe(AbLegacyDiagnosticTags.DiagnosticTagNames.Count * 2);
         diagVars.ShouldAllBe(v => v.Info.SecurityClass == SecurityClassification.ViewOnly);
     }
 
diff --git a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/FakeAbLegacyTag.cs b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/FakeAbLegacyTag.cs
index 82456f5..0b81e10 100644
--- a/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/FakeAbLegacyTag.cs
+++ b/tests/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests/FakeAbLegacyTag.cs
@@ -84,7 +84,14 @@ internal class FakeAbLegacyTag : IAbLegacyTagRuntime
 
 internal sealed class FakeAbLegacyTagFactory : IAbLegacyTagFactory
 {
-    public Dictionary<string, FakeAbLegacyTag> Tags { get; } = new(StringComparer.OrdinalIgnoreCase);
+    // PR ablegacy-12 / #255 — switched from plain Dictionary to ConcurrentDictionary so
+    // the read path (test thread) and the probe loop (background Task) can both call
+    // Create without corrupting the dict. Pre-PR-12 the race existed but only tipped
+    // a few percent of test runs into KeyNotFoundException; PR-12's added
+    // Interlocked.Exchange writes shifted timing enough to make it deterministic-flaky
+    // (~60%).
+    public System.Collections.Concurrent.ConcurrentDictionary<string, FakeAbLegacyTag> Tags { get; } =
+        new(StringComparer.OrdinalIgnoreCase);
     public Func<AbLegacyTagCreateParams, FakeAbLegacyTag>? Customise { get; set; }
 
     public IAbLegacyTagRuntime Create(AbLegacyTagCreateParams p)