[ablegacy] AbLegacy — Auto-demote on comm failure #399
@@ -21,6 +21,9 @@ dotnet run --project src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Cli -- --help
|
|||||||
| `-P` / `--plc-type` | `Slc500` | Slc500 / MicroLogix / Plc5 / LogixPccc |
|
| `-P` / `--plc-type` | `Slc500` | Slc500 / MicroLogix / Plc5 / LogixPccc |
|
||||||
| `--timeout-ms` | `5000` | Per-operation timeout — see precedence note below |
|
| `--timeout-ms` | `5000` | Per-operation timeout — see precedence note below |
|
||||||
| `--retries` | `0` | Retry count on transient `BadCommunicationError` (PR 9 / #252) |
|
| `--retries` | `0` | Retry count on transient `BadCommunicationError` (PR 9 / #252) |
|
||||||
|
| `--demote-failure-threshold` | `3` | **PR ablegacy-12 / #255** — consecutive comm failures before the device is auto-demoted |
|
||||||
|
| `--demote-for-ms` | `30000` | **PR ablegacy-12 / #255** — auto-demote cool-down window in ms |
|
||||||
|
| `--no-demote` | off | **PR ablegacy-12 / #255** — disable auto-demote entirely (the consecutive-failure tally still ticks, but the device is never demoted) |
|
||||||
| `--verbose` | off | Serilog debug output |
|
| `--verbose` | off | Serilog debug output |
|
||||||
|
|
||||||
Family ↔ CIP-path cheat sheet:
|
Family ↔ CIP-path cheat sheet:
|
||||||
@@ -84,6 +87,37 @@ otopcua-ablegacy-cli probe -g ab://192.168.1.20/1,0
|
|||||||
otopcua-ablegacy-cli probe -g ab://192.168.1.30/ -P MicroLogix -a S:0
|
otopcua-ablegacy-cli probe -g ab://192.168.1.30/ -P MicroLogix -a S:0
|
||||||
```
|
```
|
||||||
|
|
||||||
|
`probe` output (PR ablegacy-12 / #255) reports both `Health` (driver health
|
||||||
|
state) and `Host state`. The latter is sourced from `IHostConnectivityProbe`
|
||||||
|
and surfaces `Demoted` when the auto-demote threshold has tripped — a fast
|
||||||
|
visual signal that the CLI is short-circuiting future reads against this
|
||||||
|
device until the cool-down expires:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Gateway: ab://192.168.1.20/1,0
|
||||||
|
PLC type: Slc500
|
||||||
|
Health: Degraded
|
||||||
|
Host state: Demoted
|
||||||
|
Last error: libplctag status -33 reading N7:0
|
||||||
|
```
|
||||||
|
|
||||||
|
### Auto-demote knobs
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
# Trip after just one comm failure, hold for 60s.
|
||||||
|
otopcua-ablegacy-cli read -g ab://192.168.1.20/1,0 -a N7:0 -t Int `
|
||||||
|
--demote-failure-threshold 1 --demote-for-ms 60000
|
||||||
|
|
||||||
|
# Opt out of auto-demote — stresses the link without short-circuiting.
|
||||||
|
otopcua-ablegacy-cli read -g ab://192.168.1.20/1,0 -a N7:0 -t Int --no-demote
|
||||||
|
```
|
||||||
|
|
||||||
|
The CLI is a one-shot test client — auto-demote primarily matters in the
|
||||||
|
server-side multi-device deployment, where a single demoted PLC can no
|
||||||
|
longer block reads against its healthy peers. Use the CLI flags to
|
||||||
|
reproduce a flapping-link scenario locally before tuning the server-side
|
||||||
|
`appsettings.json` `Demote` block.
|
||||||
|
|
||||||
### `read`
|
### `read`
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
|
|||||||
@@ -7,10 +7,12 @@ directly without going through a separate diagnostics RPC. Mirrors the AB CIP
|
|||||||
|
|
||||||
Closes #253 (PR ablegacy-10).
|
Closes #253 (PR ablegacy-10).
|
||||||
|
|
||||||
## The seven counters
|
## The nine counters
|
||||||
|
|
||||||
Each device managed by the `AbLegacyDriver` exposes seven read-only nodes under
|
Each device managed by the `AbLegacyDriver` exposes nine read-only nodes under
|
||||||
`AbLegacy/<host>/_Diagnostics/<name>`:
|
`AbLegacy/<host>/_Diagnostics/<name>`. The first seven shipped in PR ablegacy-10;
|
||||||
|
`DemoteCount` + `LastDemotedUtc` arrived with PR ablegacy-12 / #255 (auto-demote
|
||||||
|
on comm failure).
|
||||||
|
|
||||||
| Name | Type | Semantics |
|
| Name | Type | Semantics |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
@@ -21,6 +23,8 @@ Each device managed by the `AbLegacyDriver` exposes seven read-only nodes under
|
|||||||
| `LastErrorCode` | Int32 | Most recent libplctag status code on a failed read; `0` when no error has been seen since the last reset. |
|
| `LastErrorCode` | Int32 | Most recent libplctag status code on a failed read; `0` when no error has been seen since the last reset. |
|
||||||
| `LastErrorMessage` | String | Most recent libplctag error message on a failed read; empty when no error has been seen since the last reset. |
|
| `LastErrorMessage` | String | Most recent libplctag error message on a failed read; empty when no error has been seen since the last reset. |
|
||||||
| `CommFailures` | Int64 | Count of read failures mapped to `BadCommunicationError`. Spans transient libplctag throws + retried-out chains so operators see a single "wire fell off" counter. |
|
| `CommFailures` | Int64 | Count of read failures mapped to `BadCommunicationError`. Spans transient libplctag throws + retried-out chains so operators see a single "wire fell off" counter. |
|
||||||
|
| `DemoteCount` | Int64 | **PR ablegacy-12** — cumulative auto-demote events for this device. Bumps every time the driver crosses the consecutive-failure threshold and arms a fresh cool-down window. Cumulative across `ReinitializeAsync` (preserved through redeploys) so a flapping link surfaces as a steadily climbing counter. |
|
||||||
|
| `LastDemotedUtc` | String | **PR ablegacy-12** — ISO-8601 UTC timestamp of the most recent auto-demotion. Empty string when this device has never been demoted. |
|
||||||
|
|
||||||
**Address shape**: `_Diagnostics/<deviceHostAddress>/<name>` —
|
**Address shape**: `_Diagnostics/<deviceHostAddress>/<name>` —
|
||||||
e.g. `_Diagnostics/ab://10.0.0.5/1,0/RequestCount`.
|
e.g. `_Diagnostics/ab://10.0.0.5/1,0/RequestCount`.
|
||||||
@@ -34,10 +38,11 @@ user-config tag node, just under a reserved sibling folder.
|
|||||||
|
|
||||||
| Trigger | Effect |
|
| Trigger | Effect |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `ReinitializeAsync` | Every counter for every device resets to zero, plus `LastErrorMessage` clears to empty. |
|
| `ReinitializeAsync` | Every counter for every device resets to zero, plus `LastErrorMessage` clears to empty. **PR ablegacy-12 exception:** `DemoteCount` + `LastDemotedUtc` survive the reinit so an operator redeploying mid-incident doesn't lose the flapping-link history. |
|
||||||
| `ShutdownAsync` | Same as Reinitialize — counters drop with the device map. |
|
| `ShutdownAsync` | All counters drop with the device map (including `DemoteCount`). |
|
||||||
| Driver process restart | Counters start at zero. |
|
| Driver process restart | Counters start at zero. |
|
||||||
| Probe transition Stopped→Running | **No automatic reset** — counters are cumulative across reconnect events so operators can spot intermittent links by watching `CommFailures` keep climbing. |
|
| Probe transition Stopped→Running | **No automatic reset** — counters are cumulative across reconnect events so operators can spot intermittent links by watching `CommFailures` keep climbing. |
|
||||||
|
| Probe transition Demoted→Running | **PR ablegacy-12** — early-clear of the active demote window, but the cumulative `DemoteCount` stays put. |
|
||||||
|
|
||||||
There is no in-process "reset" RPC at the time of writing. If you need to
|
There is no in-process "reset" RPC at the time of writing. If you need to
|
||||||
clear counters without a redeploy, kick a `ReinitializeAsync` from the Admin
|
clear counters without a redeploy, kick a `ReinitializeAsync` from the Admin
|
||||||
@@ -99,14 +104,85 @@ overview dashboard, plus a faster rate (1 s) on `LastErrorMessage` /
|
|||||||
short-circuit makes every read O(1) — there's no penalty for fast polling
|
short-circuit makes every read O(1) — there's no penalty for fast polling
|
||||||
of the counter itself, only the OPC UA subscription bookkeeping.
|
of the counter itself, only the OPC UA subscription bookkeeping.
|
||||||
|
|
||||||
|
## Auto-demote on comm failure (PR ablegacy-12 / #255)
|
||||||
|
|
||||||
|
When a device fails N consecutive reads or probes the driver marks it
|
||||||
|
**Demoted** for a configurable cool-down window. Reads against a demoted
|
||||||
|
device short-circuit with `BadCommunicationError` *without invoking
|
||||||
|
libplctag* — that's the whole point of the feature: one slow PLC sharing
|
||||||
|
the driver thread can't starve faster peers reading from healthy hosts on
|
||||||
|
the same `AbLegacyDriver` instance.
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
Per-device, optional. `null` keeps the documented defaults (auto-demote
|
||||||
|
**enabled** with 3 failures / 30 s).
|
||||||
|
|
||||||
|
```jsonc
|
||||||
|
{
|
||||||
|
"Devices": [
|
||||||
|
{
|
||||||
|
"HostAddress": "ab://10.0.0.5/1,0",
|
||||||
|
"PlcFamily": "Slc500",
|
||||||
|
"Demote": {
|
||||||
|
"FailureThreshold": 3, // default 3
|
||||||
|
"DemoteForMs": 30000, // default 30s
|
||||||
|
"Enabled": true // default true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
| Knob | Default | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `FailureThreshold` | `3` | Consecutive comm failures before the device is demoted. A successful read or probe resets the tally. Terminal failures (`BadNodeIdUnknown`, `BadTypeMismatch`, …) **do not count** — they're config / decoder mismatches, not field outages. |
|
||||||
|
| `DemoteForMs` | `30000` (30s) | Cool-down window. Reads while this is active short-circuit; a successful probe clears it early. |
|
||||||
|
| `Enabled` | `true` | Set to `false` to keep the diagnostic counters but skip the auto-throttle. The failure tally still ticks but never arms the cool-down. |
|
||||||
|
|
||||||
|
### Recovery
|
||||||
|
|
||||||
|
Three ways out of Demoted, in order of likelihood:
|
||||||
|
|
||||||
|
1. **Probe success** — the per-device probe loop (`Probe.Enabled = true`,
|
||||||
|
default address `S:0`) is the fast path. The next probe iteration after
|
||||||
|
demotion will exercise the wire; on success it clears
|
||||||
|
`DemotedUntilUtc` immediately and transitions the host to `Running`.
|
||||||
|
2. **Window expiry** — once `DemoteForMs` elapses the demote marker
|
||||||
|
clears on the next read attempt. The read goes through; if it fails,
|
||||||
|
the failure tally keeps counting from where it left off (so a
|
||||||
|
permanently-down device re-arms the window after one more consecutive
|
||||||
|
failure rather than having to repeat the full threshold).
|
||||||
|
3. **`ReinitializeAsync`** — clears `ConsecutiveFailures` +
|
||||||
|
`DemotedUntilUtc` outright. Cumulative `DemoteCount` survives.
|
||||||
|
|
||||||
|
### Observability
|
||||||
|
|
||||||
|
`DemoteCount` is the headline counter — it bumps once per demotion event,
|
||||||
|
not per short-circuited read. A device that flaps every hour for a week
|
||||||
|
shows `DemoteCount` ≈ 168 by the end of that week, which is the operator
|
||||||
|
signal you actually want.
|
||||||
|
|
||||||
|
`LastDemotedUtc` is the ISO-8601 UTC timestamp of the most recent
|
||||||
|
demotion. Bind it on a per-device tile alongside `DemoteCount` for
|
||||||
|
"flapping link" alerting.
|
||||||
|
|
||||||
|
### Host-state surface
|
||||||
|
|
||||||
|
A demoted device reports `HostState.Demoted` (new in PR ablegacy-12
|
||||||
|
on `Core.Abstractions/IHostConnectivityProbe.cs`). Consumers that
|
||||||
|
predate the new value (the central `HostStatusPublisher`) safely treat
|
||||||
|
it as `Stopped` — no schema migration needed.
|
||||||
|
|
||||||
## Cross-references
|
## Cross-references
|
||||||
|
|
||||||
- [`AbLegacyDiagnosticTags.cs`](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDiagnosticTags.cs)
|
- [`AbLegacyDiagnosticTags.cs`](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDiagnosticTags.cs)
|
||||||
— counter store + read short-circuit
|
— counter store + read short-circuit
|
||||||
- [`AbLegacyDriver.cs`](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriver.cs)
|
- [`AbLegacyDriver.cs`](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbLegacy/AbLegacyDriver.cs)
|
||||||
— increment sites in `ReadAsync`, discovery emission in `DiscoverAsync`
|
— increment sites in `ReadAsync`, discovery emission in `DiscoverAsync`,
|
||||||
|
auto-demote bookkeeping in `RecordFailureAndMaybeDemote` + `ProbeLoopAsync`
|
||||||
- [`AbLegacy-Test-Fixture.md`](AbLegacy-Test-Fixture.md) — `AbLegacyDiagnosticsTests`
|
- [`AbLegacy-Test-Fixture.md`](AbLegacy-Test-Fixture.md) — `AbLegacyDiagnosticsTests`
|
||||||
+ collision-rejection contract
|
+ `AbLegacyAutoDemoteTests` + collision-rejection contract
|
||||||
- [AB CIP `_System/` parallel](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbCip/AbCipSystemTagSource.cs)
|
- [AB CIP `_System/` parallel](../../src/ZB.MOM.WW.OtOpcUa.Driver.AbCip/AbCipSystemTagSource.cs)
|
||||||
— same pattern with the CIP-specific six entries (incl. writeable
|
— same pattern with the CIP-specific six entries (incl. writeable
|
||||||
`_RefreshTagDb` trigger)
|
`_RefreshTagDb` trigger)
|
||||||
|
|||||||
@@ -53,12 +53,31 @@ supplies a `FakeAbLegacyTag`.
|
|||||||
counters: 5 reads (3 ok / 2 fail) → `RequestCount=5`, `ResponseCount=3`,
|
counters: 5 reads (3 ok / 2 fail) → `RequestCount=5`, `ResponseCount=3`,
|
||||||
`ErrorCount=2`; `LastErrorCode` reflects the most recent libplctag status;
|
`ErrorCount=2`; `LastErrorCode` reflects the most recent libplctag status;
|
||||||
`RetryCount` increments per retry attempt beyond the first; counters reset
|
`RetryCount` increments per retry attempt beyond the first; counters reset
|
||||||
on `ReinitializeAsync`; discovery emits exactly 7 diagnostic variables per
|
on `ReinitializeAsync`; discovery emits the canonical diagnostic variables
|
||||||
device under `_Diagnostics/`; collision rejection at `InitializeAsync` for
|
per device under `_Diagnostics/` (now 9 with PR ablegacy-12); collision
|
||||||
user tags shadowing reserved names or `_Diagnostics/` addresses; the
|
rejection at `InitializeAsync` for user tags shadowing reserved names or
|
||||||
`_Diagnostics/<host>/<name>` short-circuit returns the live snapshot through
|
`_Diagnostics/` addresses; the `_Diagnostics/<host>/<name>` short-circuit
|
||||||
`ReadAsync` without bumping `RequestCount`; two devices keep counters
|
returns the live snapshot through `ReadAsync` without bumping
|
||||||
independent.
|
`RequestCount`; two devices keep counters independent.
|
||||||
|
- `AbLegacyAutoDemoteTests` — **PR ablegacy-12 / #255** auto-demote on comm
|
||||||
|
failure: 3 consecutive failures arm the demote window and surface
|
||||||
|
`HostState.Demoted`; subsequent reads short-circuit with
|
||||||
|
`BadCommunicationError` *without invoking libplctag* (verified via
|
||||||
|
`factory.Tags["N7:0"].ReadCount` not advancing); successful read resets
|
||||||
|
the consecutive-failure counter; failure-success-failure pattern doesn't
|
||||||
|
cross the threshold; `DemoteCount` + `LastDemotedUtc` surface via
|
||||||
|
`_Diagnostics/`; `Enabled=false` opts out (failures still count, demotion
|
||||||
|
never fires); `ReinitializeAsync` clears the active window but preserves
|
||||||
|
cumulative `DemoteCount`; cool-down expiry allows the next read through;
|
||||||
|
two devices in one driver — one faulty, one healthy — proves the faulty
|
||||||
|
side's demotion doesn't starve the healthy side; `BadNodeIdUnknown`
|
||||||
|
(terminal) does not count toward the comm-failure tally; DTO JSON
|
||||||
|
round-trip preserves `FailureThreshold` / `DemoteForMs` / `Enabled` at
|
||||||
|
the per-device level; `HostState.Demoted` enum value is wired through
|
||||||
|
`Core.Abstractions`. Companion integration test in
|
||||||
|
`tests/.../IntegrationTests/AbLegacyAutoDemoteTests.cs` runs the
|
||||||
|
two-device-one-unreachable scenario against a live ab_server fixture
|
||||||
|
using `127.0.0.1:1` as the unreachable peer.
|
||||||
- `RsLogixSymbolImportTests` — ablegacy-11 / #254 RSLogix CSV symbol-import parser:
|
- `RsLogixSymbolImportTests` — ablegacy-11 / #254 RSLogix CSV symbol-import parser:
|
||||||
canonical 8-row CSV (one row per N/F/B/L/ST/T/C/R) → 8 typed
|
canonical 8-row CSV (one row per N/F/B/L/ST/T/C/R) → 8 typed
|
||||||
`AbLegacyTagDefinition`s with the right `DataType`; header + comment-line
|
`AbLegacyTagDefinition`s with the right `DataType`; header + comment-line
|
||||||
|
|||||||
@@ -39,6 +39,21 @@
|
|||||||
client may have bumped it by more, so the comparison is `>=`). NodeId form:
|
client may have bumped it by more, so the comparison is `>=`). NodeId form:
|
||||||
ns=<n>;s=AbLegacy/<gateway>/_Diagnostics/RequestCount. Mirrors the
|
ns=<n>;s=AbLegacy/<gateway>/_Diagnostics/RequestCount. Mirrors the
|
||||||
-SystemConnectionStatusNodeId knob on test-abcip.ps1.
|
-SystemConnectionStatusNodeId knob on test-abcip.ps1.
|
||||||
|
|
||||||
|
.PARAMETER DiagnosticsDemoteCountNodeId
|
||||||
|
Optional NodeId for the synthetic _Diagnostics/<host>/DemoteCount variable
|
||||||
|
emitted by AB Legacy discovery (PR ablegacy-12 / #255). When supplied, the
|
||||||
|
script runs the auto-demote assertion: kills the simulator container so
|
||||||
|
reads start failing, hammers the user-tag BridgeNodeId at least
|
||||||
|
FailureThreshold times to trip the demotion, then reads the diagnostic
|
||||||
|
counter and asserts the value increased by >= 1. NodeId form:
|
||||||
|
ns=<n>;s=AbLegacy/<gateway>/_Diagnostics/DemoteCount. The simulator
|
||||||
|
must support `docker stop otopcua-ab-server-slc500` for the kill stage.
|
||||||
|
|
||||||
|
.PARAMETER FailureThresholdForDemote
|
||||||
|
Failure threshold the server is configured with (default 3). The
|
||||||
|
demote assertion reads N+2 times against the killed simulator
|
||||||
|
to guarantee the threshold trips even if some reads beat the kill.
|
||||||
#>
|
#>
|
||||||
|
|
||||||
param(
|
param(
|
||||||
@@ -47,7 +62,9 @@ param(
|
|||||||
[string]$Address = "N7:5",
|
[string]$Address = "N7:5",
|
||||||
[string]$OpcUaUrl = "opc.tcp://localhost:4840",
|
[string]$OpcUaUrl = "opc.tcp://localhost:4840",
|
||||||
[Parameter(Mandatory)] [string]$BridgeNodeId,
|
[Parameter(Mandatory)] [string]$BridgeNodeId,
|
||||||
[string]$DiagnosticsRequestCountNodeId
|
[string]$DiagnosticsRequestCountNodeId,
|
||||||
|
[string]$DiagnosticsDemoteCountNodeId,
|
||||||
|
[int]$FailureThresholdForDemote = 3
|
||||||
)
|
)
|
||||||
|
|
||||||
$ErrorActionPreference = "Stop"
|
$ErrorActionPreference = "Stop"
|
||||||
@@ -245,5 +262,67 @@ finally {
|
|||||||
Remove-Item -Path $importJsonPath -ErrorAction SilentlyContinue
|
Remove-Item -Path $importJsonPath -ErrorAction SilentlyContinue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# PR ablegacy-12 / #255 — auto-demote round-trip. Kill the simulator container,
|
||||||
|
# hammer the bridge NodeId past the failure threshold, then assert the
|
||||||
|
# DemoteCount diagnostic incremented. Restart the simulator at the end so the
|
||||||
|
# next run gets a clean baseline. Gated on -DiagnosticsDemoteCountNodeId so
|
||||||
|
# environments without docker-side control of the simulator can opt out.
|
||||||
|
if ($DiagnosticsDemoteCountNodeId) {
|
||||||
|
Write-Header "AutoDemote (kill simulator + observe DemoteCount from $DiagnosticsDemoteCountNodeId)"
|
||||||
|
$baselineDemoteOut = & $opcUaCli.File @($opcUaCli.PrefixArgs) `
|
||||||
|
@("read", "-u", $OpcUaUrl, "-n", $DiagnosticsDemoteCountNodeId) 2>&1
|
||||||
|
$baselineDemote = 0
|
||||||
|
if (($baselineDemoteOut -join "`n") -match '(\d+)') { $baselineDemote = [int64]$Matches[1] }
|
||||||
|
|
||||||
|
# Best-effort container kill — prefer the slc500 profile name; fall back to
|
||||||
|
# micrologix / plc5 in case the operator pointed the e2e at a different family.
|
||||||
|
$simContainers = @("otopcua-ab-server-slc500", "otopcua-ab-server-micrologix", "otopcua-ab-server-plc5")
|
||||||
|
$killed = $false
|
||||||
|
foreach ($c in $simContainers) {
|
||||||
|
$stop = docker stop $c 2>$null
|
||||||
|
if ($LASTEXITCODE -eq 0 -and $stop) {
|
||||||
|
Write-Host "Stopped $c"
|
||||||
|
$killed = $true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (-not $killed) {
|
||||||
|
Write-Fail "AutoDemote: no ab_server container found via 'docker stop' — skipping demote assertion"
|
||||||
|
$results += @{ Passed = $false; Reason = "no simulator container to kill" }
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
# Hammer past the threshold. Each read against a now-unreachable simulator
|
||||||
|
# surfaces BadCommunicationError; FailureThreshold consecutive ones trip
|
||||||
|
# the demotion. We add 2 extra to absorb timing slack (one read may be
|
||||||
|
# in-flight when the kill lands).
|
||||||
|
$hammerCount = $FailureThresholdForDemote + 2
|
||||||
|
for ($i = 0; $i -lt $hammerCount; $i++) {
|
||||||
|
& $opcUaCli.File @($opcUaCli.PrefixArgs) `
|
||||||
|
@("read", "-u", $OpcUaUrl, "-n", $BridgeNodeId) 2>&1 | Out-Null
|
||||||
|
}
|
||||||
|
|
||||||
|
Start-Sleep -Seconds 1
|
||||||
|
|
||||||
|
$afterDemoteOut = & $opcUaCli.File @($opcUaCli.PrefixArgs) `
|
||||||
|
@("read", "-u", $OpcUaUrl, "-n", $DiagnosticsDemoteCountNodeId) 2>&1
|
||||||
|
$afterDemote = 0
|
||||||
|
if (($afterDemoteOut -join "`n") -match '(\d+)') { $afterDemote = [int64]$Matches[1] }
|
||||||
|
|
||||||
|
$deltaDemote = $afterDemote - $baselineDemote
|
||||||
|
if ($deltaDemote -ge 1) {
|
||||||
|
Write-Pass "AutoDemote DemoteCount delta $deltaDemote >= 1 after $hammerCount failed reads"
|
||||||
|
$results += @{ Passed = $true }
|
||||||
|
} else {
|
||||||
|
Write-Fail "AutoDemote DemoteCount delta $deltaDemote < 1 (baseline=$baselineDemote after=$afterDemote)"
|
||||||
|
$results += @{ Passed = $false; Reason = "demote delta $deltaDemote" }
|
||||||
|
}
|
||||||
|
|
||||||
|
# Restart the simulator so subsequent test runs have a clean baseline.
|
||||||
|
# Best-effort — if `docker start` fails the operator can
|
||||||
|
# bring it back manually via the Docker/docker-compose.yml profile.
|
||||||
|
try { docker start (docker ps -aq -f "name=otopcua-ab-server-") | Out-Null } catch { }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Write-Summary -Title "AB Legacy e2e" -Results $results
|
Write-Summary -Title "AB Legacy e2e" -Results $results
|
||||||
if ($results | Where-Object { -not $_.Passed }) { exit 1 }
|
if ($results | Where-Object { -not $_.Passed }) { exit 1 }
|
||||||
|
|||||||
@@ -96,7 +96,12 @@ VALUES (@Gen, @DrvId, @ClusterId, @NsId, 'ablegacy-smoke', 'AbLegacy', N'{
|
|||||||
"PlcFamily": "Slc500",
|
"PlcFamily": "Slc500",
|
||||||
"DeviceName": "slc-500",
|
"DeviceName": "slc-500",
|
||||||
"TimeoutMs": 500,
|
"TimeoutMs": 500,
|
||||||
"Retries": 1
|
"Retries": 1,
|
||||||
|
"Demote": {
|
||||||
|
"FailureThreshold": 3,
|
||||||
|
"DemoteForMs": 30000,
|
||||||
|
"Enabled": true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"Probe": { "Enabled": true, "IntervalMs": 5000, "TimeoutMs": 2000, "ProbeAddress": "S:0" },
|
"Probe": { "Enabled": true, "IntervalMs": 5000, "TimeoutMs": 2000, "ProbeAddress": "S:0" },
|
||||||
@@ -155,7 +160,15 @@ PRINT ' e.g. "ab://<plc-ip>:44818/" and re-run this seed.';
|
|||||||
PRINT '';
|
PRINT '';
|
||||||
PRINT 'PR ablegacy-10 / #253 — diagnostic counters auto-emit per device under';
|
PRINT 'PR ablegacy-10 / #253 — diagnostic counters auto-emit per device under';
|
||||||
PRINT ' AbLegacy/<host>/_Diagnostics/<name>. No dbo.Tag rows needed — the';
|
PRINT ' AbLegacy/<host>/_Diagnostics/<name>. No dbo.Tag rows needed — the';
|
||||||
PRINT ' driver registers them at DiscoverAsync time. Seven counters per device:';
|
PRINT ' driver registers them at DiscoverAsync time. Nine counters per device:';
|
||||||
PRINT ' RequestCount, ResponseCount, ErrorCount, RetryCount, LastErrorCode,';
|
PRINT ' RequestCount, ResponseCount, ErrorCount, RetryCount, LastErrorCode,';
|
||||||
PRINT ' LastErrorMessage, CommFailures. See docs/drivers/AbLegacy-Diagnostics.md';
|
PRINT ' LastErrorMessage, CommFailures, DemoteCount, LastDemotedUtc. See';
|
||||||
PRINT ' for the full surface + reset semantics.';
|
PRINT ' docs/drivers/AbLegacy-Diagnostics.md for the full surface + reset';
|
||||||
|
PRINT ' semantics.';
|
||||||
|
PRINT '';
|
||||||
|
PRINT 'PR ablegacy-12 / #255 — auto-demote on comm failure: 3 consecutive';
|
||||||
|
PRINT ' failed reads / probes mark the device Demoted for DemoteFor=PT30S';
|
||||||
|
PRINT ' (30 s); reads against a demoted device short-circuit with';
|
||||||
|
PRINT ' BadCommunicationError so one slow PLC can''t starve the driver.';
|
||||||
|
PRINT ' Tune via the Demote block on each Devices[] row. DemoteCount +';
|
||||||
|
PRINT ' LastDemotedUtc on the _Diagnostics folder surface flapping links.';
|
||||||
|
|||||||
@@ -38,4 +38,16 @@ public sealed record HostStatusChangedEventArgs(
|
|||||||
HostState NewState);
|
HostState NewState);
|
||||||
|
|
||||||
/// <summary>Host lifecycle state. Generalization of Galaxy's Platform/Engine ScanState.</summary>
|
/// <summary>Host lifecycle state. Generalization of Galaxy's Platform/Engine ScanState.</summary>
|
||||||
public enum HostState { Unknown, Running, Stopped, Faulted }
|
/// <remarks>
|
||||||
|
/// <para>
|
||||||
|
/// <see cref="Demoted"/> (PR ablegacy-12 / #255) is a soft-stopped state used by drivers
|
||||||
|
/// that auto-throttle a host after N consecutive comm failures. Reads are short-circuited
|
||||||
|
/// with <c>BadCommunicationError</c> for a configurable cool-down window so one slow PLC
|
||||||
|
/// doesn't starve faster peers sharing the same driver. Demoted is *not* the same as
|
||||||
|
/// <see cref="Stopped"/> (which means "probe says it's down") nor <see cref="Faulted"/>
|
||||||
|
/// (which means "the driver itself is broken"); it's a deliberate driver-side back-off.
|
||||||
|
/// Consumers that don't recognize <c>Demoted</c> can safely treat it as <c>Stopped</c>
|
||||||
|
/// (see <c>HostStatusPublisher.MapState</c>).
|
||||||
|
/// </para>
|
||||||
|
/// </remarks>
|
||||||
|
public enum HostState { Unknown, Running, Stopped, Faulted, Demoted }
|
||||||
|
|||||||
@@ -25,6 +25,34 @@ public abstract class AbLegacyCommandBase : DriverCommandBase
|
|||||||
[CommandOption("timeout-ms", Description = "Per-operation timeout in ms (default 5000).")]
|
[CommandOption("timeout-ms", Description = "Per-operation timeout in ms (default 5000).")]
|
||||||
public int TimeoutMs { get; init; } = 5000;
|
public int TimeoutMs { get; init; } = 5000;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — consecutive comm failures before this device is
|
||||||
|
/// auto-demoted. Reads against a demoted device short-circuit with
|
||||||
|
/// <c>BadCommunicationError</c> for <see cref="DemoteForMs"/> ms so one
|
||||||
|
/// unreachable PLC can't starve faster peers sharing the driver thread.
|
||||||
|
/// </summary>
|
||||||
|
[CommandOption("demote-failure-threshold", Description =
|
||||||
|
"Consecutive comm failures before the device is auto-demoted (PR ablegacy-12). Default 3.")]
|
||||||
|
public int DemoteFailureThreshold { get; init; } = 3;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — auto-demote cool-down window in ms. Reads while
|
||||||
|
/// this window is active short-circuit with <c>BadCommunicationError</c>;
|
||||||
|
/// a successful probe clears it early.
|
||||||
|
/// </summary>
|
||||||
|
[CommandOption("demote-for-ms", Description =
|
||||||
|
"Auto-demote cool-down window in ms (PR ablegacy-12). Default 30000 (30s).")]
|
||||||
|
public int DemoteForMs { get; init; } = 30_000;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — opt out of the auto-demote behaviour. The
|
||||||
|
/// consecutive-failure tally still ticks, but the demotion never arms:
|
||||||
|
/// DemoteCount/LastDemotedUtc do not advance and reads never short-circuit.
|
||||||
|
/// </summary>
|
||||||
|
[CommandOption("no-demote", Description =
|
||||||
|
"Disable auto-demote on consecutive comm failures (PR ablegacy-12). Default off (auto-demote enabled).")]
|
||||||
|
public bool NoDemote { get; init; }
|
||||||
|
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public override TimeSpan Timeout
|
public override TimeSpan Timeout
|
||||||
{
|
{
|
||||||
@@ -41,7 +69,11 @@ public abstract class AbLegacyCommandBase : DriverCommandBase
|
|||||||
Devices = [new AbLegacyDeviceOptions(
|
Devices = [new AbLegacyDeviceOptions(
|
||||||
HostAddress: Gateway,
|
HostAddress: Gateway,
|
||||||
PlcFamily: PlcType,
|
PlcFamily: PlcType,
|
||||||
DeviceName: $"cli-{PlcType}")],
|
DeviceName: $"cli-{PlcType}",
|
||||||
|
Demote: new AbLegacyDemoteOptions(
|
||||||
|
FailureThreshold: DemoteFailureThreshold,
|
||||||
|
DemoteFor: TimeSpan.FromMilliseconds(DemoteForMs),
|
||||||
|
Enabled: !NoDemote))],
|
||||||
Tags = tags,
|
Tags = tags,
|
||||||
Timeout = Timeout,
|
Timeout = Timeout,
|
||||||
Probe = new AbLegacyProbeOptions { Enabled = false },
|
Probe = new AbLegacyProbeOptions { Enabled = false },
|
||||||
|
|||||||
@@ -40,10 +40,19 @@ public sealed class ProbeCommand : AbLegacyCommandBase
|
|||||||
await driver.InitializeAsync("{}", ct);
|
await driver.InitializeAsync("{}", ct);
|
||||||
var snapshot = await driver.ReadAsync(["__probe"], ct);
|
var snapshot = await driver.ReadAsync(["__probe"], ct);
|
||||||
var health = driver.GetHealth();
|
var health = driver.GetHealth();
|
||||||
|
// PR ablegacy-12 / #255 — surface Demoted alongside the probe-driven
|
||||||
|
// HostState. After a one-shot probe the host hasn't been observed
|
||||||
|
// (no probe loop runs in CLI mode), so HostState is typically Unknown
|
||||||
|
// unless the read above tripped the demote threshold.
|
||||||
|
var hostStatus = driver.GetHostStatuses().FirstOrDefault();
|
||||||
|
|
||||||
await console.Output.WriteLineAsync($"Gateway: {Gateway}");
|
await console.Output.WriteLineAsync($"Gateway: {Gateway}");
|
||||||
await console.Output.WriteLineAsync($"PLC type: {PlcType}");
|
await console.Output.WriteLineAsync($"PLC type: {PlcType}");
|
||||||
await console.Output.WriteLineAsync($"Health: {health.State}");
|
await console.Output.WriteLineAsync($"Health: {health.State}");
|
||||||
|
if (hostStatus is not null)
|
||||||
|
{
|
||||||
|
await console.Output.WriteLineAsync($"Host state: {hostStatus.State}");
|
||||||
|
}
|
||||||
if (health.LastError is { } err)
|
if (health.LastError is { } err)
|
||||||
await console.Output.WriteLineAsync($"Last error: {err}");
|
await console.Output.WriteLineAsync($"Last error: {err}");
|
||||||
await console.Output.WriteLineAsync();
|
await console.Output.WriteLineAsync();
|
||||||
|
|||||||
@@ -40,6 +40,11 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
public const string DiagnosticsFolderPrefix = "_Diagnostics/";
|
public const string DiagnosticsFolderPrefix = "_Diagnostics/";
|
||||||
|
|
||||||
/// <summary>Canonical names the diagnostics folder exposes. Keep in lockstep with discovery.</summary>
|
/// <summary>Canonical names the diagnostics folder exposes. Keep in lockstep with discovery.</summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// PR ablegacy-12 / #255 — <c>DemoteCount</c> + <c>LastDemotedUtc</c> ride
|
||||||
|
/// alongside the original seven so HMIs can spot a flapping device by
|
||||||
|
/// watching <c>DemoteCount</c> climb without scraping logs.
|
||||||
|
/// </remarks>
|
||||||
public static readonly IReadOnlyList<string> DiagnosticTagNames =
|
public static readonly IReadOnlyList<string> DiagnosticTagNames =
|
||||||
[
|
[
|
||||||
"RequestCount",
|
"RequestCount",
|
||||||
@@ -49,6 +54,9 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
"LastErrorCode",
|
"LastErrorCode",
|
||||||
"LastErrorMessage",
|
"LastErrorMessage",
|
||||||
"CommFailures",
|
"CommFailures",
|
||||||
|
// PR ablegacy-12 / #255 — auto-demote on comm failure surface.
|
||||||
|
"DemoteCount",
|
||||||
|
"LastDemotedUtc",
|
||||||
];
|
];
|
||||||
|
|
||||||
private static readonly HashSet<string> DiagnosticTagNameSet =
|
private static readonly HashSet<string> DiagnosticTagNameSet =
|
||||||
@@ -130,6 +138,39 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
Interlocked.Increment(ref c.Retry);
|
Interlocked.Increment(ref c.Retry);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// PR ablegacy-12 / #255 — note one auto-demotion for a device: the cumulative
/// <c>DemoteCount</c> goes up by one and <c>LastDemotedUtc</c> is overwritten with
/// <paramref name="nowUtc"/>. The driver calls this each time it crosses the failure
/// threshold and arms a fresh cool-down window, so a link that keeps flapping shows
/// up to operators as a steadily climbing counter.
/// </summary>
/// <param name="deviceHostAddress">Host address keying the per-device counters; must not be <c>null</c>.</param>
/// <param name="nowUtc">Timestamp whose <c>Ticks</c> are stored as the most recent demotion time.</param>
/// <exception cref="ArgumentNullException"><paramref name="deviceHostAddress"/> is <c>null</c>.</exception>
public void RecordDemote(string deviceHostAddress, DateTime nowUtc)
{
    ArgumentNullException.ThrowIfNull(deviceHostAddress);

    var stampTicks = nowUtc.Ticks;
    var counters = GetOrCreate(deviceHostAddress);

    Interlocked.Increment(ref counters.DemoteCount);

    // The timestamp lives in a long (raw ticks) and is written through
    // Interlocked.Exchange so the 64-bit store is atomic on every platform;
    // a concurrent reader therefore never observes a half-written value.
    Interlocked.Exchange(ref counters.LastDemotedUtcTicks, stampTicks);
}
|
||||||
|
|
||||||
|
/// <summary>
/// PR ablegacy-12 / #255 — put the cumulative demote bookkeeping back after a
/// <see cref="AbLegacyDriver.ReinitializeAsync"/> cycle, so redeploying config in the
/// middle of an incident keeps the flapping-link history. Both values are assigned
/// as absolutes; nothing is incremented.
/// </summary>
/// <param name="deviceHostAddress">Host address keying the per-device counters; must not be <c>null</c>.</param>
/// <param name="demoteCount">Value to store as the cumulative <c>DemoteCount</c>.</param>
/// <param name="lastDemotedUtc">Timestamp to store as <c>LastDemotedUtc</c>; <c>null</c> stores 0 ticks.</param>
/// <exception cref="ArgumentNullException"><paramref name="deviceHostAddress"/> is <c>null</c>.</exception>
public void RestoreDemote(string deviceHostAddress, long demoteCount, DateTime? lastDemotedUtc)
{
    ArgumentNullException.ThrowIfNull(deviceHostAddress);

    // A missing timestamp is stored as 0 ticks.
    long restoredTicks = lastDemotedUtc is { } stamp ? stamp.Ticks : 0L;
    var counters = GetOrCreate(deviceHostAddress);

    Interlocked.Exchange(ref counters.DemoteCount, demoteCount);
    Interlocked.Exchange(ref counters.LastDemotedUtcTicks, restoredTicks);
}
|
||||||
|
|
||||||
/// <summary>Snapshot the current counters for a device. Returns zeros for unknown hosts.</summary>
|
/// <summary>Snapshot the current counters for a device. Returns zeros for unknown hosts.</summary>
|
||||||
public DiagnosticsSnapshot Snapshot(string deviceHostAddress)
|
public DiagnosticsSnapshot Snapshot(string deviceHostAddress)
|
||||||
{
|
{
|
||||||
@@ -139,7 +180,8 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
{
|
{
|
||||||
_counters.TryGetValue(deviceHostAddress, out c);
|
_counters.TryGetValue(deviceHostAddress, out c);
|
||||||
}
|
}
|
||||||
if (c is null) return new DiagnosticsSnapshot(0, 0, 0, 0, 0, string.Empty, 0);
|
if (c is null) return new DiagnosticsSnapshot(0, 0, 0, 0, 0, string.Empty, 0, 0, null);
|
||||||
|
var ticks = Interlocked.Read(ref c.LastDemotedUtcTicks);
|
||||||
return new DiagnosticsSnapshot(
|
return new DiagnosticsSnapshot(
|
||||||
Request: Interlocked.Read(ref c.Request),
|
Request: Interlocked.Read(ref c.Request),
|
||||||
Response: Interlocked.Read(ref c.Response),
|
Response: Interlocked.Read(ref c.Response),
|
||||||
@@ -147,7 +189,10 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
Retry: Interlocked.Read(ref c.Retry),
|
Retry: Interlocked.Read(ref c.Retry),
|
||||||
LastErrorCode: Volatile.Read(ref c.LastErrorCode),
|
LastErrorCode: Volatile.Read(ref c.LastErrorCode),
|
||||||
LastErrorMessage: c.LastErrorMessage ?? string.Empty,
|
LastErrorMessage: c.LastErrorMessage ?? string.Empty,
|
||||||
CommFailures: Interlocked.Read(ref c.CommFailures));
|
CommFailures: Interlocked.Read(ref c.CommFailures),
|
||||||
|
// PR ablegacy-12 / #255 — auto-demote surface.
|
||||||
|
DemoteCount: Interlocked.Read(ref c.DemoteCount),
|
||||||
|
LastDemotedUtc: ticks == 0 ? null : new DateTime(ticks, DateTimeKind.Utc));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -155,7 +200,14 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
/// from <see cref="AbLegacyDriver.ReinitializeAsync"/> so a config redeploy starts
|
/// from <see cref="AbLegacyDriver.ReinitializeAsync"/> so a config redeploy starts
|
||||||
/// with a clean diagnostic surface.
|
/// with a clean diagnostic surface.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public void Reset(string deviceHostAddress)
|
/// <remarks>
|
||||||
|
/// PR ablegacy-12 / #255 — when <paramref name="preserveDemote"/> is <c>true</c> the
|
||||||
|
/// cumulative <c>DemoteCount</c> + <c>LastDemotedUtc</c> survive the reset.
|
||||||
|
/// That mode exists for redeploy-style resets, so an operator
|
||||||
|
/// redeploying a config doesn't lose their flapping-link history; a fresh process
|
||||||
|
/// start clears them naturally because the dictionary is rebuilt from scratch.
|
||||||
|
/// </remarks>
|
||||||
|
public void Reset(string deviceHostAddress, bool preserveDemote = false)
|
||||||
{
|
{
|
||||||
ArgumentNullException.ThrowIfNull(deviceHostAddress);
|
ArgumentNullException.ThrowIfNull(deviceHostAddress);
|
||||||
var c = GetOrCreate(deviceHostAddress);
|
var c = GetOrCreate(deviceHostAddress);
|
||||||
@@ -166,14 +218,40 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
Interlocked.Exchange(ref c.LastErrorCode, 0);
|
Interlocked.Exchange(ref c.LastErrorCode, 0);
|
||||||
c.LastErrorMessage = string.Empty;
|
c.LastErrorMessage = string.Empty;
|
||||||
Interlocked.Exchange(ref c.CommFailures, 0);
|
Interlocked.Exchange(ref c.CommFailures, 0);
|
||||||
|
if (!preserveDemote)
|
||||||
|
{
|
||||||
|
Interlocked.Exchange(ref c.DemoteCount, 0);
|
||||||
|
Interlocked.Exchange(ref c.LastDemotedUtcTicks, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>Reset every tracked device. Called on full <c>ShutdownAsync</c>.</summary>
|
/// <summary>Reset every tracked device. Called on full <c>ShutdownAsync</c>.</summary>
|
||||||
public void ResetAll()
|
/// <remarks>
|
||||||
|
/// PR ablegacy-12 / #255 — when <paramref name="preserveDemote"/> is <c>true</c> the
|
||||||
|
/// cumulative demote counters survive a per-device reset of every other field.
|
||||||
|
/// The default (<c>false</c>) clears the dictionary outright, which is what
|
||||||
|
/// <see cref="AbLegacyDriver.ShutdownAsync"/> wants.
|
||||||
|
/// </remarks>
|
||||||
|
public void ResetAll(bool preserveDemote = false)
|
||||||
{
|
{
|
||||||
|
if (!preserveDemote)
|
||||||
|
{
|
||||||
|
lock (_lock)
|
||||||
|
{
|
||||||
|
_counters.Clear();
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Preserve mode: keep the dictionary keys + cumulative demote fields, but
|
||||||
|
// zero everything else. Lets a caller span a config redeploy
|
||||||
|
// without losing flapping-link history.
|
||||||
lock (_lock)
|
lock (_lock)
|
||||||
{
|
{
|
||||||
_counters.Clear();
|
foreach (var key in _counters.Keys.ToList())
|
||||||
|
{
|
||||||
|
Reset(key, preserveDemote: true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -205,6 +283,11 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
"LastErrorCode" => snapshot.LastErrorCode,
|
"LastErrorCode" => snapshot.LastErrorCode,
|
||||||
"LastErrorMessage" => snapshot.LastErrorMessage,
|
"LastErrorMessage" => snapshot.LastErrorMessage,
|
||||||
"CommFailures" => snapshot.CommFailures,
|
"CommFailures" => snapshot.CommFailures,
|
||||||
|
// PR ablegacy-12 / #255 — auto-demote surface. LastDemotedUtc returns
|
||||||
|
// the empty string when no demotion has happened yet, mirroring the
|
||||||
|
// LastErrorMessage convention so HMIs can bind directly to a string.
|
||||||
|
"DemoteCount" => snapshot.DemoteCount,
|
||||||
|
"LastDemotedUtc" => snapshot.LastDemotedUtc?.ToString("o") ?? string.Empty,
|
||||||
_ => null,
|
_ => null,
|
||||||
};
|
};
|
||||||
return true;
|
return true;
|
||||||
@@ -236,6 +319,11 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
public int LastErrorCode;
|
public int LastErrorCode;
|
||||||
public string? LastErrorMessage = string.Empty;
|
public string? LastErrorMessage = string.Empty;
|
||||||
public long CommFailures;
|
public long CommFailures;
|
||||||
|
// PR ablegacy-12 / #255 — cumulative across config redeploys. Cleared only
|
||||||
|
// on full driver process restart (the dictionary is rebuilt from scratch);
|
||||||
|
// ReinitializeAsync snapshots them up front and re-applies them via RestoreDemote.
|
||||||
|
public long DemoteCount;
|
||||||
|
public long LastDemotedUtcTicks;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -251,6 +339,9 @@ public sealed class AbLegacyDiagnosticTags
|
|||||||
/// <param name="LastErrorCode">Most recent libplctag status code on a failed read.</param>
|
/// <param name="LastErrorCode">Most recent libplctag status code on a failed read.</param>
|
||||||
/// <param name="LastErrorMessage">Most recent libplctag error message on a failed read.</param>
|
/// <param name="LastErrorMessage">Most recent libplctag error message on a failed read.</param>
|
||||||
/// <param name="CommFailures">Count of read failures mapped to <c>BadCommunicationError</c>.</param>
|
/// <param name="CommFailures">Count of read failures mapped to <c>BadCommunicationError</c>.</param>
|
||||||
|
/// <param name="DemoteCount">PR ablegacy-12 / #255 — cumulative auto-demote events.</param>
|
||||||
|
/// <param name="LastDemotedUtc">PR ablegacy-12 / #255 — UTC timestamp of the most
|
||||||
|
/// recent demotion, or <c>null</c> if the device has never been demoted.</param>
|
||||||
public sealed record DiagnosticsSnapshot(
|
public sealed record DiagnosticsSnapshot(
|
||||||
long Request,
|
long Request,
|
||||||
long Response,
|
long Response,
|
||||||
@@ -258,4 +349,6 @@ public sealed record DiagnosticsSnapshot(
|
|||||||
long Retry,
|
long Retry,
|
||||||
int LastErrorCode,
|
int LastErrorCode,
|
||||||
string LastErrorMessage,
|
string LastErrorMessage,
|
||||||
long CommFailures);
|
long CommFailures,
|
||||||
|
long DemoteCount,
|
||||||
|
DateTime? LastDemotedUtc);
|
||||||
|
|||||||
@@ -217,6 +217,20 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
|
|
||||||
public async Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
|
public async Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
|
||||||
{
|
{
|
||||||
|
// PR ablegacy-12 / #255 — capture the cumulative DemoteCount + LastDemotedUtc
|
||||||
|
// for every currently-tracked device before we tear down. The Shutdown below
|
||||||
|
// calls ResetAll() which clears the dictionary; the per-host InitializeAsync
|
||||||
|
// below re-EnsureDevice's the slots; we restore the cumulative demote
|
||||||
|
// history so an operator who redeploys mid-incident doesn't lose the trail
|
||||||
|
// of how often this device was flapping.
|
||||||
|
var preservedDemote = new Dictionary<string, (long DemoteCount, DateTime? LastDemotedUtc)>(
|
||||||
|
StringComparer.OrdinalIgnoreCase);
|
||||||
|
foreach (var (host, _) in _devices)
|
||||||
|
{
|
||||||
|
var snap = _diagnosticTags.Snapshot(host);
|
||||||
|
preservedDemote[host] = (snap.DemoteCount, snap.LastDemotedUtc);
|
||||||
|
}
|
||||||
|
|
||||||
await ShutdownAsync(cancellationToken).ConfigureAwait(false);
|
await ShutdownAsync(cancellationToken).ConfigureAwait(false);
|
||||||
// PR ablegacy-10 / #253 — counters were dropped along with the device map when
|
// PR ablegacy-10 / #253 — counters were dropped along with the device map when
|
||||||
// ShutdownAsync called ResetAll; the InitializeAsync below re-EnsureDevice's each
|
// ShutdownAsync called ResetAll; the InitializeAsync below re-EnsureDevice's each
|
||||||
@@ -224,6 +238,16 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
// here in case a downstream override of either method skips the cycle.
|
// here in case a downstream override of either method skips the cycle.
|
||||||
_diagnosticTags.ResetAll();
|
_diagnosticTags.ResetAll();
|
||||||
await InitializeAsync(driverConfigJson, cancellationToken).ConfigureAwait(false);
|
await InitializeAsync(driverConfigJson, cancellationToken).ConfigureAwait(false);
|
||||||
|
|
||||||
|
// PR ablegacy-12 / #255 — restore the cumulative demote history. Only hosts
|
||||||
|
// that survive the redeploy get their counters back; a device removed from
|
||||||
|
// config legitimately drops its history (it isn't being tracked any more).
|
||||||
|
foreach (var (host, (count, lastUtc)) in preservedDemote)
|
||||||
|
{
|
||||||
|
if (count == 0 && lastUtc is null) continue;
|
||||||
|
if (!_devices.ContainsKey(host)) continue;
|
||||||
|
_diagnosticTags.RestoreDemote(host, count, lastUtc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task ShutdownAsync(CancellationToken cancellationToken)
|
public async Task ShutdownAsync(CancellationToken cancellationToken)
|
||||||
@@ -275,6 +299,38 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
internal int ResolveRetries(DeviceState device) =>
|
internal int ResolveRetries(DeviceState device) =>
|
||||||
device.Options.Retries ?? _options.Retries ?? 0;
|
device.Options.Retries ?? _options.Retries ?? 0;
|
||||||
|
|
||||||
|
/// <summary>
/// PR ablegacy-12 / #255 — pick the <see cref="AbLegacyDemoteOptions"/> that apply to
/// a device. Options configured on the device itself are returned as-is; when the
/// device has none, a default-constructed record is returned (3 failures / 30 s /
/// enabled).
/// </summary>
/// <param name="device">Device whose <c>Options.Demote</c> is consulted.</param>
/// <returns>Never <c>null</c>, so callers can use the result without a guard.</returns>
internal AbLegacyDemoteOptions ResolveDemote(DeviceState device)
{
    if (device.Options.Demote is { } perDevice) return perDevice;

    return new AbLegacyDemoteOptions();
}
|
||||||
|
|
||||||
|
/// <summary>
/// PR ablegacy-12 / #255 — common bookkeeping for one comm failure: bump the
/// consecutive-failure counter and arm the demote window once the threshold is
/// crossed.
/// </summary>
/// <param name="state">Device whose failure tally and demote window are updated.</param>
/// <param name="nowUtc">Timestamp of the failure; a newly armed window ends at
/// <paramref name="nowUtc"/> plus the resolved <c>EffectiveDemoteFor</c>.</param>
/// <returns>
/// <c>true</c> when this call tipped the device into Demoted (so the caller can fire
/// <see cref="OnHostStatusChanged"/>); <c>false</c> when demotion is disabled, the
/// tally is still below the threshold, or a demote window is already active.
/// </returns>
private bool RecordFailureAndMaybeDemote(DeviceState state, DateTime nowUtc)
{
    var demote = ResolveDemote(state);

    // The tally ticks even when demotion is disabled, so the counter stays
    // meaningful for operators who opted out of the throttling behaviour.
    var consecutive = Interlocked.Increment(ref state.ConsecutiveFailures);

    if (!demote.Enabled || consecutive < demote.FailureThreshold) return false;

    // The read path and the probe loop can both land here at the same time.
    // Check-and-arm must be one atomic step under the same lock the other
    // DemotedUntilUtc mutators take; otherwise two concurrent failures can both
    // see "not demoted", both arm the window, and DemoteCount is bumped twice
    // for a single demotion.
    lock (state.ProbeLock)
    {
        // Already demoted? Don't re-arm — the original window's expiry is the
        // operator-facing recovery clock, and re-arming on every subsequent
        // failure would suppress reads forever on a fully-down device. The
        // window is cleared by a successful probe or by expiring.
        if (state.DemotedUntilUtc is { } until && until > nowUtc) return false;

        state.DemotedUntilUtc = nowUtc + demote.EffectiveDemoteFor;
    }

    // Recorded outside the lock so the diagnostics store's own synchronisation
    // is never entered while ProbeLock is held.
    _diagnosticTags.RecordDemote(state.Options.HostAddress, nowUtc);
    return true;
}
|
||||||
|
|
||||||
// ---- IReadable ----
|
// ---- IReadable ----
|
||||||
|
|
||||||
public async Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
|
public async Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
|
||||||
@@ -323,6 +379,45 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
// double-counting the original attempt as a retry.
|
// double-counting the original attempt as a retry.
|
||||||
_diagnosticTags.RecordRequest(def.DeviceHostAddress);
|
_diagnosticTags.RecordRequest(def.DeviceHostAddress);
|
||||||
|
|
||||||
|
// PR ablegacy-12 / #255 — auto-demote short-circuit. When the device's demote
|
||||||
|
// window is still active we return BadCommunicationError immediately, without
|
||||||
|
// touching libplctag or its retry loop. That's the whole point of the feature:
|
||||||
|
// one slow PLC sharing the driver thread can't drag down healthy peers. We
|
||||||
|
// don't bump ErrorCount/CommFailures here because this isn't a fresh field
|
||||||
|
// failure — it's the cool-down on a previously-counted one.
|
||||||
|
if (device.DemotedUntilUtc is { } demotedUntil)
|
||||||
|
{
|
||||||
|
if (demotedUntil > now)
|
||||||
|
{
|
||||||
|
results[i] = new DataValueSnapshot(null,
|
||||||
|
AbLegacyStatusMapper.BadCommunicationError, null, now);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Window expired without an early-clear from a probe success — drop the
|
||||||
|
// marker but don't reset ConsecutiveFailures yet. If this read also
|
||||||
|
// fails the failure tally keeps counting from where it left off, so a
|
||||||
|
// permanently-down device re-arms the window after one more
|
||||||
|
// consecutive failure (vs. having to repeat the full threshold).
|
||||||
|
lock (device.ProbeLock)
|
||||||
|
{
|
||||||
|
if (device.DemotedUntilUtc is { } stillUntil && stillUntil <= now)
|
||||||
|
{
|
||||||
|
device.DemotedUntilUtc = null;
|
||||||
|
// Mirror Stopped→Running on a probe-driven recovery: leave the
|
||||||
|
// HostState transition to the probe loop (or the upcoming success
|
||||||
|
// below); we just clear the cool-down marker so the next read
|
||||||
|
// dispatches normally.
|
||||||
|
if (device.HostState == HostState.Demoted)
|
||||||
|
{
|
||||||
|
// Surface a transition out of Demoted. The probe loop will
|
||||||
|
// bring it Running once a probe succeeds; until then leave
|
||||||
|
// it in Stopped to reflect "we don't actually know it's up".
|
||||||
|
TransitionDeviceState(device, HostState.Stopped);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// PR 9 — per-device retry loop: on transient BadCommunicationError (libplctag throw
|
// PR 9 — per-device retry loop: on transient BadCommunicationError (libplctag throw
|
||||||
// OR a non-zero status that maps to BadCommunicationError) retry up to N times. A
|
// OR a non-zero status that maps to BadCommunicationError) retry up to N times. A
|
||||||
// terminal mapped status (e.g. BadNodeIdUnknown for a missing PLC tag, BadTypeMismatch
|
// terminal mapped status (e.g. BadNodeIdUnknown for a missing PLC tag, BadTypeMismatch
|
||||||
@@ -360,6 +455,15 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
status,
|
status,
|
||||||
$"libplctag status {status} reading {reference}",
|
$"libplctag status {status} reading {reference}",
|
||||||
commFailure: mappedStatus == AbLegacyStatusMapper.BadCommunicationError);
|
commFailure: mappedStatus == AbLegacyStatusMapper.BadCommunicationError);
|
||||||
|
// PR ablegacy-12 / #255 — only comm failures count toward the
|
||||||
|
// demote tally. A BadNodeIdUnknown / BadTypeMismatch is a config
|
||||||
|
// / decoder mismatch, not a sign the host is unreachable, so
|
||||||
|
// demoting on it would punish the operator for a typo.
|
||||||
|
if (mappedStatus == AbLegacyStatusMapper.BadCommunicationError
|
||||||
|
&& RecordFailureAndMaybeDemote(device, now))
|
||||||
|
{
|
||||||
|
TransitionDeviceState(device, HostState.Demoted);
|
||||||
|
}
|
||||||
snapshot = new DataValueSnapshot(null, mappedStatus, null, now);
|
snapshot = new DataValueSnapshot(null, mappedStatus, null, now);
|
||||||
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead,
|
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead,
|
||||||
$"libplctag status {status} reading {reference}");
|
$"libplctag status {status} reading {reference}");
|
||||||
@@ -385,6 +489,13 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
_health = new DriverHealth(DriverState.Healthy, now, null);
|
_health = new DriverHealth(DriverState.Healthy, now, null);
|
||||||
// PR ablegacy-10 / #253 — successful array read.
|
// PR ablegacy-10 / #253 — successful array read.
|
||||||
_diagnosticTags.RecordResponse(def.DeviceHostAddress);
|
_diagnosticTags.RecordResponse(def.DeviceHostAddress);
|
||||||
|
// PR ablegacy-12 / #255 — successful read clears the
|
||||||
|
// consecutive-failure tally. We do NOT auto-clear DemotedUntilUtc
|
||||||
|
// here — the demote window is honoured to its full duration so an
|
||||||
|
// intermittent link that just happened to answer once doesn't
|
||||||
|
// immediately re-flood the channel. Probe success is the early
|
||||||
|
// recovery path.
|
||||||
|
Interlocked.Exchange(ref device.ConsecutiveFailures, 0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -398,6 +509,10 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
_health = new DriverHealth(DriverState.Healthy, now, null);
|
_health = new DriverHealth(DriverState.Healthy, now, null);
|
||||||
// PR ablegacy-10 / #253 — successful scalar / sub-element / bit read.
|
// PR ablegacy-10 / #253 — successful scalar / sub-element / bit read.
|
||||||
_diagnosticTags.RecordResponse(def.DeviceHostAddress);
|
_diagnosticTags.RecordResponse(def.DeviceHostAddress);
|
||||||
|
// PR ablegacy-12 / #255 — successful read clears the
|
||||||
|
// consecutive-failure tally; demote window keeps running
|
||||||
|
// until a probe success or natural expiry.
|
||||||
|
Interlocked.Exchange(ref device.ConsecutiveFailures, 0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
catch (OperationCanceledException) { throw; }
|
catch (OperationCanceledException) { throw; }
|
||||||
@@ -414,6 +529,12 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
libplctagStatus: 0,
|
libplctagStatus: 0,
|
||||||
errorMessage: ex.Message,
|
errorMessage: ex.Message,
|
||||||
commFailure: true);
|
commFailure: true);
|
||||||
|
// PR ablegacy-12 / #255 — exception-driven comm failure counts
|
||||||
|
// toward the demote tally just like a status-mapped one.
|
||||||
|
if (RecordFailureAndMaybeDemote(device, now))
|
||||||
|
{
|
||||||
|
TransitionDeviceState(device, HostState.Demoted);
|
||||||
|
}
|
||||||
snapshot = new DataValueSnapshot(null,
|
snapshot = new DataValueSnapshot(null,
|
||||||
AbLegacyStatusMapper.BadCommunicationError, null, now);
|
AbLegacyStatusMapper.BadCommunicationError, null, now);
|
||||||
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, ex.Message);
|
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, ex.Message);
|
||||||
@@ -591,6 +712,14 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
"Most recent libplctag error message on a failed read; empty when no error has been seen since the last reset.");
|
"Most recent libplctag error message on a failed read; empty when no error has been seen since the last reset.");
|
||||||
EmitDiagnosticVariable(diag, deviceHostAddress, "CommFailures", DriverDataType.Int64,
|
EmitDiagnosticVariable(diag, deviceHostAddress, "CommFailures", DriverDataType.Int64,
|
||||||
"Count of read failures mapped to BadCommunicationError. Spans transient libplctag throws + retried-out chains so operators see a single 'wire fell off' counter.");
|
"Count of read failures mapped to BadCommunicationError. Spans transient libplctag throws + retried-out chains so operators see a single 'wire fell off' counter.");
|
||||||
|
// PR ablegacy-12 / #255 — auto-demote surface. DemoteCount is cumulative
|
||||||
|
// across reinit (preserved in ReinitializeAsync); LastDemotedUtc is a
|
||||||
|
// string (ISO-8601 UTC) so HMIs can bind directly without a separate
|
||||||
|
// DateTime decoder. Empty string means "never demoted".
|
||||||
|
EmitDiagnosticVariable(diag, deviceHostAddress, "DemoteCount", DriverDataType.Int64,
|
||||||
|
"Cumulative auto-demote events for this device — bumps every time the driver crosses the consecutive-failure threshold and arms a fresh cool-down window. Survives ReinitializeAsync.");
|
||||||
|
EmitDiagnosticVariable(diag, deviceHostAddress, "LastDemotedUtc", DriverDataType.String,
|
||||||
|
"ISO-8601 UTC timestamp of the most recent auto-demotion; empty when this device has never been demoted.");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void EmitDiagnosticVariable(
|
private static void EmitDiagnosticVariable(
|
||||||
@@ -665,7 +794,39 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
state.ProbeInitialized = false;
|
state.ProbeInitialized = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
TransitionDeviceState(state, success ? HostState.Running : HostState.Stopped);
|
// PR ablegacy-12 / #255 — probe success is the early-recovery path: clear
|
||||||
|
// any active demote window + reset the failure tally so the next read
|
||||||
|
// dispatches normally. Probe failure participates in the same shared
|
||||||
|
// failure-tally as ReadAsync so a device with no live read traffic still
|
||||||
|
// demotes on a sustained outage.
|
||||||
|
if (success)
|
||||||
|
{
|
||||||
|
bool wasDemoted;
|
||||||
|
lock (state.ProbeLock)
|
||||||
|
{
|
||||||
|
wasDemoted = state.DemotedUntilUtc is not null;
|
||||||
|
state.DemotedUntilUtc = null;
|
||||||
|
}
|
||||||
|
Interlocked.Exchange(ref state.ConsecutiveFailures, 0);
|
||||||
|
TransitionDeviceState(state, HostState.Running);
|
||||||
|
_ = wasDemoted; // intentionally observed for future telemetry hooks
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (RecordFailureAndMaybeDemote(state, DateTime.UtcNow))
|
||||||
|
{
|
||||||
|
TransitionDeviceState(state, HostState.Demoted);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Mid-tally probe failure: surface as Stopped if not already
|
||||||
|
// Demoted. This preserves pre-PR-12 behaviour for the common
|
||||||
|
// case (FailureThreshold=3 + a single hiccup ends up Stopped,
|
||||||
|
// not Demoted).
|
||||||
|
if (state.HostState != HostState.Demoted)
|
||||||
|
TransitionDeviceState(state, HostState.Stopped);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
try { await Task.Delay(_options.Probe.Interval, ct).ConfigureAwait(false); }
|
try { await Task.Delay(_options.Probe.Interval, ct).ConfigureAwait(false); }
|
||||||
catch (OperationCanceledException) { break; }
|
catch (OperationCanceledException) { break; }
|
||||||
@@ -890,6 +1051,25 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
|
|||||||
public CancellationTokenSource? ProbeCts { get; set; }
|
public CancellationTokenSource? ProbeCts { get; set; }
|
||||||
public bool ProbeInitialized { get; set; }
|
public bool ProbeInitialized { get; set; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — running tally of consecutive read / probe failures.
|
||||||
|
/// Reset on every successful read or probe; tripping
|
||||||
|
/// <see cref="AbLegacyDemoteOptions.FailureThreshold"/> arms the demote window.
|
||||||
|
/// Read + written via <see cref="Interlocked"/> because read + probe loops can
|
||||||
|
/// touch it concurrently.
|
||||||
|
/// </summary>
|
||||||
|
public int ConsecutiveFailures;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — when set, reads against this device short-circuit
|
||||||
|
/// with <c>BadCommunicationError</c> until the timestamp passes; cleared early
|
||||||
|
/// by a successful probe. Guarded by <see cref="ProbeLock"/> for the mutator
|
||||||
|
/// paths (TransitionDeviceState + RecordFailureAndMaybeDemote); reads grab
|
||||||
|
/// the property without locking — a torn DateTimeOffset? read is harmless here
|
||||||
|
/// because the worst case is one extra dispatched read on an x86 boundary.
|
||||||
|
/// </summary>
|
||||||
|
public DateTimeOffset? DemotedUntilUtc { get; set; }
|
||||||
|
|
||||||
public void DisposeRuntimes()
|
public void DisposeRuntimes()
|
||||||
{
|
{
|
||||||
foreach (var r in Runtimes.Values) r.Dispose();
|
foreach (var r in Runtimes.Values) r.Dispose();
|
||||||
|
|||||||
@@ -45,7 +45,14 @@ public static class AbLegacyDriverFactoryExtensions
|
|||||||
DeviceName: d.DeviceName,
|
DeviceName: d.DeviceName,
|
||||||
// PR 9 — per-device timeout / retry overrides. Device-level wins over driver-wide.
|
// PR 9 — per-device timeout / retry overrides. Device-level wins over driver-wide.
|
||||||
Timeout: d.TimeoutMs is int devMs ? TimeSpan.FromMilliseconds(devMs) : null,
|
Timeout: d.TimeoutMs is int devMs ? TimeSpan.FromMilliseconds(devMs) : null,
|
||||||
Retries: d.Retries))]
|
Retries: d.Retries,
|
||||||
|
// PR ablegacy-12 / #255 — auto-demote knobs.
|
||||||
|
Demote: d.Demote is null ? null : new AbLegacyDemoteOptions(
|
||||||
|
FailureThreshold: d.Demote.FailureThreshold ?? 3,
|
||||||
|
DemoteFor: d.Demote.DemoteForMs is int demMs
|
||||||
|
? TimeSpan.FromMilliseconds(demMs)
|
||||||
|
: null,
|
||||||
|
Enabled: d.Demote.Enabled ?? true)))]
|
||||||
: [],
|
: [],
|
||||||
Tags = dto.Tags is { Count: > 0 }
|
Tags = dto.Tags is { Count: > 0 }
|
||||||
? [.. dto.Tags.Select(t => new AbLegacyTagDefinition(
|
? [.. dto.Tags.Select(t => new AbLegacyTagDefinition(
|
||||||
@@ -209,6 +216,26 @@ public static class AbLegacyDriverFactoryExtensions
|
|||||||
/// <c>null</c> at both levels = single attempt.
|
/// <c>null</c> at both levels = single attempt.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public int? Retries { get; init; }
|
public int? Retries { get; init; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — optional per-device auto-demote knobs. <c>null</c>
|
||||||
|
/// means "use the documented defaults" (<c>FailureThreshold=3</c>,
|
||||||
|
/// <c>DemoteFor=30s</c>, <c>Enabled=true</c>) — the driver still demotes by
|
||||||
|
/// default. Set <c>Enabled=false</c> in the JSON to opt out entirely.
|
||||||
|
/// </summary>
|
||||||
|
public AbLegacyDemoteDto? Demote { get; init; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — JSON DTO for the auto-demote knobs. Times are
|
||||||
|
/// ms-suffixed for consistency with the rest of the driver config (TimeoutMs,
|
||||||
|
/// IntervalMs).
|
||||||
|
/// </summary>
|
||||||
|
internal sealed class AbLegacyDemoteDto
|
||||||
|
{
|
||||||
|
public int? FailureThreshold { get; init; }
|
||||||
|
public int? DemoteForMs { get; init; }
|
||||||
|
public bool? Enabled { get; init; }
|
||||||
}
|
}
|
||||||
|
|
||||||
internal sealed class AbLegacyTagDto
|
internal sealed class AbLegacyTagDto
|
||||||
|
|||||||
@@ -41,7 +41,39 @@ public sealed record AbLegacyDeviceOptions(
|
|||||||
AbLegacyPlcFamily PlcFamily = AbLegacyPlcFamily.Slc500,
|
AbLegacyPlcFamily PlcFamily = AbLegacyPlcFamily.Slc500,
|
||||||
string? DeviceName = null,
|
string? DeviceName = null,
|
||||||
TimeSpan? Timeout = null,
|
TimeSpan? Timeout = null,
|
||||||
int? Retries = null);
|
int? Retries = null,
|
||||||
|
AbLegacyDemoteOptions? Demote = null);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — auto-demote knobs. After
|
||||||
|
/// <see cref="FailureThreshold"/> consecutive read / probe failures the driver
|
||||||
|
/// marks the device <c>Demoted</c> for <see cref="DemoteFor"/>; reads against
|
||||||
|
/// a demoted device short-circuit with <c>BadCommunicationError</c> instead
|
||||||
|
/// of dispatching through libplctag, so one slow PLC can't starve faster
|
||||||
|
/// peers sharing the same driver. A successful probe clears the demotion
|
||||||
|
/// early; a successful read just resets the consecutive-failure counter
|
||||||
|
/// without leaving the demoted window.
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="FailureThreshold">Consecutive read or probe failures that trip
|
||||||
|
/// the demotion. Default <c>3</c>.</param>
|
||||||
|
/// <param name="DemoteFor">Cool-down window before reads are dispatched again
|
||||||
|
/// without a successful probe in between. Default <c>30s</c>.</param>
|
||||||
|
/// <param name="Enabled">When <c>false</c> the failure tally still ticks but the
|
||||||
|
/// driver never sets the demoted window — useful when an operator wants the
|
||||||
|
/// diagnostic counters without the throttling behaviour.</param>
|
||||||
|
public sealed record AbLegacyDemoteOptions(
|
||||||
|
int FailureThreshold = 3,
|
||||||
|
TimeSpan? DemoteFor = null,
|
||||||
|
bool Enabled = true)
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Effective demote window. Optional parameters need compile-time-constant
|
||||||
|
/// defaults and <c>TimeSpan.FromSeconds(30)</c> isn't one, so
|
||||||
|
/// <see cref="DemoteFor"/> defaults to <c>null</c> and a <c>null</c> value
|
||||||
|
/// resolves to the documented 30-second window here.
|
||||||
|
/// </summary>
|
||||||
|
public TimeSpan EffectiveDemoteFor => DemoteFor ?? TimeSpan.FromSeconds(30);
|
||||||
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// One PCCC-backed OPC UA variable. <c>Address</c> is the canonical PCCC file-address
|
/// One PCCC-backed OPC UA variable. <c>Address</c> is the canonical PCCC file-address
|
||||||
|
|||||||
@@ -138,6 +138,11 @@ public sealed class HostStatusPublisher(
|
|||||||
HostState.Running => DriverHostState.Running,
|
HostState.Running => DriverHostState.Running,
|
||||||
HostState.Stopped => DriverHostState.Stopped,
|
HostState.Stopped => DriverHostState.Stopped,
|
||||||
HostState.Faulted => DriverHostState.Faulted,
|
HostState.Faulted => DriverHostState.Faulted,
|
||||||
|
// PR ablegacy-12 / #255 — Demoted is a driver-side back-off (skipped reads while
|
||||||
|
// we wait for a flaky host to recover). The Configuration enum doesn't have a
|
||||||
|
// dedicated value; surface it as Stopped so the Admin UI lights it up red-ish
|
||||||
|
// without the publisher needing a schema migration to differentiate.
|
||||||
|
HostState.Demoted => DriverHostState.Stopped,
|
||||||
_ => DriverHostState.Unknown,
|
_ => DriverHostState.Unknown,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,102 @@
|
|||||||
|
using Shouldly;
|
||||||
|
using Xunit;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.PlcFamilies;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.IntegrationTests;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — wire-level smoke for auto-demote on comm failure.
|
||||||
|
/// Runs only when ab_server is reachable. Two devices: one healthy (the live
|
||||||
|
/// ab_server slc500 simulator), one pointed at <c>127.0.0.1:1</c> which
|
||||||
|
/// refuses every connection. After three consecutive failures the faulty
|
||||||
|
/// device's reads must short-circuit with <c>BadCommunicationError</c>
|
||||||
|
/// while the healthy device keeps returning <c>Good</c> — the whole point
|
||||||
|
/// of the feature: one slow / unreachable PLC sharing the driver thread
|
||||||
|
/// can't starve faster peers.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// <para>
|
||||||
|
/// Build-only by default — the assertion that demotion latency is
|
||||||
|
/// bounded depends on the ab_server simulator timing out on the faulty
|
||||||
|
/// port within the per-device timeout. We pin the faulty endpoint at
|
||||||
|
/// <c>127.0.0.1:1</c> (the bogus-port standard) which RST's the
|
||||||
|
/// connection immediately on most stacks; environments that whitelist
|
||||||
|
/// outbound to localhost:1 will see different timing but still trip
|
||||||
|
/// the threshold within the test budget.
|
||||||
|
/// </para>
|
||||||
|
/// <para>
|
||||||
|
/// The Docker fixture extension (<c>slc500-faulty</c>) noted in the PR
|
||||||
|
/// plan is a documentation-only placeholder for now — implementing a
|
||||||
|
/// refusing-proxy container is non-trivial and the localhost:1 trick
|
||||||
|
/// covers the same surface deterministically.
|
||||||
|
/// </para>
|
||||||
|
/// </remarks>
|
||||||
|
[Collection(AbLegacyServerCollection.Name)]
|
||||||
|
[Trait("Category", "Integration")]
|
||||||
|
[Trait("Simulator", "ab_server-PCCC")]
|
||||||
|
public sealed class AbLegacyAutoDemoteTests(AbLegacyServerFixture sim)
|
||||||
|
{
|
||||||
|
[AbLegacyFact]
|
||||||
|
public async Task Two_devices_one_unreachable_does_not_starve_healthy_reads()
|
||||||
|
{
|
||||||
|
if (sim.SkipReason is not null) Assert.Skip(sim.SkipReason);
|
||||||
|
|
||||||
|
var healthy = $"ab://{sim.Host}:{sim.Port}/{sim.CipPath}";
|
||||||
|
// 127.0.0.1:1 is the bogus-port standard — typical Linux/Windows TCP
|
||||||
|
// stacks RST immediately. The driver still reports it as a comm
|
||||||
|
// failure (libplctag wraps the failure as a transient throw).
|
||||||
|
var faulty = "ab://127.0.0.1:1/1,0";
|
||||||
|
|
||||||
|
await using var drv = new AbLegacyDriver(new AbLegacyDriverOptions
|
||||||
|
{
|
||||||
|
Devices =
|
||||||
|
[
|
||||||
|
new AbLegacyDeviceOptions(healthy, AbLegacyPlcFamily.Slc500,
|
||||||
|
Timeout: TimeSpan.FromSeconds(5)),
|
||||||
|
new AbLegacyDeviceOptions(faulty, AbLegacyPlcFamily.Slc500,
|
||||||
|
// Snappy timeout so the test budget stays short.
|
||||||
|
Timeout: TimeSpan.FromMilliseconds(500),
|
||||||
|
Demote: new AbLegacyDemoteOptions(
|
||||||
|
FailureThreshold: 3,
|
||||||
|
DemoteFor: TimeSpan.FromSeconds(30))),
|
||||||
|
],
|
||||||
|
Tags =
|
||||||
|
[
|
||||||
|
new AbLegacyTagDefinition("Healthy", healthy, "N7:0", AbLegacyDataType.Int),
|
||||||
|
new AbLegacyTagDefinition("Faulty", faulty, "N7:0", AbLegacyDataType.Int),
|
||||||
|
],
|
||||||
|
Probe = new AbLegacyProbeOptions { Enabled = false },
|
||||||
|
}, driverInstanceId: "ablegacy-auto-demote-it");
|
||||||
|
|
||||||
|
await drv.InitializeAsync("{}", TestContext.Current.CancellationToken);
|
||||||
|
|
||||||
|
// Trip the demote on the faulty device.
|
||||||
|
for (var i = 0; i < 3; i++)
|
||||||
|
{
|
||||||
|
await drv.ReadAsync(["Faulty"], TestContext.Current.CancellationToken);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Healthy host MUST keep returning Good even though the sibling is demoted.
|
||||||
|
var healthyResult = await drv.ReadAsync(["Healthy"], TestContext.Current.CancellationToken);
|
||||||
|
healthyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good);
|
||||||
|
|
||||||
|
// Faulty host now short-circuits without waiting on libplctag's timeout.
|
||||||
|
var sw = System.Diagnostics.Stopwatch.StartNew();
|
||||||
|
var faultyResult = await drv.ReadAsync(["Faulty"], TestContext.Current.CancellationToken);
|
||||||
|
sw.Stop();
|
||||||
|
faultyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError);
|
||||||
|
// Short-circuit should be ~1 ms; pad generously for CI noise. The pre-PR-12
|
||||||
|
// path would have waited the full 500 ms timeout.
|
||||||
|
sw.ElapsedMilliseconds.ShouldBeLessThan(200);
|
||||||
|
|
||||||
|
// Counter access via the public diagnostic short-circuit path — the
|
||||||
|
// internal Snapshot() seam isn't visible from this assembly.
|
||||||
|
var demoteCountRef = $"_Diagnostics/{faulty}/DemoteCount";
|
||||||
|
var lastDemotedRef = $"_Diagnostics/{faulty}/LastDemotedUtc";
|
||||||
|
var diag = await drv.ReadAsync(
|
||||||
|
[demoteCountRef, lastDemotedRef], TestContext.Current.CancellationToken);
|
||||||
|
((long)diag[0].Value!).ShouldBeGreaterThan(0);
|
||||||
|
((string)diag[1].Value!).Length.ShouldBeGreaterThan(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -72,3 +72,30 @@ services:
|
|||||||
"--tag=F8[120]",
|
"--tag=F8[120]",
|
||||||
"--tag=B3[10]"
|
"--tag=B3[10]"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# PR ablegacy-12 / #255 — faulty-PLC fixture for the auto-demote contract.
|
||||||
|
# FIXTURE-TIER FOLLOW-UP: implementing a refusing-proxy container that
|
||||||
|
# round-trips libplctag's CIP framing far enough to trigger comm failures
|
||||||
|
# (vs. just RST'ing the TCP handshake) is non-trivial — the integration
|
||||||
|
# test currently uses 127.0.0.1:1 (the bogus-port standard) which RST's
|
||||||
|
# immediately on most TCP stacks. That gets us deterministic comm-failure
|
||||||
|
# coverage without standing up a second container; if the localhost:1
|
||||||
|
# trick stops working on a future test runner (e.g. a sandbox that
|
||||||
|
# blocks port 1) re-enable this stub:
|
||||||
|
#
|
||||||
|
# slc500-faulty:
|
||||||
|
# profiles: ["slc500-faulty"]
|
||||||
|
# image: otopcua-ab-server:libplctag-release
|
||||||
|
# build:
|
||||||
|
# context: ../../ZB.MOM.WW.OtOpcUa.Driver.AbCip.IntegrationTests/Docker
|
||||||
|
# dockerfile: Dockerfile
|
||||||
|
# container_name: otopcua-ab-server-slc500-faulty
|
||||||
|
# restart: "no"
|
||||||
|
# ports:
|
||||||
|
# - "44819:44819"
|
||||||
|
# # Hostile entrypoint: exit immediately so nothing ever listens on the
|
||||||
|
# # port and connection attempts are refused. Future iteration: a libplctag-aware
|
||||||
|
# # proxy that accepts the CIP open and then drops the wire halfway
|
||||||
|
# # through, exercising the read-timeout path rather than the
|
||||||
|
# # connection-refused path.
|
||||||
|
# entrypoint: ["sh", "-c", "exit 1"]
|
||||||
|
|||||||
@@ -0,0 +1,380 @@
|
|||||||
|
using System.Text.Json;
|
||||||
|
using Shouldly;
|
||||||
|
using Xunit;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Driver.AbLegacy;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.PlcFamilies;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.AbLegacy.Tests;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR ablegacy-12 / #255 — auto-demote on consecutive comm failure. After
|
||||||
|
/// <c>FailureThreshold</c> consecutive read or probe failures the driver
|
||||||
|
/// marks the device <c>Demoted</c> for <c>DemoteFor</c>; subsequent reads
|
||||||
|
/// short-circuit with <c>BadCommunicationError</c> without invoking
|
||||||
|
/// libplctag, so one slow PLC sharing the driver thread can't starve faster
|
||||||
|
/// peers. Probe success clears the demote early; read success resets the
|
||||||
|
/// consecutive-failure tally without leaving the demote window.
|
||||||
|
/// </summary>
|
||||||
|
[Trait("Category", "Unit")]
|
||||||
|
public sealed class AbLegacyAutoDemoteTests
|
||||||
|
{
|
||||||
|
private const string Host = "ab://10.0.0.5/1,0";
|
||||||
|
private const string SecondHost = "ab://10.0.0.6/1,0";
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Disable the probe by default — every test wants deterministic
|
||||||
|
/// control over the failure tally without a background loop racing
|
||||||
|
/// against the read path.
|
||||||
|
/// </summary>
|
||||||
|
private static AbLegacyDriverOptions BaseOptions(
|
||||||
|
AbLegacyDemoteOptions? demote = null,
|
||||||
|
IReadOnlyList<AbLegacyDeviceOptions>? devices = null,
|
||||||
|
IReadOnlyList<AbLegacyTagDefinition>? tags = null) => new()
|
||||||
|
{
|
||||||
|
Devices = devices ?? [new AbLegacyDeviceOptions(Host, AbLegacyPlcFamily.Slc500, Demote: demote)],
|
||||||
|
Tags = tags ?? [new AbLegacyTagDefinition("X", Host, "N7:0", AbLegacyDataType.Int)],
|
||||||
|
Probe = new AbLegacyProbeOptions { Enabled = false },
|
||||||
|
};
|
||||||
|
|
||||||
|
private static (AbLegacyDriver drv, FakeAbLegacyTagFactory factory) NewDriver(
|
||||||
|
AbLegacyDemoteOptions? demote = null,
|
||||||
|
IReadOnlyList<AbLegacyDeviceOptions>? devices = null,
|
||||||
|
IReadOnlyList<AbLegacyTagDefinition>? tags = null)
|
||||||
|
{
|
||||||
|
var factory = new FakeAbLegacyTagFactory();
|
||||||
|
var drv = new AbLegacyDriver(BaseOptions(demote, devices, tags), "drv-demote", factory);
|
||||||
|
return (drv, factory);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static FakeAbLegacyTag SeedFailingTag(FakeAbLegacyTagFactory factory)
|
||||||
|
{
|
||||||
|
// Cause every read to throw — exception-driven failures count as
|
||||||
|
// BadCommunicationError per RecordError(commFailure:true).
|
||||||
|
factory.Customise = p => new FakeAbLegacyTag(p)
|
||||||
|
{
|
||||||
|
ThrowOnRead = true,
|
||||||
|
Exception = new TimeoutException("simulated comm failure"),
|
||||||
|
};
|
||||||
|
// Only the Customise hook is installed here, so no tag exists yet and the
|
||||||
|
// return value is always null; flip the failure off via factory.Tags["N7:0"].
|
||||||
|
return null!;
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Three_consecutive_failures_demote_the_device()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver();
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
var state = drv.GetDeviceState(Host).ShouldNotBeNull();
|
||||||
|
state.DemotedUntilUtc.ShouldNotBeNull();
|
||||||
|
var snap = drv.DiagnosticTags.Snapshot(Host);
|
||||||
|
snap.DemoteCount.ShouldBe(1);
|
||||||
|
snap.LastDemotedUtc.ShouldNotBeNull();
|
||||||
|
drv.GetHostStatuses().Single().State.ShouldBe(HostState.Demoted);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Reads_while_demoted_short_circuit_without_invoking_libplctag()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver(
|
||||||
|
new AbLegacyDemoteOptions(FailureThreshold: 3, DemoteFor: TimeSpan.FromMinutes(5)));
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
// Trip the demotion.
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
var readsBeforeDemote = factory.Tags["N7:0"].ReadCount;
|
||||||
|
|
||||||
|
// Subsequent reads MUST NOT call into libplctag — the short-circuit
|
||||||
|
// returns BadCommunicationError before EnsureTagRuntimeAsync.
|
||||||
|
var result = await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
result[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError);
|
||||||
|
factory.Tags["N7:0"].ReadCount.ShouldBe(readsBeforeDemote);
|
||||||
|
|
||||||
|
var result2 = await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
result2[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError);
|
||||||
|
factory.Tags["N7:0"].ReadCount.ShouldBe(readsBeforeDemote);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task After_DemoteFor_expires_next_read_dispatches_through()
|
||||||
|
{
|
||||||
|
// Tiny window so the cool-down expires within the test.
|
||||||
|
var (drv, factory) = NewDriver(
|
||||||
|
new AbLegacyDemoteOptions(FailureThreshold: 2, DemoteFor: TimeSpan.FromMilliseconds(50)));
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
// Trip with two failures.
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
var state = drv.GetDeviceState(Host).ShouldNotBeNull();
|
||||||
|
state.DemotedUntilUtc.ShouldNotBeNull();
|
||||||
|
var readsBeforeWait = factory.Tags["N7:0"].ReadCount;
|
||||||
|
|
||||||
|
// Flip the fake to succeed and wait past the demote window.
|
||||||
|
factory.Tags["N7:0"].ThrowOnRead = false;
|
||||||
|
factory.Tags["N7:0"].Value = 42;
|
||||||
|
factory.Tags["N7:0"].Status = 0;
|
||||||
|
await Task.Delay(TimeSpan.FromMilliseconds(120));
|
||||||
|
|
||||||
|
var result = await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
result[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good);
|
||||||
|
result[0].Value.ShouldBe(42);
|
||||||
|
// The window expiry path dispatched through to libplctag.
|
||||||
|
factory.Tags["N7:0"].ReadCount.ShouldBeGreaterThan(readsBeforeWait);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Successful_read_resets_consecutive_failure_counter()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver();
|
||||||
|
// Initial state — every read fails.
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
var state = drv.GetDeviceState(Host).ShouldNotBeNull();
|
||||||
|
state.ConsecutiveFailures.ShouldBe(2);
|
||||||
|
|
||||||
|
// One successful read — flip the existing fake.
|
||||||
|
factory.Tags["N7:0"].ThrowOnRead = false;
|
||||||
|
factory.Tags["N7:0"].Value = 99;
|
||||||
|
factory.Tags["N7:0"].Status = 0;
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
state.ConsecutiveFailures.ShouldBe(0);
|
||||||
|
state.DemotedUntilUtc.ShouldBeNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Failure_success_failure_does_not_demote_at_threshold_three()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver(
|
||||||
|
new AbLegacyDemoteOptions(FailureThreshold: 3));
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
// 2 failures.
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
// 1 success — counter resets.
|
||||||
|
factory.Tags["N7:0"].ThrowOnRead = false;
|
||||||
|
factory.Tags["N7:0"].Status = 0;
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
// 2 more failures — should still be below the threshold.
|
||||||
|
factory.Tags["N7:0"].ThrowOnRead = true;
|
||||||
|
factory.Tags["N7:0"].Exception = new TimeoutException("flap");
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
var state = drv.GetDeviceState(Host).ShouldNotBeNull();
|
||||||
|
state.DemotedUntilUtc.ShouldBeNull();
|
||||||
|
drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task DemoteCount_and_LastDemotedUtc_surface_via_diagnostic_short_circuit()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver();
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
// Read the synthetic _Diagnostics counters.
|
||||||
|
var demoteCountRef = $"{AbLegacyDiagnosticTags.DiagnosticsFolderPrefix}{Host}/DemoteCount";
|
||||||
|
var lastDemotedRef = $"{AbLegacyDiagnosticTags.DiagnosticsFolderPrefix}{Host}/LastDemotedUtc";
|
||||||
|
var counts = await drv.ReadAsync([demoteCountRef, lastDemotedRef], CancellationToken.None);
|
||||||
|
|
||||||
|
counts[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good);
|
||||||
|
counts[0].Value.ShouldBe(1L);
|
||||||
|
counts[1].StatusCode.ShouldBe(AbLegacyStatusMapper.Good);
|
||||||
|
counts[1].Value.ShouldBeOfType<string>();
|
||||||
|
((string)counts[1].Value!).Length.ShouldBeGreaterThan(0); // ISO-8601 stamp
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Demote_disabled_never_short_circuits_reads()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver(
|
||||||
|
new AbLegacyDemoteOptions(FailureThreshold: 1, Enabled: false));
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
// 5 failures — would normally trip a single-fail threshold, but Enabled=false.
|
||||||
|
for (var i = 0; i < 5; i++) await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
var state = drv.GetDeviceState(Host).ShouldNotBeNull();
|
||||||
|
state.DemotedUntilUtc.ShouldBeNull();
|
||||||
|
var snap = drv.DiagnosticTags.Snapshot(Host);
|
||||||
|
snap.DemoteCount.ShouldBe(0);
|
||||||
|
// Failures still get recorded as comm errors though — the diagnostic
|
||||||
|
// surface is honest about what happened, just no auto-throttle.
|
||||||
|
snap.CommFailures.ShouldBe(5);
|
||||||
|
// libplctag was invoked every time — that's the whole point of opting out.
|
||||||
|
factory.Tags["N7:0"].ReadCount.ShouldBe(5);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Reinit_preserves_DemoteCount_but_clears_active_demotion()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver();
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(1);
|
||||||
|
drv.GetDeviceState(Host)!.DemotedUntilUtc.ShouldNotBeNull();
|
||||||
|
|
||||||
|
await drv.ReinitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
// Active demotion cleared (the device is freshly tracked); cumulative count survives.
|
||||||
|
drv.GetDeviceState(Host)!.DemotedUntilUtc.ShouldBeNull();
|
||||||
|
drv.GetDeviceState(Host)!.ConsecutiveFailures.ShouldBe(0);
|
||||||
|
drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Disposing_driver_after_demotion_does_not_throw()
|
||||||
|
{
|
||||||
|
var (drv, factory) = NewDriver();
|
||||||
|
SeedFailingTag(factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
await drv.DisposeAsync();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Demote_options_dto_round_trips_through_factory_extensions()
|
||||||
|
{
|
||||||
|
const string json = """
|
||||||
|
{
|
||||||
|
"Devices": [
|
||||||
|
{
|
||||||
|
"HostAddress": "ab://10.0.0.5/1,0",
|
||||||
|
"PlcFamily": "Slc500",
|
||||||
|
"Demote": {
|
||||||
|
"FailureThreshold": 5,
|
||||||
|
"DemoteForMs": 60000,
|
||||||
|
"Enabled": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Probe": { "Enabled": false },
|
||||||
|
"Tags": [
|
||||||
|
{ "Name": "X", "DeviceHostAddress": "ab://10.0.0.5/1,0", "Address": "N7:0", "DataType": "Int" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
""";
|
||||||
|
|
||||||
|
var drv = AbLegacyDriverFactoryExtensions.CreateInstance("drv-demote-roundtrip", json);
|
||||||
|
await drv.InitializeAsync(json, CancellationToken.None);
|
||||||
|
|
||||||
|
var state = drv.GetDeviceState(Host).ShouldNotBeNull();
|
||||||
|
state.Options.Demote.ShouldNotBeNull();
|
||||||
|
state.Options.Demote!.FailureThreshold.ShouldBe(5);
|
||||||
|
state.Options.Demote.EffectiveDemoteFor.ShouldBe(TimeSpan.FromMinutes(1));
|
||||||
|
state.Options.Demote.Enabled.ShouldBeTrue();
|
||||||
|
|
||||||
|
await drv.ShutdownAsync(CancellationToken.None);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Two_devices_one_faulty_does_not_starve_the_healthy_one()
|
||||||
|
{
|
||||||
|
// Mixed factory — one host's tag throws, the other's reads cleanly.
|
||||||
|
var factory = new FakeAbLegacyTagFactory();
|
||||||
|
factory.Customise = p =>
|
||||||
|
{
|
||||||
|
// Identify by the Gateway portion of the create params.
|
||||||
|
var fail = p.Gateway == "10.0.0.6";
|
||||||
|
return new FakeAbLegacyTag(p)
|
||||||
|
{
|
||||||
|
ThrowOnRead = fail,
|
||||||
|
Exception = fail ? new TimeoutException("faulty") : null,
|
||||||
|
Value = 42,
|
||||||
|
Status = 0,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
var drv = new AbLegacyDriver(new AbLegacyDriverOptions
|
||||||
|
{
|
||||||
|
Devices =
|
||||||
|
[
|
||||||
|
new AbLegacyDeviceOptions(Host, AbLegacyPlcFamily.Slc500),
|
||||||
|
new AbLegacyDeviceOptions(SecondHost, AbLegacyPlcFamily.Slc500),
|
||||||
|
],
|
||||||
|
Tags =
|
||||||
|
[
|
||||||
|
new AbLegacyTagDefinition("Healthy", Host, "N7:0", AbLegacyDataType.Int),
|
||||||
|
new AbLegacyTagDefinition("Faulty", SecondHost, "N7:0", AbLegacyDataType.Int),
|
||||||
|
],
|
||||||
|
Probe = new AbLegacyProbeOptions { Enabled = false },
|
||||||
|
}, "drv-mix", factory);
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
// Trip the faulty side.
|
||||||
|
for (var i = 0; i < 3; i++)
|
||||||
|
await drv.ReadAsync(["Faulty"], CancellationToken.None);
|
||||||
|
|
||||||
|
// Healthy host MUST keep returning Good even though the sibling is demoted.
|
||||||
|
var healthyResult = await drv.ReadAsync(["Healthy"], CancellationToken.None);
|
||||||
|
healthyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.Good);
|
||||||
|
healthyResult[0].Value.ShouldBe(42);
|
||||||
|
|
||||||
|
// Reads against the faulty host short-circuit.
|
||||||
|
var faultyResult = await drv.ReadAsync(["Faulty"], CancellationToken.None);
|
||||||
|
faultyResult[0].StatusCode.ShouldBe(AbLegacyStatusMapper.BadCommunicationError);
|
||||||
|
|
||||||
|
drv.GetDeviceState(Host)!.DemotedUntilUtc.ShouldBeNull();
|
||||||
|
drv.GetDeviceState(SecondHost)!.DemotedUntilUtc.ShouldNotBeNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task BadNodeIdUnknown_does_not_count_toward_demote_tally()
|
||||||
|
{
|
||||||
|
// -14 maps to BadNodeIdUnknown — terminal, not a comm failure.
|
||||||
|
var (drv, factory) = NewDriver();
|
||||||
|
factory.Customise = p => new FakeAbLegacyTag(p) { Status = -14 };
|
||||||
|
await drv.InitializeAsync("{}", CancellationToken.None);
|
||||||
|
|
||||||
|
for (var i = 0; i < 5; i++)
|
||||||
|
await drv.ReadAsync(["X"], CancellationToken.None);
|
||||||
|
|
||||||
|
var state = drv.GetDeviceState(Host).ShouldNotBeNull();
|
||||||
|
// Five terminal failures shouldn't trip the demote threshold — they're
|
||||||
|
// a config / decoder mismatch, not a sign of a flapping link.
|
||||||
|
state.DemotedUntilUtc.ShouldBeNull();
|
||||||
|
drv.DiagnosticTags.Snapshot(Host).DemoteCount.ShouldBe(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void HostState_enum_has_Demoted_value()
|
||||||
|
{
|
||||||
|
// Belt-and-braces: the abstraction surface must carry the new value
|
||||||
|
// for downstream consumers (HostStatusPublisher, Admin UI, …) to
|
||||||
|
// see and route it.
|
||||||
|
Enum.IsDefined(typeof(HostState), HostState.Demoted).ShouldBeTrue();
|
||||||
|
((int)HostState.Demoted).ShouldBeGreaterThan((int)HostState.Faulted);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -173,7 +173,9 @@ public sealed class AbLegacyDiagnosticsTests
|
|||||||
var diagVars = builder.Variables
|
var diagVars = builder.Variables
|
||||||
.Where(v => v.Info.FullName.StartsWith(AbLegacyDiagnosticTags.DiagnosticsFolderPrefix))
|
.Where(v => v.Info.FullName.StartsWith(AbLegacyDiagnosticTags.DiagnosticsFolderPrefix))
|
||||||
.ToList();
|
.ToList();
|
||||||
diagVars.Count.ShouldBe(14); // 7 names × 2 devices
|
// PR ablegacy-12 / #255 — DemoteCount + LastDemotedUtc bring the canonical
|
||||||
|
// count to 9 names per device (was 7 in PR ablegacy-10).
|
||||||
|
diagVars.Count.ShouldBe(AbLegacyDiagnosticTags.DiagnosticTagNames.Count * 2);
|
||||||
diagVars.ShouldAllBe(v => v.Info.SecurityClass == SecurityClassification.ViewOnly);
|
diagVars.ShouldAllBe(v => v.Info.SecurityClass == SecurityClassification.ViewOnly);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -84,7 +84,14 @@ internal class FakeAbLegacyTag : IAbLegacyTagRuntime
|
|||||||
|
|
||||||
internal sealed class FakeAbLegacyTagFactory : IAbLegacyTagFactory
|
internal sealed class FakeAbLegacyTagFactory : IAbLegacyTagFactory
|
||||||
{
|
{
|
||||||
public Dictionary<string, FakeAbLegacyTag> Tags { get; } = new(StringComparer.OrdinalIgnoreCase);
|
// PR ablegacy-12 / #255 — switched from plain Dictionary to ConcurrentDictionary so
|
||||||
|
// the read path (test thread) and the probe loop (background Task) can both call
|
||||||
|
// Create without corrupting the dict. Pre-PR-12 the race existed but only tipped
|
||||||
|
// a few percent of test runs into KeyNotFoundException; PR-12's added
|
||||||
|
// Interlocked.Exchange writes shifted timing enough to make the failure frequent
|
||||||
|
// (~60% of runs).
|
||||||
|
public System.Collections.Concurrent.ConcurrentDictionary<string, FakeAbLegacyTag> Tags { get; } =
|
||||||
|
new(StringComparer.OrdinalIgnoreCase);
|
||||||
public Func<AbLegacyTagCreateParams, FakeAbLegacyTag>? Customise { get; set; }
|
public Func<AbLegacyTagCreateParams, FakeAbLegacyTag>? Customise { get; set; }
|
||||||
|
|
||||||
public IAbLegacyTagRuntime Create(AbLegacyTagCreateParams p)
|
public IAbLegacyTagRuntime Create(AbLegacyTagCreateParams p)
|
||||||
|
|||||||
Reference in New Issue
Block a user