Auto: ablegacy-12 — auto-demote on comm failure

Closes #255
This commit is contained in:
Joseph Doherty
2026-04-26 08:44:53 -04:00
parent 8ee65a75d2
commit 1e3053c0d8
18 changed files with 1160 additions and 31 deletions

View File

@@ -38,4 +38,16 @@ public sealed record HostStatusChangedEventArgs(
HostState NewState);
/// <summary>Host lifecycle state. Generalization of Galaxy's Platform/Engine ScanState.</summary>
public enum HostState { Unknown, Running, Stopped, Faulted }
/// <remarks>
/// <para>
/// <see cref="Demoted"/> (PR ablegacy-12 / #255) is a soft-stopped state used by drivers
/// that auto-throttle a host after N consecutive comm failures. Reads are short-circuited
/// with <c>BadCommunicationError</c> for a configurable cool-down window so one slow PLC
/// doesn't starve faster peers sharing the same driver. Demoted is *not* the same as
/// <see cref="Stopped"/> (which means "probe says it's down") nor <see cref="Faulted"/>
/// (which means "the driver itself is broken"); it's a deliberate driver-side back-off.
/// Consumers that don't recognize <c>Demoted</c> can safely treat it as <c>Stopped</c>
/// (see <c>HostStatusPublisher.MapState</c>).
/// </para>
/// </remarks>
public enum HostState { Unknown, Running, Stopped, Faulted, Demoted }

View File

@@ -25,6 +25,34 @@ public abstract class AbLegacyCommandBase : DriverCommandBase
[CommandOption("timeout-ms", Description = "Per-operation timeout in ms (default 5000).")]
public int TimeoutMs { get; init; } = 5000;
/// <summary>
/// PR ablegacy-12 / #255 — consecutive comm failures before this device is
/// auto-demoted. Reads against a demoted device short-circuit with
/// <c>BadCommunicationError</c> for <see cref="DemoteForMs"/> ms so one
/// unreachable PLC can't starve faster peers sharing the driver thread.
/// </summary>
[CommandOption("demote-failure-threshold", Description =
"Consecutive comm failures before the device is auto-demoted (PR ablegacy-12). Default 3.")]
public int DemoteFailureThreshold { get; init; } = 3;
/// <summary>
/// PR ablegacy-12 / #255 — auto-demote cool-down window in ms. Reads while
/// this window is active short-circuit with <c>BadCommunicationError</c>;
/// a successful probe clears it early.
/// </summary>
[CommandOption("demote-for-ms", Description =
"Auto-demote cool-down window in ms (PR ablegacy-12). Default 30000 (30s).")]
public int DemoteForMs { get; init; } = 30_000;
/// <summary>
/// PR ablegacy-12 / #255 — opt out of the auto-demote behaviour. The
/// consecutive-failure tally still ticks, but the device is never demoted:
/// DemoteCount/LastDemotedUtc stay zero and reads never short-circuit.
/// </summary>
[CommandOption("no-demote", Description =
"Disable auto-demote on consecutive comm failures (PR ablegacy-12). Default off (auto-demote enabled).")]
public bool NoDemote { get; init; }
/// <inheritdoc />
public override TimeSpan Timeout
{
@@ -41,7 +69,11 @@ public abstract class AbLegacyCommandBase : DriverCommandBase
Devices = [new AbLegacyDeviceOptions(
HostAddress: Gateway,
PlcFamily: PlcType,
DeviceName: $"cli-{PlcType}")],
DeviceName: $"cli-{PlcType}",
Demote: new AbLegacyDemoteOptions(
FailureThreshold: DemoteFailureThreshold,
DemoteFor: TimeSpan.FromMilliseconds(DemoteForMs),
Enabled: !NoDemote))],
Tags = tags,
Timeout = Timeout,
Probe = new AbLegacyProbeOptions { Enabled = false },

View File

@@ -40,10 +40,19 @@ public sealed class ProbeCommand : AbLegacyCommandBase
await driver.InitializeAsync("{}", ct);
var snapshot = await driver.ReadAsync(["__probe"], ct);
var health = driver.GetHealth();
// PR ablegacy-12 / #255 — surface Demoted alongside the probe-driven
// HostState. After a one-shot probe the host hasn't been observed
// (no probe loop runs in CLI mode), so HostState is typically Unknown
// unless the read above tripped the demote threshold.
var hostStatus = driver.GetHostStatuses().FirstOrDefault();
await console.Output.WriteLineAsync($"Gateway: {Gateway}");
await console.Output.WriteLineAsync($"PLC type: {PlcType}");
await console.Output.WriteLineAsync($"Health: {health.State}");
if (hostStatus is not null)
{
await console.Output.WriteLineAsync($"Host state: {hostStatus.State}");
}
if (health.LastError is { } err)
await console.Output.WriteLineAsync($"Last error: {err}");
await console.Output.WriteLineAsync();

View File

@@ -40,6 +40,11 @@ public sealed class AbLegacyDiagnosticTags
public const string DiagnosticsFolderPrefix = "_Diagnostics/";
/// <summary>Canonical names the diagnostics folder exposes. Keep in lockstep with discovery.</summary>
/// <remarks>
/// PR ablegacy-12 / #255 — <c>DemoteCount</c> + <c>LastDemotedUtc</c> ride
/// alongside the original seven so HMIs can spot a flapping device by
/// watching <c>DemoteCount</c> climb without scraping logs.
/// </remarks>
public static readonly IReadOnlyList<string> DiagnosticTagNames =
[
"RequestCount",
@@ -49,6 +54,9 @@ public sealed class AbLegacyDiagnosticTags
"LastErrorCode",
"LastErrorMessage",
"CommFailures",
// PR ablegacy-12 / #255 — auto-demote on comm failure surface.
"DemoteCount",
"LastDemotedUtc",
];
private static readonly HashSet<string> DiagnosticTagNameSet =
@@ -130,6 +138,39 @@ public sealed class AbLegacyDiagnosticTags
Interlocked.Increment(ref c.Retry);
}
/// <summary>
/// PR ablegacy-12 / #255 — note one auto-demotion against a device: the cumulative
/// <c>DemoteCount</c> goes up by one and <c>LastDemotedUtc</c> is overwritten with
/// <paramref name="nowUtc"/>. Meant to be called each time the driver crosses the
/// failure threshold and arms a fresh cool-down window, so a link that flaps
/// repeatedly shows up to operators as a steadily climbing counter.
/// </summary>
/// <param name="deviceHostAddress">Host address keying the per-device counters; must not be <c>null</c>.</param>
/// <param name="nowUtc">Moment of the demotion. Only its <c>Ticks</c> are stored; snapshots rehydrate them as UTC.</param>
/// <exception cref="ArgumentNullException"><paramref name="deviceHostAddress"/> is <c>null</c>.</exception>
public void RecordDemote(string deviceHostAddress, DateTime nowUtc)
{
    ArgumentNullException.ThrowIfNull(deviceHostAddress);

    long demotedAtTicks = nowUtc.Ticks;
    var counters = GetOrCreate(deviceHostAddress);

    Interlocked.Increment(ref counters.DemoteCount);

    // The timestamp lives in a long (ticks) rather than a DateTime so it can be
    // published with Interlocked. A plain 64-bit store is not guaranteed atomic on
    // 32-bit runtimes; Interlocked.Exchange is, on every platform, and readers that
    // use Interlocked.Read therefore never observe a half-written value.
    Interlocked.Exchange(ref counters.LastDemotedUtcTicks, demotedAtTicks);
}
/// <summary>
/// PR ablegacy-12 / #255 — put previously captured demote bookkeeping back after a
/// <see cref="AbLegacyDriver.ReinitializeAsync"/> cycle, so redeploying config in the
/// middle of an incident does not erase the flapping-link history. Both fields are
/// overwritten with the supplied absolute values; nothing is incremented.
/// </summary>
/// <param name="deviceHostAddress">Host address keying the per-device counters; must not be <c>null</c>.</param>
/// <param name="demoteCount">Cumulative demote count to store.</param>
/// <param name="lastDemotedUtc">Timestamp of the last demotion, or <c>null</c> for "never demoted".</param>
/// <exception cref="ArgumentNullException"><paramref name="deviceHostAddress"/> is <c>null</c>.</exception>
public void RestoreDemote(string deviceHostAddress, long demoteCount, DateTime? lastDemotedUtc)
{
    ArgumentNullException.ThrowIfNull(deviceHostAddress);

    // Zero ticks is the stored representation of "never demoted".
    long restoredTicks = lastDemotedUtc is { } stamp ? stamp.Ticks : 0L;

    var counters = GetOrCreate(deviceHostAddress);
    Interlocked.Exchange(ref counters.DemoteCount, demoteCount);
    Interlocked.Exchange(ref counters.LastDemotedUtcTicks, restoredTicks);
}
/// <summary>Snapshot the current counters for a device. Returns zeros for unknown hosts.</summary>
public DiagnosticsSnapshot Snapshot(string deviceHostAddress)
{
@@ -139,7 +180,8 @@ public sealed class AbLegacyDiagnosticTags
{
_counters.TryGetValue(deviceHostAddress, out c);
}
if (c is null) return new DiagnosticsSnapshot(0, 0, 0, 0, 0, string.Empty, 0);
if (c is null) return new DiagnosticsSnapshot(0, 0, 0, 0, 0, string.Empty, 0, 0, null);
var ticks = Interlocked.Read(ref c.LastDemotedUtcTicks);
return new DiagnosticsSnapshot(
Request: Interlocked.Read(ref c.Request),
Response: Interlocked.Read(ref c.Response),
@@ -147,7 +189,10 @@ public sealed class AbLegacyDiagnosticTags
Retry: Interlocked.Read(ref c.Retry),
LastErrorCode: Volatile.Read(ref c.LastErrorCode),
LastErrorMessage: c.LastErrorMessage ?? string.Empty,
CommFailures: Interlocked.Read(ref c.CommFailures));
CommFailures: Interlocked.Read(ref c.CommFailures),
// PR ablegacy-12 / #255 — auto-demote surface.
DemoteCount: Interlocked.Read(ref c.DemoteCount),
LastDemotedUtc: ticks == 0 ? null : new DateTime(ticks, DateTimeKind.Utc));
}
/// <summary>
@@ -155,7 +200,14 @@ public sealed class AbLegacyDiagnosticTags
/// from <see cref="AbLegacyDriver.ReinitializeAsync"/> so a config redeploy starts
/// with a clean diagnostic surface.
/// </summary>
public void Reset(string deviceHostAddress)
/// <remarks>
/// PR ablegacy-12 / #255 — when <paramref name="preserveDemote"/> is <c>true</c> the
/// cumulative <c>DemoteCount</c> + <c>LastDemotedUtc</c> survive the reset, so a
/// caller can zero the other counters without losing flapping-link history.
/// <see cref="AbLegacyDriver.ReinitializeAsync"/> reaches the same outcome a different
/// way: it snapshots the demote fields, lets the reset clear everything, and puts
/// them back through <see cref="RestoreDemote"/>. A fresh process start clears them
/// naturally because the dictionary is rebuilt from scratch.
/// </remarks>
public void Reset(string deviceHostAddress, bool preserveDemote = false)
{
ArgumentNullException.ThrowIfNull(deviceHostAddress);
var c = GetOrCreate(deviceHostAddress);
@@ -166,14 +218,40 @@ public sealed class AbLegacyDiagnosticTags
Interlocked.Exchange(ref c.LastErrorCode, 0);
c.LastErrorMessage = string.Empty;
Interlocked.Exchange(ref c.CommFailures, 0);
if (!preserveDemote)
{
Interlocked.Exchange(ref c.DemoteCount, 0);
Interlocked.Exchange(ref c.LastDemotedUtcTicks, 0);
}
}
/// <summary>Reset every tracked device. Called on full <c>ShutdownAsync</c>.</summary>
/// <remarks>
/// PR ablegacy-12 / #255 — when <paramref name="preserveDemote"/> is <c>true</c> the
/// cumulative demote counters survive a per-device reset of every other field.
/// The default (<c>false</c>) clears the dictionary outright, which is what
/// <see cref="AbLegacyDriver.ShutdownAsync"/> wants.
/// </remarks>
/// <param name="preserveDemote">
/// <c>false</c> (default) drops every tracked device; <c>true</c> keeps the tracked
/// devices and their <c>DemoteCount</c> / <c>LastDemotedUtc</c> while zeroing the rest.
/// </param>
public void ResetAll(bool preserveDemote = false)
{
    lock (_lock)
    {
        if (!preserveDemote)
        {
            _counters.Clear();
            return;
        }

        // Preserve mode: keep the dictionary keys + cumulative demote fields, but
        // zero everything else. The dictionary must NOT be cleared on this path —
        // clearing first would leave no keys to enumerate and silently discard the
        // very history this mode exists to keep.
        // Keys are copied up front so the loop stays valid even if Reset (which goes
        // through GetOrCreate) touches the dictionary.
        foreach (var key in _counters.Keys.ToList())
        {
            Reset(key, preserveDemote: true);
        }
    }
}
@@ -205,6 +283,11 @@ public sealed class AbLegacyDiagnosticTags
"LastErrorCode" => snapshot.LastErrorCode,
"LastErrorMessage" => snapshot.LastErrorMessage,
"CommFailures" => snapshot.CommFailures,
// PR ablegacy-12 / #255 — auto-demote surface. LastDemotedUtc returns
// the empty string when no demotion has happened yet, mirroring the
// LastErrorMessage convention so HMIs can bind directly to a string.
"DemoteCount" => snapshot.DemoteCount,
"LastDemotedUtc" => snapshot.LastDemotedUtc?.ToString("o") ?? string.Empty,
_ => null,
};
return true;
@@ -236,6 +319,11 @@ public sealed class AbLegacyDiagnosticTags
public int LastErrorCode;
public string? LastErrorMessage = string.Empty;
public long CommFailures;
// PR ablegacy-12 / #255 — cumulative across config redeploys: ReinitializeAsync
// snapshots these before teardown and writes them back via RestoreDemote. A plain
// ShutdownAsync / ResetAll() (or a process restart) drops them with the dictionary.
public long DemoteCount;
public long LastDemotedUtcTicks;
}
}
@@ -251,6 +339,9 @@ public sealed class AbLegacyDiagnosticTags
/// <param name="LastErrorCode">Most recent libplctag status code on a failed read.</param>
/// <param name="LastErrorMessage">Most recent libplctag error message on a failed read.</param>
/// <param name="CommFailures">Count of read failures mapped to <c>BadCommunicationError</c>.</param>
/// <param name="DemoteCount">PR ablegacy-12 / #255 — cumulative auto-demote events.</param>
/// <param name="LastDemotedUtc">PR ablegacy-12 / #255 — UTC timestamp of the most
/// recent demotion, or <c>null</c> if the device has never been demoted.</param>
public sealed record DiagnosticsSnapshot(
long Request,
long Response,
@@ -258,4 +349,6 @@ public sealed record DiagnosticsSnapshot(
long Retry,
int LastErrorCode,
string LastErrorMessage,
long CommFailures);
long CommFailures,
long DemoteCount,
DateTime? LastDemotedUtc);

View File

@@ -217,6 +217,20 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
public async Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
// PR ablegacy-12 / #255 — capture the cumulative DemoteCount + LastDemotedUtc
// for every currently-tracked device before we tear down. The Shutdown below
// calls ResetAll() which clears the dictionary; the per-host InitializeAsync
// below re-EnsureDevice's the slots; we restore the cumulative demote
// history so an operator who redeploys mid-incident doesn't lose the trail
// of how often this device was flapping.
var preservedDemote = new Dictionary<string, (long DemoteCount, DateTime? LastDemotedUtc)>(
StringComparer.OrdinalIgnoreCase);
foreach (var (host, _) in _devices)
{
var snap = _diagnosticTags.Snapshot(host);
preservedDemote[host] = (snap.DemoteCount, snap.LastDemotedUtc);
}
await ShutdownAsync(cancellationToken).ConfigureAwait(false);
// PR ablegacy-10 / #253 — counters were dropped along with the device map when
// ShutdownAsync called ResetAll; the InitializeAsync below re-EnsureDevice's each
@@ -224,6 +238,16 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
// here in case a downstream override of either method skips the cycle.
_diagnosticTags.ResetAll();
await InitializeAsync(driverConfigJson, cancellationToken).ConfigureAwait(false);
// PR ablegacy-12 / #255 — restore the cumulative demote history. Only hosts
// that survive the redeploy get their counters back; a device removed from
// config legitimately drops its history (it isn't being tracked any more).
foreach (var (host, (count, lastUtc)) in preservedDemote)
{
if (count == 0 && lastUtc is null) continue;
if (!_devices.ContainsKey(host)) continue;
_diagnosticTags.RestoreDemote(host, count, lastUtc);
}
}
public async Task ShutdownAsync(CancellationToken cancellationToken)
@@ -275,6 +299,38 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
internal int ResolveRetries(DeviceState device) =>
device.Options.Retries ?? _options.Retries ?? 0;
/// <summary>
/// PR ablegacy-12 / #255 — pick the <see cref="AbLegacyDemoteOptions"/> that apply to
/// a device. A per-device <c>Demote</c> setting is used as-is; when none is configured
/// a default-constructed options record is returned instead (3 failures / 30 s /
/// enabled). The result is never <c>null</c>.
/// </summary>
/// <param name="device">Device whose options are being resolved.</param>
/// <returns>The per-device demote options, or the defaults when none are set.</returns>
internal AbLegacyDemoteOptions ResolveDemote(DeviceState device)
{
    // A per-device override, when present, wins outright.
    if (device.Options.Demote is { } perDevice)
    {
        return perDevice;
    }

    // Nothing configured for this device: fall back to the record's defaults.
    return new AbLegacyDemoteOptions();
}
/// <summary>
/// PR ablegacy-12 / #255 — common bookkeeping for one comm failure: bump the
/// consecutive-failure counter and arm the demote window once the threshold is
/// crossed. Returns <c>true</c> when this call tipped the device into Demoted (so
/// the caller can fire <see cref="OnHostStatusChanged"/>); <c>false</c> when the
/// device was already demoted, stayed below the threshold, or has auto-demote
/// disabled.
/// </summary>
/// <remarks>
/// The "already demoted?" check and the write that arms the window happen under
/// <c>ProbeLock</c>. This method is called from both the read path and the probe
/// loop; without the lock two concurrent failures could both see "not demoted",
/// both arm the window, and both record a demotion — double-counting
/// <c>DemoteCount</c> and reporting two transitions for one event.
/// </remarks>
/// <param name="state">Device whose failure tally is being updated.</param>
/// <param name="nowUtc">Timestamp of the failure; the window ends at this time plus the effective demote duration.</param>
/// <returns><c>true</c> only for the single call that arms a fresh demote window.</returns>
private bool RecordFailureAndMaybeDemote(DeviceState state, DateTime nowUtc)
{
    var demote = ResolveDemote(state);

    // The tally ticks even when demotion is disabled; only the arming is skipped.
    var consecutive = Interlocked.Increment(ref state.ConsecutiveFailures);
    if (!demote.Enabled || consecutive < demote.FailureThreshold) return false;

    lock (state.ProbeLock)
    {
        // Already demoted? Don't re-arm — the original window's expiry is the
        // operator-facing recovery clock and re-arming on every subsequent failed
        // read would suppress reads forever on a fully-down device. The probe
        // loop is what eventually clears the demotion (or the window expiring).
        if (state.DemotedUntilUtc is { } until && until > nowUtc) return false;

        state.DemotedUntilUtc = nowUtc + demote.EffectiveDemoteFor;
    }

    // Diagnostics are updated outside ProbeLock so this method never holds the
    // device lock while calling into the diagnostics store.
    _diagnosticTags.RecordDemote(state.Options.HostAddress, nowUtc);
    return true;
}
// ---- IReadable ----
public async Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
@@ -323,6 +379,45 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
// double-counting the original attempt as a retry.
_diagnosticTags.RecordRequest(def.DeviceHostAddress);
// PR ablegacy-12 / #255 — auto-demote short-circuit. When the device's demote
// window is still active we return BadCommunicationError immediately, without
// touching libplctag or its retry loop. That's the whole point of the feature:
// one slow PLC sharing the driver thread can't drag down healthy peers. We
// don't bump ErrorCount/CommFailures here because this isn't a fresh field
// failure — it's the cool-down on a previously-counted one.
if (device.DemotedUntilUtc is { } demotedUntil)
{
if (demotedUntil > now)
{
results[i] = new DataValueSnapshot(null,
AbLegacyStatusMapper.BadCommunicationError, null, now);
continue;
}
// Window expired without an early-clear from a probe success — drop the
// marker but don't reset ConsecutiveFailures yet. If this read also
// fails the failure tally keeps counting from where it left off, so a
// permanently-down device re-arms the window after one more
// consecutive failure (vs. having to repeat the full threshold).
lock (device.ProbeLock)
{
if (device.DemotedUntilUtc is { } stillUntil && stillUntil <= now)
{
device.DemotedUntilUtc = null;
// Mirror Stopped→Running on a probe-driven recovery: leave the
// HostState transition to the probe loop (or the upcoming success
// below); we just clear the cool-down marker so the next read
// dispatches normally.
if (device.HostState == HostState.Demoted)
{
// Surface a transition out of Demoted. The probe loop will
// bring it Running once a probe succeeds; until then leave
// it in Stopped to reflect "we don't actually know it's up".
TransitionDeviceState(device, HostState.Stopped);
}
}
}
}
// PR 9 — per-device retry loop: on transient BadCommunicationError (libplctag throw
// OR a non-zero status that maps to BadCommunicationError) retry up to N times. A
// terminal mapped status (e.g. BadNodeIdUnknown for a missing PLC tag, BadTypeMismatch
@@ -360,6 +455,15 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
status,
$"libplctag status {status} reading {reference}",
commFailure: mappedStatus == AbLegacyStatusMapper.BadCommunicationError);
// PR ablegacy-12 / #255 — only comm failures count toward the
// demote tally. A BadNodeIdUnknown / BadTypeMismatch is a config
// / decoder mismatch, not a sign the host is unreachable, so
// demoting on it would punish the operator for a typo.
if (mappedStatus == AbLegacyStatusMapper.BadCommunicationError
&& RecordFailureAndMaybeDemote(device, now))
{
TransitionDeviceState(device, HostState.Demoted);
}
snapshot = new DataValueSnapshot(null, mappedStatus, null, now);
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead,
$"libplctag status {status} reading {reference}");
@@ -385,6 +489,13 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
_health = new DriverHealth(DriverState.Healthy, now, null);
// PR ablegacy-10 / #253 — successful array read.
_diagnosticTags.RecordResponse(def.DeviceHostAddress);
// PR ablegacy-12 / #255 — successful read clears the
// consecutive-failure tally. We do NOT auto-clear DemotedUntilUtc
// here — the demote window is honoured to its full duration so an
// intermittent link that just happened to answer once doesn't
// immediately re-flood the channel. Probe success is the early
// recovery path.
Interlocked.Exchange(ref device.ConsecutiveFailures, 0);
break;
}
@@ -398,6 +509,10 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
_health = new DriverHealth(DriverState.Healthy, now, null);
// PR ablegacy-10 / #253 — successful scalar / sub-element / bit read.
_diagnosticTags.RecordResponse(def.DeviceHostAddress);
// PR ablegacy-12 / #255 — successful read clears the
// consecutive-failure tally; demote window keeps running
// until a probe success or natural expiry.
Interlocked.Exchange(ref device.ConsecutiveFailures, 0);
break;
}
catch (OperationCanceledException) { throw; }
@@ -414,6 +529,12 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
libplctagStatus: 0,
errorMessage: ex.Message,
commFailure: true);
// PR ablegacy-12 / #255 — exception-driven comm failure counts
// toward the demote tally just like a status-mapped one.
if (RecordFailureAndMaybeDemote(device, now))
{
TransitionDeviceState(device, HostState.Demoted);
}
snapshot = new DataValueSnapshot(null,
AbLegacyStatusMapper.BadCommunicationError, null, now);
_health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, ex.Message);
@@ -591,6 +712,14 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
"Most recent libplctag error message on a failed read; empty when no error has been seen since the last reset.");
EmitDiagnosticVariable(diag, deviceHostAddress, "CommFailures", DriverDataType.Int64,
"Count of read failures mapped to BadCommunicationError. Spans transient libplctag throws + retried-out chains so operators see a single 'wire fell off' counter.");
// PR ablegacy-12 / #255 — auto-demote surface. DemoteCount is cumulative
// across reinit (preserved in ReinitializeAsync); LastDemotedUtc is a
// string (ISO-8601 UTC) so HMIs can bind directly without a separate
// DateTime decoder. Empty string means "never demoted".
EmitDiagnosticVariable(diag, deviceHostAddress, "DemoteCount", DriverDataType.Int64,
"Cumulative auto-demote events for this device — bumps every time the driver crosses the consecutive-failure threshold and arms a fresh cool-down window. Survives ReinitializeAsync.");
EmitDiagnosticVariable(diag, deviceHostAddress, "LastDemotedUtc", DriverDataType.String,
"ISO-8601 UTC timestamp of the most recent auto-demotion; empty when this device has never been demoted.");
}
private static void EmitDiagnosticVariable(
@@ -665,7 +794,39 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
state.ProbeInitialized = false;
}
TransitionDeviceState(state, success ? HostState.Running : HostState.Stopped);
// PR ablegacy-12 / #255 — probe success is the early-recovery path: clear
// any active demote window + reset the failure tally so the next read
// dispatches normally. Probe failure participates in the same shared
// failure-tally as ReadAsync so a device with no live read traffic still
// demotes on a sustained outage.
if (success)
{
bool wasDemoted;
lock (state.ProbeLock)
{
wasDemoted = state.DemotedUntilUtc is not null;
state.DemotedUntilUtc = null;
}
Interlocked.Exchange(ref state.ConsecutiveFailures, 0);
TransitionDeviceState(state, HostState.Running);
_ = wasDemoted; // intentionally observed for future telemetry hooks
}
else
{
if (RecordFailureAndMaybeDemote(state, DateTime.UtcNow))
{
TransitionDeviceState(state, HostState.Demoted);
}
else
{
// Mid-tally probe failure: surface as Stopped if not already
// Demoted. This preserves pre-PR-12 behaviour for the common
// case (FailureThreshold=3 + a single hiccup ends up Stopped,
// not Demoted).
if (state.HostState != HostState.Demoted)
TransitionDeviceState(state, HostState.Stopped);
}
}
try { await Task.Delay(_options.Probe.Interval, ct).ConfigureAwait(false); }
catch (OperationCanceledException) { break; }
@@ -890,6 +1051,25 @@ public sealed class AbLegacyDriver : IDriver, IReadable, IWritable, ITagDiscover
public CancellationTokenSource? ProbeCts { get; set; }
public bool ProbeInitialized { get; set; }
/// <summary>
/// PR ablegacy-12 / #255 — running tally of consecutive read / probe failures.
/// Reset on every successful read or probe; tripping
/// <see cref="AbLegacyDemoteOptions.FailureThreshold"/> arms the demote window.
/// Read + written via <see cref="Interlocked"/> because read + probe loops can
/// touch it concurrently.
/// </summary>
public int ConsecutiveFailures;
/// <summary>
/// PR ablegacy-12 / #255 — when set, reads against this device short-circuit
/// with <c>BadCommunicationError</c> until the timestamp passes; cleared early
/// by a successful probe. Code that changes the value is expected to hold
/// <see cref="ProbeLock"/> while doing so; the read path inspects the property
/// without locking.
/// </summary>
/// <remarks>
/// NOTE(review): the property type is <see cref="DateTimeOffset"/>?, which is wider
/// than a single 64-bit word, so an unlocked read could observe a torn value on any
/// platform, not only x86. The intended worst case is one extra dispatched read (or
/// one extra short-circuit) — confirm that is acceptable, or read under
/// <see cref="ProbeLock"/> as well.
/// </remarks>
public DateTimeOffset? DemotedUntilUtc { get; set; }
public void DisposeRuntimes()
{
foreach (var r in Runtimes.Values) r.Dispose();

View File

@@ -45,7 +45,14 @@ public static class AbLegacyDriverFactoryExtensions
DeviceName: d.DeviceName,
// PR 9 — per-device timeout / retry overrides. Device-level wins over driver-wide.
Timeout: d.TimeoutMs is int devMs ? TimeSpan.FromMilliseconds(devMs) : null,
Retries: d.Retries))]
Retries: d.Retries,
// PR ablegacy-12 / #255 — auto-demote knobs.
Demote: d.Demote is null ? null : new AbLegacyDemoteOptions(
FailureThreshold: d.Demote.FailureThreshold ?? 3,
DemoteFor: d.Demote.DemoteForMs is int demMs
? TimeSpan.FromMilliseconds(demMs)
: null,
Enabled: d.Demote.Enabled ?? true)))]
: [],
Tags = dto.Tags is { Count: > 0 }
? [.. dto.Tags.Select(t => new AbLegacyTagDefinition(
@@ -209,6 +216,26 @@ public static class AbLegacyDriverFactoryExtensions
/// <c>null</c> at both levels = single attempt.
/// </summary>
public int? Retries { get; init; }
/// <summary>
/// PR ablegacy-12 / #255 — optional per-device auto-demote knobs. <c>null</c>
/// means "use the documented defaults" (<c>FailureThreshold=3</c>,
/// <c>DemoteFor=30s</c>, <c>Enabled=true</c>) — the driver still demotes by
/// default. Set <c>Enabled=false</c> in the JSON to opt out entirely.
/// </summary>
public AbLegacyDemoteDto? Demote { get; init; }
}
/// <summary>
/// PR ablegacy-12 / #255 — JSON DTO for the auto-demote knobs. Times are
/// ms-suffixed for consistency with the rest of the driver config (TimeoutMs,
/// IntervalMs). Every member is optional; the factory substitutes a default for
/// each one left <c>null</c> when it maps this DTO to the driver options.
/// </summary>
internal sealed class AbLegacyDemoteDto
{
/// <summary>Consecutive comm failures before the device is demoted. <c>null</c> maps to 3.</summary>
public int? FailureThreshold { get; init; }
/// <summary>Cool-down window in milliseconds. <c>null</c> leaves the window unset, which resolves to the 30-second default.</summary>
public int? DemoteForMs { get; init; }
/// <summary>Whether auto-demote is active for the device. <c>null</c> maps to <c>true</c>.</summary>
public bool? Enabled { get; init; }
}
internal sealed class AbLegacyTagDto

View File

@@ -41,7 +41,39 @@ public sealed record AbLegacyDeviceOptions(
AbLegacyPlcFamily PlcFamily = AbLegacyPlcFamily.Slc500,
string? DeviceName = null,
TimeSpan? Timeout = null,
int? Retries = null);
int? Retries = null,
AbLegacyDemoteOptions? Demote = null);
/// <summary>
/// PR ablegacy-12 / #255 — settings for the auto-demote back-off. Once a device
/// has failed <see cref="FailureThreshold"/> reads or probes in a row, the driver
/// marks it <c>Demoted</c> for <see cref="DemoteFor"/>. While demoted, reads are
/// answered immediately with <c>BadCommunicationError</c> rather than being sent
/// through libplctag, which keeps one slow PLC from starving faster peers on the
/// same driver. A successful probe ends the demotion early; a successful read
/// only zeroes the consecutive-failure counter and leaves the window running.
/// </summary>
/// <param name="FailureThreshold">Number of consecutive read or probe failures
/// that triggers a demotion. Default <c>3</c>.</param>
/// <param name="DemoteFor">How long reads stay suppressed unless a probe succeeds
/// first. <c>null</c> selects the default of <c>30s</c>.</param>
/// <param name="Enabled">When <c>false</c> the failure tally still ticks but the
/// driver never sets the demoted window — useful when an operator wants the
/// diagnostic counters without the throttling behaviour.</param>
public sealed record AbLegacyDemoteOptions(
    int FailureThreshold = 3,
    TimeSpan? DemoteFor = null,
    bool Enabled = true)
{
    /// <summary>
    /// The demote window actually applied. An optional parameter's default must be
    /// a compile-time constant and <c>TimeSpan.FromSeconds(30)</c> is not one, so
    /// <see cref="DemoteFor"/> defaults to <c>null</c> and the documented 30-second
    /// fallback is supplied here instead.
    /// </summary>
    public TimeSpan EffectiveDemoteFor
    {
        get
        {
            if (DemoteFor is { } explicitWindow)
            {
                return explicitWindow;
            }

            return TimeSpan.FromSeconds(30);
        }
    }
}
/// <summary>
/// One PCCC-backed OPC UA variable. <c>Address</c> is the canonical PCCC file-address

View File

@@ -138,6 +138,11 @@ public sealed class HostStatusPublisher(
HostState.Running => DriverHostState.Running,
HostState.Stopped => DriverHostState.Stopped,
HostState.Faulted => DriverHostState.Faulted,
// PR ablegacy-12 / #255 — Demoted is a driver-side back-off (skipped reads while
// we wait for a flaky host to recover). The Configuration enum doesn't have a
// dedicated value; surface it as Stopped so the Admin UI lights it up red-ish
// without the publisher needing a schema migration to differentiate.
HostState.Demoted => DriverHostState.Stopped,
_ => DriverHostState.Unknown,
};
}