fix(focas): serialize per-device wire I/O + bound reads; tolerate AdminUI config formats

Equipment tags were stuck at Bad_WaitingForInitialData on the deployed driver: the equipment poll, fixed-tree loop, probe and recycle shared one FOCAS/2 socket with no serialization, and the steady-state read had no timeout — concurrent reads collided and a stalled read hung forever, never overwriting the node's initial-data seed.

- SynchronizedFocasClient: per-device SemaphoreSlim gate around every wire op, plus a per-call timeout on the data reads/writes (Connect/Probe are gated only, so their own budgets are not double-bounded); wired in EnsureConnectedAsync. FocasDriver.ReadAsync/WriteAsync map a per-call timeout to BadCommunicationError instead of rethrowing.
- FlexibleStringConverter on FOCAS config Series: the AdminUI persists the enum as a number ("series":6); accept number-or-string instead of throwing during deserialization and falling back to the stub driver.
- FocasHostAddress.TryParse tolerates a scheme-less {ip}[:{port}] (AdminUI hostAddress form); canonical focas:// unchanged, malformed schemes still rejected.

247 FOCAS tests green; each fix has a regression test. Live-validated on wonder-app-vd03 (tags read Good).
This commit is contained in:
Joseph Doherty
2026-06-26 05:59:54 -04:00
parent 20b2df9241
commit 235b8b8e6d
9 changed files with 484 additions and 11 deletions
@@ -306,7 +306,16 @@ public sealed class FocasDriver : IDriver, IReadable, IWritable, ITagDiscovery,
Volatile.Read(ref _health).LastSuccessfulRead,
$"FOCAS status 0x{status:X8} reading {reference}"));
}
catch (OperationCanceledException) { throw; }
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { throw; }
catch (OperationCanceledException)
{
// Per-call timeout (not external cancellation) — the read stalled past the device
// Timeout budget. Surface a recoverable comm error so the BadWaitingForInitialData
// seed is overwritten and health degrades, instead of the read hanging forever.
results[i] = new DataValueSnapshot(null, FocasStatusMapper.BadCommunicationError, null, now);
Volatile.Write(ref _health, new DriverHealth(DriverState.Degraded,
Volatile.Read(ref _health).LastSuccessfulRead, $"FOCAS read timed out for {reference}"));
}
catch (Exception ex)
{
results[i] = new DataValueSnapshot(null, FocasStatusMapper.BadCommunicationError, null, now);
@@ -356,7 +365,15 @@ public sealed class FocasDriver : IDriver, IReadable, IWritable, ITagDiscovery,
var status = await client.WriteAsync(parsed, def.DataType, w.Value, cancellationToken).ConfigureAwait(false);
results[i] = new WriteResult(status);
}
catch (OperationCanceledException) { throw; }
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { throw; }
catch (OperationCanceledException)
{
// Per-call timeout (not external cancellation) — the write stalled past the device
// Timeout budget. Surface a recoverable comm error rather than aborting the batch.
results[i] = new WriteResult(FocasStatusMapper.BadCommunicationError);
Volatile.Write(ref _health, new DriverHealth(DriverState.Degraded,
Volatile.Read(ref _health).LastSuccessfulRead, $"FOCAS write timed out for {w.FullReference}"));
}
catch (NotSupportedException nse)
{
results[i] = new WriteResult(FocasStatusMapper.BadNotSupported);
@@ -1113,7 +1130,11 @@ public sealed class FocasDriver : IDriver, IReadable, IWritable, ITagDiscovery,
device.Client = null;
}
device.Client = _clientFactory.Create();
// Wrap the raw wire client so every operation on the device's single FOCAS/2 socket is
// serialized (request→response on one socket cannot interleave) and time-bounded. Without
// this, the equipment poll, fixed-tree loop, probe, and recycle loop collide on the shared
// socket and a stalled read blocks forever — leaving bound tags at BadWaitingForInitialData.
device.Client = new SynchronizedFocasClient(_clientFactory.Create(), _options.Timeout);
try
{
await device.Client.ConnectAsync(device.ParsedAddress, _options.Timeout, ct).ConfigureAwait(false);
@@ -195,12 +195,41 @@ public static class FocasDriverFactoryExtensions
AllowTrailingCommas = true,
};
/// <summary>
/// JSON converter for string-typed properties that may arrive as either a JSON string or a
/// JSON <b>number</b>. The AdminUI stores the FOCAS <c>Series</c> enum as its integer value
/// (e.g. <c>"series":6</c>), whereas the DTO exposes <c>Series</c> as a string that is later
/// passed to <see cref="ParseSeries"/>. Without this converter System.Text.Json rejects the
/// bare number ("Cannot get the value of a token type 'Number' as a string") and the driver
/// falls back to a stub. String, number and null tokens are accepted; any other token is an
/// error. The produced value is always a string (or <c>null</c>).
/// </summary>
internal sealed class FlexibleStringConverter : JsonConverter<string?>
{
    /// <summary>Reads a string, number, or null token and returns it as a string.</summary>
    /// <param name="reader">Reader positioned on the property value token.</param>
    /// <param name="typeToConvert">Unused; the target type is always <see cref="string"/>.</param>
    /// <param name="options">Unused serializer options.</param>
    /// <returns>The string value, the invariant-culture text of the number, or <c>null</c>.</returns>
    /// <exception cref="JsonException">The token is not a string, number, or null.</exception>
    public override string? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        switch (reader.TokenType)
        {
            case JsonTokenType.String:
                return reader.GetString();

            case JsonTokenType.Number:
                // Integral values are formatted as integers; anything else goes through double.
                if (reader.TryGetInt64(out var integral))
                {
                    return integral.ToString(System.Globalization.CultureInfo.InvariantCulture);
                }

                var floating = reader.GetDouble();
                return floating.ToString(System.Globalization.CultureInfo.InvariantCulture);

            case JsonTokenType.Null:
                return null;

            default:
                throw new JsonException($"Expected string, number, or null but got {reader.TokenType}.");
        }
    }

    /// <summary>Writes the value back as a JSON string, or JSON null when it is <c>null</c>.</summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">The string to emit, or <c>null</c>.</param>
    /// <param name="options">Unused serializer options.</param>
    public override void Write(Utf8JsonWriter writer, string? value, JsonSerializerOptions options)
    {
        if (value is not null)
        {
            writer.WriteStringValue(value);
            return;
        }

        writer.WriteNullValue();
    }
}
internal sealed class FocasDriverConfigDto
{
/// <summary>Gets or sets the FOCAS client factory backend name (e.g. "wire" or "stub").</summary>
public string? Backend { get; init; }
/// <summary>Gets or sets the CNC series for this driver.</summary>
[JsonConverter(typeof(FlexibleStringConverter))]
public string? Series { get; init; }
/// <summary>Gets or sets the operation timeout in milliseconds.</summary>
@@ -234,6 +263,7 @@ public static class FocasDriverFactoryExtensions
public string? DeviceName { get; init; }
/// <summary>Gets or sets the CNC series for this device (overrides top-level series if provided).</summary>
[JsonConverter(typeof(FlexibleStringConverter))]
public string? Series { get; init; }
/// <summary>
@@ -21,9 +21,19 @@ public sealed record FocasHostAddress(string Host, int Port)
{
if (string.IsNullOrWhiteSpace(value)) return null;
const string prefix = "focas://";
if (!value.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)) return null;
var body = value[prefix.Length..];
// Canonical form is focas://{ip}[:{port}], but the AdminUI persists the device host as a
// scheme-less "{ip}[:{port}]" (e.g. "10.201.31.5:8193"). Accept that too: take the body
// after focas:// when present, else the whole value when it carries NO other URI scheme
// (a "://" that isn't ours — e.g. http:// — is still rejected). The host-contains-colon
// guard below then rejects malformed scheme typos like "focas:10.0.0.5:8193".
string body;
if (value.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
body = value[prefix.Length..];
else if (!value.Contains("://", StringComparison.Ordinal))
body = value;
else
return null;
if (string.IsNullOrEmpty(body)) return null;
var colonIdx = body.LastIndexOf(':');
@@ -39,7 +49,9 @@ public sealed record FocasHostAddress(string Host, int Port)
{
host = body;
}
if (string.IsNullOrEmpty(host)) return null;
// Empty host, or a host still carrying a colon (e.g. the malformed "focas:10.0.0.5" left
// when someone wrote "focas:10.0.0.5:8193" without the //), is invalid.
if (string.IsNullOrEmpty(host) || host.Contains(':', StringComparison.Ordinal)) return null;
return new FocasHostAddress(host, port);
}
}
@@ -0,0 +1,152 @@
namespace ZB.MOM.WW.OtOpcUa.Driver.FOCAS;
/// <summary>
/// Decorates an <see cref="IFocasClient"/> so that every wire operation on the device's
/// single FOCAS/2 socket is (1) <b>serialized</b> against all other operations and
/// (2) <b>time-bounded</b>.
/// </summary>
/// <remarks>
/// <para>FOCAS/2 over TCP:8193 is a strict request→response protocol on ONE socket. The
/// driver holds a single <see cref="IFocasClient"/> per device, but several independent loops
/// read from it concurrently — the equipment poll (<see cref="FocasDriver.ReadAsync"/>), the
/// fixed-tree loop (<c>FixedTreeLoopAsync</c>), the connectivity probe, and the recycle loop.
/// Without serialization, two reads interleave their <c>send(request); read(response)</c> on the
/// same socket: one reader consumes the other's response PDU and the victim then blocks forever
/// waiting for bytes that never arrive — leaving the bound OPC UA node stuck at
/// <c>BadWaitingForInitialData</c>. This was the root cause of FOCAS equipment tags never
/// surfacing a value while the probe reported HEALTHY (the probe reads work single-threaded on a
/// dev box, but collide deployed once the fixed-tree loop runs concurrently).</para>
///
/// <para>The gate (<see cref="SemaphoreSlim"/> of count 1) makes each request→response atomic on
/// the socket. The per-call timeout ensures a stalled response can never hold the gate — and thus
/// the socket — indefinitely; a hung read surfaces as a recoverable error at the configured
/// <c>Timeout</c> budget instead of permanent silence. The gate and timeout are paired
/// deliberately: a lock around an <i>unbounded</i> read would deadlock all I/O for the device.</para>
///
/// <para><see cref="ConnectAsync"/> and <see cref="ProbeAsync"/> are serialized but NOT bounded by
/// this decorator's call timeout — they carry their own budgets (the connect timeout argument and
/// the probe's caller-supplied linked token respectively), and double-bounding would shrink them.</para>
///
/// <para><see cref="Dispose"/> may run while another loop still holds or waits on the gate. The
/// gate itself is therefore never disposed: the holder can always release it, and queued callers
/// wake up and fail with <see cref="ObjectDisposedException"/> instead of waiting on a semaphore
/// that will never be released.</para>
/// </remarks>
public sealed class SynchronizedFocasClient : IFocasClient
{
    private readonly IFocasClient _inner;
    private readonly TimeSpan _callTimeout;

    // Intentionally NOT disposed in Dispose(): SemaphoreSlim.Release() throws
    // ObjectDisposedException once the semaphore is disposed, which would replace the in-flight
    // operation's real outcome and strand every caller queued behind it. A SemaphoreSlim whose
    // AvailableWaitHandle is never accessed owns no wait handle, so leaving it to the GC is safe.
    private readonly SemaphoreSlim _gate = new(1, 1);

    // 0 = live, 1 = disposed. Accessed via Interlocked/Volatile because Dispose can race with
    // operations running on other loops.
    private int _disposed;

    /// <summary>Wraps <paramref name="inner"/> with per-device serialization + a per-call timeout.</summary>
    /// <param name="inner">The underlying FOCAS client to serialize access to.</param>
    /// <param name="callTimeout">
    /// The budget applied to each data read/write. <see cref="TimeSpan.Zero"/> or negative disables
    /// the per-call timeout (callers' own cancellation tokens still apply).
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="inner"/> is <c>null</c>.</exception>
    public SynchronizedFocasClient(IFocasClient inner, TimeSpan callTimeout)
    {
        _inner = inner ?? throw new ArgumentNullException(nameof(inner));
        _callTimeout = callTimeout;
    }

    /// <inheritdoc />
    public bool IsConnected => _inner.IsConnected;

    /// <inheritdoc />
    public Task ConnectAsync(FocasHostAddress address, TimeSpan timeout, CancellationToken cancellationToken) =>
        RunGatedAsync(ct => _inner.ConnectAsync(address, timeout, ct), cancellationToken);

    /// <inheritdoc />
    public Task<bool> ProbeAsync(CancellationToken cancellationToken) =>
        RunGatedAsync(ct => _inner.ProbeAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<(object? value, uint status)> ReadAsync(
        FocasAddress address, FocasDataType type, CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.ReadAsync(address, type, ct), cancellationToken);

    /// <inheritdoc />
    public Task<uint> WriteAsync(
        FocasAddress address, FocasDataType type, object? value, CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.WriteAsync(address, type, value, ct), cancellationToken);

    /// <inheritdoc />
    public Task<IReadOnlyList<FocasActiveAlarm>> ReadAlarmsAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.ReadAlarmsAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<FocasSysInfo> GetSysInfoAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetSysInfoAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<IReadOnlyList<FocasAxisName>> GetAxisNamesAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetAxisNamesAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<IReadOnlyList<FocasSpindleName>> GetSpindleNamesAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetSpindleNamesAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<FocasDynamicSnapshot> ReadDynamicAsync(int axisIndex, CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.ReadDynamicAsync(axisIndex, ct), cancellationToken);

    /// <inheritdoc />
    public Task<FocasProgramInfo> GetProgramInfoAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetProgramInfoAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<FocasTimer> GetTimerAsync(FocasTimerKind kind, CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetTimerAsync(kind, ct), cancellationToken);

    /// <inheritdoc />
    public Task<IReadOnlyList<FocasServoLoad>> GetServoLoadsAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetServoLoadsAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<IReadOnlyList<int>> GetSpindleLoadsAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetSpindleLoadsAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<IReadOnlyList<int>> GetSpindleMaxRpmsAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetSpindleMaxRpmsAsync(ct), cancellationToken);

    /// <inheritdoc />
    public Task<IReadOnlyList<int>> GetPositionFiguresAsync(CancellationToken cancellationToken) =>
        RunBoundedAsync(ct => _inner.GetPositionFiguresAsync(ct), cancellationToken);

    /// <summary>
    /// Disposes the inner client exactly once; repeated calls are no-ops. Does not wait for an
    /// in-flight operation and does not dispose the gate, so a concurrent holder can still
    /// release it and queued callers observe <see cref="ObjectDisposedException"/>.
    /// </summary>
    public void Dispose()
    {
        if (Interlocked.Exchange(ref _disposed, 1) != 0)
        {
            return;
        }

        _inner.Dispose();
    }

    // Gate only — the caller already governs the budget (connect timeout arg / probe linked token).
    private async Task<T> RunGatedAsync<T>(Func<CancellationToken, Task<T>> op, CancellationToken ct)
    {
        await EnterAsync(ct).ConfigureAwait(false);
        try { return await op(ct).ConfigureAwait(false); }
        finally { _gate.Release(); }
    }

    // Non-generic twin of RunGatedAsync<T> for operations without a result (ConnectAsync).
    private async Task RunGatedAsync(Func<CancellationToken, Task> op, CancellationToken ct)
    {
        await EnterAsync(ct).ConfigureAwait(false);
        try { await op(ct).ConfigureAwait(false); }
        finally { _gate.Release(); }
    }

    // Gate + per-call timeout. A fired timeout surfaces as OperationCanceledException whose token is
    // the linked (not the caller's) token — callers distinguish it from real cancellation by testing
    // their own token's IsCancellationRequested.
    private async Task<T> RunBoundedAsync<T>(Func<CancellationToken, Task<T>> op, CancellationToken ct)
    {
        await EnterAsync(ct).ConfigureAwait(false);
        try
        {
            // Zero/negative budget means "no decorator timeout": run under the caller's token only.
            if (_callTimeout <= TimeSpan.Zero)
            {
                return await op(ct).ConfigureAwait(false);
            }

            using var linked = CancellationTokenSource.CreateLinkedTokenSource(ct);
            linked.CancelAfter(_callTimeout);
            return await op(linked.Token).ConfigureAwait(false);
        }
        finally { _gate.Release(); }
    }

    // Acquires the gate, failing with ObjectDisposedException if the client was disposed either
    // before the wait started or while this caller was queued. On the failure path the gate is
    // released again so the next queued caller wakes up and fails the same way rather than hanging.
    private async Task EnterAsync(CancellationToken ct)
    {
        ThrowIfDisposed();
        await _gate.WaitAsync(ct).ConfigureAwait(false);
        if (Volatile.Read(ref _disposed) != 0)
        {
            _gate.Release();
            throw new ObjectDisposedException(nameof(SynchronizedFocasClient));
        }
    }

    private void ThrowIfDisposed()
    {
        if (Volatile.Read(ref _disposed) != 0)
        {
            throw new ObjectDisposedException(nameof(SynchronizedFocasClient));
        }
    }
}