review(Driver.Historian.Wonderware): AtTime fails over on connection-class errors

Re-review at 7286d320. -014 (Medium): ReadAtTimeAsync did not classify StartQuery failures,
so a connection-class failure left a dead connection in place, re-failed on every remaining
timestamp, and still returned Success=true with an all-Bad result (no failover). It now resets
the connection and fails over, using a shared classifier, and is covered by new tests.
-015: refresh stale named-pipe comments to say TCP (no wire change). -013 (silent cap truncation,
tied to OpcUaServer-002/Core.Abstractions-009) is deferred as a cross-module change. NOTE: the
SDK-touching tests target net48 + native aahClientManaged and run only on Windows; macOS verifies
the build and the SDK-free subset only.
This commit is contained in:
Joseph Doherty
2026-06-19 11:47:11 -04:00
parent e07a4fbf52
commit b3907efa6e
8 changed files with 225 additions and 11 deletions
@@ -6,7 +6,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
/// Wonderware Historian SDK configuration. Populated from environment variables at
/// sidecar startup (see <c>Program.cs</c>): the supervisor (lmxopcua-side
/// <c>WonderwareHistorianClient</c>) spawns the sidecar with these env vars; UA
/// translation lives on the client side of the named-pipe IPC, so this surface is
/// translation lives on the client side of the TCP IPC, so this surface is
/// kept OPC-UA-free. The legacy v1 Galaxy.Host / Proxy host this lived in was retired
/// in PR 7.2.
/// </summary>
@@ -11,7 +11,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
/// <summary>
/// Reads historical data from the Wonderware Historian via the aahClientManaged SDK.
/// OPC-UA-free — emits <see cref="HistorianSample"/>/<see cref="HistorianAggregateSample"/>
/// which the sidecar serialises onto the named-pipe wire (PR 3.3 contracts) for the
/// which the sidecar serialises onto the TCP wire (PR 3.3 contracts) for the
/// .NET 10 <c>WonderwareHistorianClient</c> to translate into OPC UA <c>DataValue</c>
/// on its side of the IPC. The v1 Galaxy.Host / Proxy architecture this class
/// originally lived in was retired in PR 7.2.
@@ -86,13 +86,27 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
internal static bool IsConnectionClassError(HistorianAccessError.ErrorValue code)
{
    // Membership in ConnectionErrorCodes is the only criterion applied here.
    var isConnectionClass = ConnectionErrorCodes.Contains(code);
    return isConnectionClass;
}
/// <summary>
/// Whether a failed <c>StartQuery</c> in the per-timestamp at-time loop should reset
/// the shared SDK connection (and abort the read) rather than record a per-timestamp
/// Bad sample and continue. Returns <c>true</c> only for connection-class error
/// codes; query-class / no-data codes (and a missing error) return <c>false</c> so
/// a single bad/empty timestamp does not tear down a connection that is still serving
/// the other timestamps. The <c>HistoryQuery</c> SDK type is non-virtual and has no
/// interface, so the at-time loop can't be driven offline — this pure helper is the
/// unit-testable seam for the classification. See Driver.Historian.Wonderware-014.
/// </summary>
/// <param name="error">
/// The SDK error returned by the failed <c>StartQuery</c>; may be <c>null</c> when the
/// SDK reported a failure without an error object.
/// </param>
/// <returns>
/// <c>true</c> when <paramref name="error"/> is non-null and its code is classified as
/// connection-class by <see cref="IsConnectionClassError"/>; otherwise <c>false</c>.
/// </returns>
internal static bool ShouldResetConnectionForStartQueryFailure(HistorianAccessError? error)
{
    // A missing error gives no evidence that the connection is dead. Return false
    // explicitly instead of substituting a fallback code, so the documented contract
    // does not depend on which codes happen to be listed in ConnectionErrorCodes.
    if (error is null)
    {
        return false;
    }

    return IsConnectionClassError(error.ErrorCode);
}
/// <summary>
/// Builds the per-read <see cref="CancellationTokenSource"/> linked into the
/// caller's <paramref name="ct"/> and pre-wired to fire after
/// <see cref="HistorianConfiguration.RequestTimeoutSeconds"/> if positive. The
/// read paths use the resulting token in their <c>ThrowIfCancellationRequested</c>
/// checks so a hung <c>StartQuery</c> or slow <c>MoveNext</c> cannot block the
/// single pipe-server connection thread indefinitely. See
/// single TCP-server connection thread indefinitely. See
/// Driver.Historian.Wonderware-010.
/// </summary>
/// <param name="cfg">The historian configuration.</param>
@@ -409,7 +423,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
var results = new List<HistorianSample>();
// Driver.Historian.Wonderware-010: wire RequestTimeoutSeconds into the read path
// so a hung StartQuery / slow MoveNext can't block the connection thread forever.
// so a hung StartQuery / slow MoveNext can't block the TCP connection thread forever.
using var requestCts = BuildRequestCts(_config, ct);
var token = requestCts.Token;
@@ -609,6 +623,22 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
if (!query.StartQuery(args, out var error))
{
// Driver.Historian.Wonderware-014: classify the failure like the raw /
// aggregate / event paths. A connection-class code means the shared
// connection is dead — throw so the whole at-time read aborts and the IPC
// layer surfaces Success=false (the outer catch resets the connection and
// marks the node failed). Without this, every remaining timestamp would
// re-fail StartQuery on the dead connection and the method would still
// report Success=true with an all-Bad result, never failing over. A
// query-class / no-data code keeps the connection and records a Bad sample
// for just this timestamp.
if (ShouldResetConnectionForStartQueryFailure(error))
{
var code = error?.ErrorCode ?? HistorianAccessError.ErrorValue.Failure;
throw new InvalidOperationException(
$"Historian SDK StartQuery failed for at-time query of tag '{tagName}': {code} ({error?.ErrorDescription})");
}
results.Add(new HistorianSample
{
Value = null,
@@ -4,7 +4,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
{
/// <summary>
/// OPC-UA-free representation of a single historical data point. The sidecar serialises
/// these onto the named-pipe wire (<c>HistorianSampleDto</c>) for the .NET 10
/// these onto the TCP wire (<c>HistorianSampleDto</c>) for the .NET 10
/// <c>WonderwareHistorianClient</c>, which maps quality and value into OPC UA
/// <c>DataValue</c> on its side. Raw OPC DA quality byte is preserved so the client
/// can reuse the same quality mapper it already uses for live reads.
@@ -8,7 +8,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
/// <summary>
/// OPC-UA-free surface for the Wonderware Historian subsystem inside the historian
/// sidecar process. Implementations read via the aahClient* SDK; the .NET 10
/// <c>WonderwareHistorianClient</c> on the other side of the named-pipe IPC maps
/// <c>WonderwareHistorianClient</c> on the other side of the TCP IPC maps
/// returned samples to OPC UA <c>DataValue</c>. The v1 Galaxy.Host / Proxy hosts
/// this lived in were retired in PR 7.2.
/// </summary>
@@ -4,7 +4,7 @@ using MessagePack;
namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Ipc;
// ============================================================================
// Wire DTOs for the sidecar pipe protocol. The sidecar speaks its own legacy
// Wire DTOs for the sidecar TCP protocol. The sidecar speaks its own legacy
// shape (List<HistorianSample> etc.) — the .NET 10 client (PR 3.4) translates
// to / from Core.Abstractions.DataValueSnapshot + HistoricalEvent.
//
@@ -1,7 +1,7 @@
namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Ipc;
/// <summary>
/// Length-prefixed framing constants for the Wonderware historian sidecar pipe protocol.
/// Length-prefixed framing constants for the Wonderware historian sidecar TCP protocol.
/// Each frame on the wire is:
/// <c>[4-byte big-endian length][1-byte message kind][MessagePack body]</c>.
/// Length is the body size only; the kind byte is not part of the prefixed length.