fix(driver-historian-wonderware): resolve Low code-review findings (Driver.Historian.Wonderware-004,005,007,008,010,011,012)

- Driver.Historian.Wonderware-004: ToHistorianEvent synthesises a fresh
  Guid when the upstream EventId is unparseable and logs the substitution
  instead of writing the historian with Guid.Empty.
- Driver.Historian.Wonderware-005: GetHealthSnapshot derives the
  connection-open booleans from the active-node fields so the snapshot
  is self-consistent without depending on the secondary lock.
- Driver.Historian.Wonderware-007: SID-mismatch branch in PipeServer now
  sends a HelloAck { Accepted=false, RejectReason } so the client sees a
  symmetric rejection.
- Driver.Historian.Wonderware-008: classify StartQuery failures — both
  classes now throw so the IPC layer surfaces Success=false; only
  connection-class codes reset the connection, while query-class codes
  throw QueryClassStartQueryException and leave it intact.
- Driver.Historian.Wonderware-010: RequestTimeoutSeconds now enforced
  via BuildRequestCts linked to the caller's CancellationToken.
- Driver.Historian.Wonderware-011: refreshed XML docs to describe the
  current sidecar / named-pipe architecture (Galaxy.Host / Proxy
  references reframed as historical context).
- Driver.Historian.Wonderware-012: pinned the previously-uncovered
  HistorianDataSource behaviours with five new test files; also removed
  the stale empty tests/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests
  directory.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 08:18:10 -04:00
parent 42aa82de29
commit 1f29b215c8
14 changed files with 910 additions and 53 deletions
@@ -11,7 +11,10 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
/// <summary>
/// Reads historical data from the Wonderware Historian via the aahClientManaged SDK.
/// OPC-UA-free — emits <see cref="HistorianSample"/>/<see cref="HistorianAggregateSample"/>
/// which the Proxy maps to OPC UA <c>DataValue</c> on its side of the IPC.
/// which the sidecar serialises onto the named-pipe wire (PR 3.3 contracts) for the
/// .NET 10 <c>WonderwareHistorianClient</c> to translate into OPC UA <c>DataValue</c>
/// on its side of the IPC. The v1 Galaxy.Host / Proxy architecture this class
/// originally lived in retired in PR 7.2.
/// </summary>
public sealed class HistorianDataSource : IHistorianDataSource
{
@@ -50,6 +53,51 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
_picker = picker ?? new HistorianClusterEndpointPicker(config);
}
// SDK error codes that are treated as connection-class: the connection or server is
// the problem rather than the query itself. A query-class failure (bad tag name,
// unsupported aggregate, etc.) must not force us to tear down and re-open the
// (relatively expensive) historian connection — that would let a burst of bad-tag
// queries push an otherwise healthy cluster node into cooldown.
// Membership is the sole classification rule: IsConnectionClassError returns true only
// for codes listed here, so every unlisted code is handled as query-class.
// NOTE(review): HandleStartQueryFailure substitutes ErrorValue.Failure when the SDK
// hands back a null error; Failure is not listed, so that case leaves the connection
// open — confirm this is intended.
// See Driver.Historian.Wonderware-008.
private static readonly HashSet<HistorianAccessError.ErrorValue> ConnectionErrorCodes =
new HashSet<HistorianAccessError.ErrorValue>
{
HistorianAccessError.ErrorValue.FailedToConnect,
HistorianAccessError.ErrorValue.FailedToCreateSession,
HistorianAccessError.ErrorValue.NoReply,
HistorianAccessError.ErrorValue.NotReady,
HistorianAccessError.ErrorValue.NotInitialized,
HistorianAccessError.ErrorValue.Stopping,
HistorianAccessError.ErrorValue.Win32Exception,
HistorianAccessError.ErrorValue.InvalidResponse,
};
/// <summary>
/// Classifies an <c>aahClientManaged</c> error code as connection-class or not.
/// A connection-class code means the <em>connection</em>, not the query payload, is at
/// fault, so the shared SDK connection should be reset. Internal for unit testing.
/// </summary>
/// <param name="code">The SDK error code to classify.</param>
/// <returns>
/// <c>true</c> when <paramref name="code"/> is a member of
/// <see cref="ConnectionErrorCodes"/>; <c>false</c> for every other code.
/// </returns>
internal static bool IsConnectionClassError(HistorianAccessError.ErrorValue code)
{
    return ConnectionErrorCodes.Contains(code);
}
/// <summary>
/// Builds the per-read <see cref="CancellationTokenSource"/> linked to the caller's
/// <paramref name="ct"/> and, when
/// <see cref="HistorianConfiguration.RequestTimeoutSeconds"/> is positive, scheduled
/// to cancel after that many seconds. A zero or negative value adds no timeout, so
/// the returned source then only mirrors <paramref name="ct"/>.
/// </summary>
/// <remarks>
/// The read paths observe the token cooperatively through
/// <c>ThrowIfCancellationRequested</c> between result rows. That bounds a slow
/// result loop, but it cannot interrupt an SDK call that is itself blocked, because
/// <c>StartQuery</c> and <c>MoveNext</c> are not passed the token.
/// See Driver.Historian.Wonderware-010.
/// </remarks>
/// <param name="cfg">Configuration supplying <c>RequestTimeoutSeconds</c>.</param>
/// <param name="ct">Caller's token; cancelling it cancels the returned source.</param>
/// <returns>A new source that the caller owns and must dispose.</returns>
internal static CancellationTokenSource BuildRequestCts(HistorianConfiguration cfg, CancellationToken ct)
{
    var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    try
    {
        if (cfg.RequestTimeoutSeconds > 0)
        {
            // CancelAfter rejects delays above its supported maximum (int.MaxValue
            // milliseconds on .NET Framework). Clamp instead of letting a very large
            // configured value turn every read into an ArgumentOutOfRangeException.
            var timeout = TimeSpan.FromSeconds(cfg.RequestTimeoutSeconds);
            var longestSupported = TimeSpan.FromMilliseconds(int.MaxValue);
            cts.CancelAfter(timeout > longestSupported ? longestSupported : timeout);
        }

        return cts;
    }
    catch
    {
        // The linked source holds a registration on the caller's token. If we fail
        // before handing ownership to the caller, release it here so it cannot leak.
        cts.Dispose();
        throw;
    }
}
private (HistorianAccess Connection, string Node) ConnectToAnyHealthyNode(HistorianConnectionType type)
{
var candidates = _picker.GetHealthyNodes();
@@ -110,6 +158,13 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
foreach (var n in nodeStates)
if (n.IsHealthy) healthyCount++;
// Driver.Historian.Wonderware-005: derive the connection-open booleans from the
// active-node strings, both of which live under _healthLock. _connection itself
// is published under _connectionLock — reading it here under a different lock
// could produce an internally inconsistent snapshot (open with no node, or
// closed with a non-null node) at the publish/clear boundary. Treating the
// active-node strings as the single source of truth makes the snapshot
// self-consistent by construction.
lock (_healthLock)
{
return new HistorianHealthSnapshot
@@ -121,8 +176,8 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
LastSuccessTime = _lastSuccessTime,
LastFailureTime = _lastFailureTime,
LastError = _lastError,
ProcessConnectionOpen = Volatile.Read(ref _connection) != null,
EventConnectionOpen = Volatile.Read(ref _eventConnection) != null,
ProcessConnectionOpen = _activeProcessNode != null,
EventConnectionOpen = _activeEventNode != null,
ActiveProcessNode = _activeProcessNode,
ActiveEventNode = _activeEventNode,
NodeCount = nodeStates.Count,
@@ -245,6 +300,59 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
}
}
/// <summary>
/// Raised when <c>StartQuery</c> fails with an SDK error code that is
/// <em>query-class</em> (bad tag name, unsupported aggregate, etc.), i.e. a code for
/// which the shared SDK connection must NOT be reset. The read paths catch this type
/// ahead of their general handler and re-throw it untouched, so the IPC frame handler
/// surfaces <c>Success=false</c> while the connection stays open.
/// See Driver.Historian.Wonderware-008.
/// </summary>
internal sealed class QueryClassStartQueryException : InvalidOperationException
{
    /// <summary>The SDK error code supplied when the exception was created.</summary>
    public HistorianAccessError.ErrorValue Code { get; }

    /// <summary>Creates the exception with its message and the SDK error code.</summary>
    /// <param name="message">Human-readable failure description.</param>
    /// <param name="code">SDK error code exposed through <see cref="Code"/>.</param>
    public QueryClassStartQueryException(string message, HistorianAccessError.ErrorValue code)
        : base(message) => Code = code;
}
/// <summary>
/// Single place where a failed <c>StartQuery</c> is logged, recorded and converted
/// into an exception. This method always throws, so the caller surfaces
/// <c>Success=false</c> in the IPC reply instead of returning an empty result that
/// would read as "no data in range". The connection is reset only for
/// connection-class codes; query-class codes (bad tag name, unsupported aggregate,
/// etc.) leave the shared SDK connection intact so a burst of bad-tag queries cannot
/// cycle it and push a healthy cluster node into cooldown.
/// See Driver.Historian.Wonderware-008.
/// </summary>
/// <param name="operation">Short description of the query, used in the log line, the recorded failure and the exception message.</param>
/// <param name="error">Error returned by the SDK; a null value is treated as <c>ErrorValue.Failure</c> with an empty description.</param>
/// <param name="isEventConnection">Selects which connection handler runs on a connection-class code: the event one when <c>true</c>, the process one otherwise.</param>
/// <exception cref="QueryClassStartQueryException">The code is query-class; no connection handler was invoked.</exception>
/// <exception cref="InvalidOperationException">The code is connection-class; the matching connection handler was invoked first.</exception>
private void HandleStartQueryFailure(
    string operation, HistorianAccessError error, bool isEventConnection)
{
    var code = error?.ErrorCode ?? HistorianAccessError.ErrorValue.Failure;
    var description = error?.ErrorDescription ?? string.Empty;
    var resetConnection = IsConnectionClassError(code);
    var kind = resetConnection ? "connection-class" : "query-class";

    Log.Warning(
        "Historian SDK StartQuery failed: {Operation} -> {Code} ({Desc}) [{Kind}]",
        operation, code, description, kind);
    RecordFailure($"{operation}: {code}");

    var message = $"Historian SDK StartQuery failed for {operation}: {code} ({description})";

    if (!resetConnection)
    {
        // Query-class — the callers' outer catch must NOT run a connection handler for this.
        throw new QueryClassStartQueryException(message, code);
    }

    if (isEventConnection)
    {
        HandleEventConnectionError();
    }
    else
    {
        HandleConnectionError();
    }

    // NOTE(review): the read paths' generic catch (Exception) also records a failure and
    // calls the connection-error handler when this exception propagates — confirm that
    // second pass is harmless.
    throw new InvalidOperationException(message);
}
private void HandleEventConnectionError(Exception? ex = null)
{
lock (_eventConnectionLock)
@@ -280,6 +388,11 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
{
var results = new List<HistorianSample>();
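// Driver.Historian.Wonderware-010: wire RequestTimeoutSeconds into the read path so
// a slow result loop can't hold the connection thread forever. The token is only
// polled between MoveNext calls; it cannot interrupt an SDK call that is itself
// blocked inside StartQuery or MoveNext.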
using var requestCts = BuildRequestCts(_config, ct);
var token = requestCts.Token;
try
{
EnsureConnected();
@@ -300,10 +413,8 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
if (!query.StartQuery(args, out var error))
{
Log.Warning("Historian SDK raw query start failed for {Tag}: {Error}", tagName, error.ErrorCode);
RecordFailure($"raw StartQuery: {error.ErrorCode}");
HandleConnectionError();
return Task.FromResult(results);
HandleStartQueryFailure(
$"raw query for tag '{tagName}'", error, isEventConnection: false);
}
var count = 0;
@@ -311,7 +422,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
while (query.MoveNext(out error))
{
ct.ThrowIfCancellationRequested();
token.ThrowIfCancellationRequested();
var result = query.QueryResult;
var timestamp = DateTime.SpecifyKind(result.StartDateTime, DateTimeKind.Utc);
@@ -332,11 +443,20 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
}
catch (OperationCanceledException) { throw; }
catch (ObjectDisposedException) { throw; }
catch (QueryClassStartQueryException)
{
// Query-class StartQuery failure — HandleStartQueryFailure already logged
// and recorded. Re-throw so the IPC layer surfaces Success=false instead of
// returning an empty list (which would look like "no data in range"). The
// connection is deliberately NOT reset. See Driver.Historian.Wonderware-008.
throw;
}
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead raw failed for {Tag}", tagName);
RecordFailure($"raw: {ex.Message}");
HandleConnectionError(ex);
throw;
}
Log.Debug("HistoryRead raw: {Tag} returned {Count} values ({Start} to {End})",
@@ -352,6 +472,10 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
{
var results = new List<HistorianAggregateSample>();
// Driver.Historian.Wonderware-010: outer safety timeout — see ReadRawAsync.
using var requestCts = BuildRequestCts(_config, ct);
var token = requestCts.Token;
try
{
EnsureConnected();
@@ -367,10 +491,8 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
if (!query.StartQuery(args, out var error))
{
Log.Warning("Historian SDK aggregate query start failed for {Tag}: {Error}", tagName, error.ErrorCode);
RecordFailure($"aggregate StartQuery: {error.ErrorCode}");
HandleConnectionError();
return Task.FromResult(results);
HandleStartQueryFailure(
$"aggregate query for tag '{tagName}'", error, isEventConnection: false);
}
// Apply the same bucket cap as the raw-read path so a wide time range with a
@@ -381,7 +503,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
while (query.MoveNext(out error))
{
ct.ThrowIfCancellationRequested();
token.ThrowIfCancellationRequested();
var result = query.QueryResult;
var timestamp = DateTime.SpecifyKind(result.StartDateTime, DateTimeKind.Utc);
@@ -408,11 +530,13 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
}
catch (OperationCanceledException) { throw; }
catch (ObjectDisposedException) { throw; }
catch (QueryClassStartQueryException) { throw; } // see ReadRawAsync — keep connection
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead aggregate failed for {Tag}", tagName);
RecordFailure($"aggregate: {ex.Message}");
HandleConnectionError(ex);
throw;
}
Log.Debug("HistoryRead aggregate ({Aggregate}): {Tag} returned {Count} values",
@@ -430,13 +554,17 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
if (timestamps == null || timestamps.Length == 0)
return Task.FromResult(results);
// Driver.Historian.Wonderware-010: outer safety timeout — see ReadRawAsync.
using var requestCts = BuildRequestCts(_config, ct);
var token = requestCts.Token;
try
{
EnsureConnected();
foreach (var timestamp in timestamps)
{
ct.ThrowIfCancellationRequested();
token.ThrowIfCancellationRequested();
using var query = _connection!.CreateHistoryQuery();
var args = new HistoryQueryArgs
@@ -490,6 +618,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
Log.Warning(ex, "HistoryRead at-time failed for {Tag}", tagName);
RecordFailure($"at-time: {ex.Message}");
HandleConnectionError(ex);
throw;
}
Log.Debug("HistoryRead at-time: {Tag} returned {Count} values for {Timestamps} timestamps",
@@ -504,6 +633,10 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
{
var results = new List<HistorianEventDto>();
// Driver.Historian.Wonderware-010: outer safety timeout — see ReadRawAsync.
using var requestCts = BuildRequestCts(_config, ct);
var token = requestCts.Token;
try
{
EnsureEventConnected();
@@ -525,16 +658,14 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
if (!query.StartQuery(args, out var error))
{
Log.Warning("Historian SDK event query start failed: {Error}", error.ErrorCode);
RecordFailure($"events StartQuery: {error.ErrorCode}");
HandleEventConnectionError();
return Task.FromResult(results);
HandleStartQueryFailure(
$"event query for source '{sourceName ?? "(all)"}'", error, isEventConnection: true);
}
var count = 0;
while (query.MoveNext(out error))
{
ct.ThrowIfCancellationRequested();
token.ThrowIfCancellationRequested();
results.Add(ToDto(query.QueryResult));
count++;
if (maxEvents > 0 && count >= maxEvents) break;
@@ -545,11 +676,13 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
}
catch (OperationCanceledException) { throw; }
catch (ObjectDisposedException) { throw; }
catch (QueryClassStartQueryException) { throw; } // see ReadRawAsync — keep connection
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead events failed for source {Source}", sourceName ?? "(all)");
RecordFailure($"events: {ex.Message}");
HandleEventConnectionError(ex);
throw;
}
Log.Debug("HistoryRead events: source={Source} returned {Count} events ({Start} to {End})",
@@ -593,11 +726,20 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
/// as a string; this is a known edge case of the SDK binding.
/// </para>
/// </summary>
private static object? SelectValue(HistoryQueryResult result)
internal static object? SelectValue(HistoryQueryResult result)
=> SelectValueFromPair(result.Value, result.StringValue);
/// <summary>
/// SDK-independent overload of the string-vs-numeric heuristic. Exposed so unit
/// tests can pin the logic without having to instantiate the SDK
/// <see cref="HistoryQueryResult"/> (whose internal property initialisers make
/// it impractical to fake). See Driver.Historian.Wonderware-012.
/// </summary>
internal static object? SelectValueFromPair(double value, string? stringValue)
{
if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)
return result.StringValue;
return result.Value;
if (!string.IsNullOrEmpty(stringValue) && value == 0)
return stringValue;
return value;
}
internal static double? ExtractAggregateValue(AnalogSummaryQueryResult result, string column)