From 1f29b215c86768b8f612678816534ef2f881cf77 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 23 May 2026 08:18:10 -0400 Subject: [PATCH] fix(driver-historian-wonderware): resolve Low code-review findings (Driver.Historian.Wonderware-004,005,007,008,010,011,012) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Driver.Historian.Wonderware-004: ToHistorianEvent synthesises a fresh Guid when the upstream EventId is unparseable and logs the substitution instead of writing the historian with Guid.Empty. - Driver.Historian.Wonderware-005: GetHealthSnapshot derives the connection-open booleans from the active-node fields so the snapshot is self-consistent without depending on the secondary lock. - Driver.Historian.Wonderware-007: SID-mismatch branch in PipeServer now sends a HelloAck { Accepted=false, RejectReason } so the client sees a symmetric rejection. - Driver.Historian.Wonderware-008: classify StartQuery failures — connection-class codes drop the connection, query-class codes throw QueryClassStartQueryException so the IPC layer surfaces Success=false. - Driver.Historian.Wonderware-010: RequestTimeoutSeconds now enforced via BuildRequestCts linked to the caller's CancellationToken. - Driver.Historian.Wonderware-011: refreshed XML docs to describe the current sidecar / named-pipe architecture (Galaxy.Host / Proxy references reframed as historical context). - Driver.Historian.Wonderware-012: pinned the previously-uncovered HistorianDataSource behaviours with five new test files; also removed the stale empty tests/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests directory. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Driver.Historian.Wonderware/findings.md | 30 +-- .../Backend/HistorianConfiguration.cs | 9 +- .../Backend/HistorianDataSource.cs | 188 +++++++++++++++--- .../Backend/HistorianSample.cs | 11 +- .../Backend/IHistorianDataSource.cs | 8 +- .../Backend/SdkAlarmHistorianWriteBackend.cs | 13 ++ .../Ipc/PipeServer.cs | 32 ++- ...HistorianDataSourceConnectFailoverTests.cs | 142 +++++++++++++ .../HistorianDataSourceHealthSnapshotTests.cs | 114 +++++++++++ .../HistorianDataSourceRequestTimeoutTests.cs | 109 ++++++++++ ...DataSourceStartQueryClassificationTests.cs | 50 +++++ ...storianDataSourceValueAndAggregateTests.cs | 122 ++++++++++++ .../SdkAlarmHistorianWriteBackendTests.cs | 52 +++++ .../Ipc/PipeServerSidRejectTests.cs | 83 ++++++++ 14 files changed, 910 insertions(+), 53 deletions(-) create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceConnectFailoverTests.cs create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceHealthSnapshotTests.cs create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceRequestTimeoutTests.cs create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceStartQueryClassificationTests.cs create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceValueAndAggregateTests.cs create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Ipc/PipeServerSidRejectTests.cs diff --git a/code-reviews/Driver.Historian.Wonderware/findings.md b/code-reviews/Driver.Historian.Wonderware/findings.md index 38be375..8473b5e 100644 --- a/code-reviews/Driver.Historian.Wonderware/findings.md +++ b/code-reviews/Driver.Historian.Wonderware/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-05-22 | | Commit reviewed | `76d35d1` | | Status | Reviewed | -| Open findings | 7 | +| Open findings | 0 | ## Checklist coverage @@ -115,7 +115,7 @@ analog/integer tags. | Severity | Low | | Category | Correctness and logic bugs | | Location | `Backend/SdkAlarmHistorianWriteBackend.cs:198-201` | -| Status | Open | +| Status | Resolved | **Description:** `ToHistorianEvent` only assigns `historianEvent.Id` when `Guid.TryParse(dto.EventId, ...)` succeeds. If `EventId` is not a parseable GUID @@ -128,7 +128,7 @@ The non-parseable case is never logged. the event as `PermanentFail` (malformed input) or synthesize a fresh `Guid.NewGuid()` so each event still gets a unique id. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — `ToHistorianEvent` now synthesizes a fresh `Guid.NewGuid()` when the dto's `EventId` fails `Guid.TryParse`, and logs a warning carrying both the original (unparseable) id and the synthesized id so collisions stop happening silently. Regression tests `ToHistorianEvent_parseable_event_id_is_used_verbatim` and `ToHistorianEvent_unparseable_event_id_synthesizes_unique_non_empty_Guid` in `SdkAlarmHistorianWriteBackendTests`. ### Driver.Historian.Wonderware-005 @@ -137,7 +137,7 @@ the event as `PermanentFail` (malformed input) or synthesize a fresh | Severity | Low | | Category | Concurrency and thread safety | | Location | `Backend/HistorianDataSource.cs:124`, `:126-127` | -| Status | Open | +| Status | Resolved | **Description:** `GetHealthSnapshot` reads `_activeProcessNode` and `_activeEventNode` inside `_healthLock`, but those two fields are written under @@ -152,7 +152,7 @@ a momentarily inconsistent health snapshot. `_healthLock` on every connection state change, or read them under the connection lock), so the snapshot is internally consistent. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — `GetHealthSnapshot` now derives the `ProcessConnectionOpen` / `EventConnectionOpen` booleans from the active-node strings (`_activeProcessNode != null` / `_activeEventNode != null`) which all live under `_healthLock`, instead of reading `_connection`/`_eventConnection` via `Volatile.Read` outside the lock those fields are published under. The snapshot is now self-consistent by construction: open ↔ active node populated. Regression tests in `HistorianDataSourceHealthSnapshotTests` cover the three half-published states plus the steady-state cases. ### Driver.Historian.Wonderware-006 @@ -184,7 +184,7 @@ restart the sidecar cleanly. | Severity | Low | | Category | Error handling and resilience | | Location | `Ipc/PipeServer.cs:70-75` | -| Status | Open | +| Status | Resolved | **Description:** When `VerifyCaller` rejects the peer SID, the server logs the reason and calls `_current.Disconnect()` with no `HelloAck` frame sent. The @@ -198,7 +198,7 @@ harder to test from the client. `caller-sid-mismatch` reject reason before disconnecting, consistent with the other two rejection paths. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — the SID rejection path now writes a `HelloAck { Accepted=false, RejectReason="caller-sid-mismatch: ..." }` before disconnecting, symmetric with the shared-secret-mismatch and major-version-mismatch paths. The caller-verification function was also extracted into a `CallerVerifier` delegate so tests can override it (the pipe ACL would otherwise block the test client itself). End-to-end regression `PipeServerSidRejectTests.Caller_SID_mismatch_sends_HelloAck_with_reject_reason_before_disconnect` connects a real named-pipe client and asserts the rejecting ack frame arrives. ### Driver.Historian.Wonderware-008 @@ -207,7 +207,7 @@ other two rejection paths. | Severity | Low | | Category | Error handling and resilience | | Location | `Backend/HistorianDataSource.cs:301-307`, `:374-380` | -| Status | Open | +| Status | Resolved | **Description:** When `query.StartQuery` returns `false`, `ReadRawAsync` and `ReadAggregateAsync` call `HandleConnectionError()` and return an empty result @@ -226,7 +226,7 @@ connection intact, surface the error). Consider returning a failed reply (`Success = false`) for query-class `StartQuery` failures so the client does not treat an SDK error as an empty history. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — extracted a static `ConnectionErrorCodes` set + `IsConnectionClassError` classifier (mirroring the alarm-write side) and centralised the failure handling in a new `HandleStartQueryFailure` helper. Connection-class codes still drop the connection and mark the node failed; query-class codes throw a new `QueryClassStartQueryException` that the outer catch re-throws WITHOUT touching the connection. All four read paths (raw / aggregate / at-time / events) also re-throw caught exceptions so the IPC frame handler surfaces `Success=false` instead of returning an empty list with `Success=true`. Regression tests `HistorianDataSourceStartQueryClassificationTests` pin the connection-class vs query-class classification per error code; the connect-failover suite (`HistorianDataSourceConnectFailoverTests`) verifies the read paths now propagate the exception. ### Driver.Historian.Wonderware-009 @@ -261,7 +261,7 @@ cap with an explicit error reply rather than letting `WriteAsync` throw. | Severity | Low | | Category | Performance and resource management | | Location | `Backend/HistorianConfiguration.cs:32-36`, `Backend/HistorianDataSource.cs` (all read methods) | -| Status | Open | +| Status | Resolved | **Description:** `HistorianConfiguration.RequestTimeoutSeconds` is documented as the "outer safety timeout applied to sync-over-async Historian operations" and is @@ -278,7 +278,7 @@ timeout, but the query path does not). The documented safety net does not exist. worker with a bounded wait), or remove the property and its XML doc so the code does not advertise a guarantee it does not provide. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — added an internal `BuildRequestCts` helper that returns a `CancellationTokenSource` linked into the caller's `ct` with `CancelAfter(RequestTimeoutSeconds)` applied when positive. Each read method (`ReadRawAsync`, `ReadAggregateAsync`, `ReadAtTimeAsync`, `ReadEventsAsync`) now wraps its work with the linked CTS and feeds the linked token into the `ThrowIfCancellationRequested` checks between `MoveNext` iterations, so a hung SDK call cancels at the configured deadline instead of blocking the connection thread indefinitely. Regression tests `HistorianDataSourceRequestTimeoutTests` pin the helper: positive value enforces `CancelAfter`, zero/negative means no timeout, caller cancellation propagates, default is 60s. ### Driver.Historian.Wonderware-011 @@ -287,7 +287,7 @@ does not advertise a guarantee it does not provide. | Severity | Low | | Category | Design-document adherence | | Location | `Backend/HistorianDataSource.cs:9-12`, `Backend/IHistorianDataSource.cs:9-11`, `Backend/HistorianSample.cs:7-9`, `Backend/HistorianConfiguration.cs:7-9` | -| Status | Open | +| Status | Resolved | **Description:** Several XML doc comments reference the retired v1 architecture as if it were current: "inside Galaxy.Host", "the Proxy maps returned samples", "the @@ -303,7 +303,7 @@ review checklist. architecture (sidecar talking to `WonderwareHistorianClient` over the named pipe), dropping the `Galaxy.Host` / `Proxy` / `GalaxyDataValue` references. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — refreshed the XML doc comments on `HistorianDataSource`, `IHistorianDataSource`, `HistorianSample` / `HistorianAggregateSample`, and `HistorianConfiguration` to describe the current sidecar / named-pipe / .NET 10 `WonderwareHistorianClient` architecture. References to `Galaxy.Host` / `Galaxy.Proxy` / `GalaxyDataValue` are now framed as historical context tied to the PR 7.2 retirement rather than as current behaviour. ### Driver.Historian.Wonderware-012 @@ -312,7 +312,7 @@ dropping the `Galaxy.Host` / `Proxy` / `GalaxyDataValue` references. | Severity | Low | | Category | Testing coverage | | Location | `Backend/HistorianDataSource.cs`, `tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/` | -| Status | Open | +| Status | Resolved | **Description:** The unit-test suite covers `HistorianQualityMapper`, `HistorianClusterEndpointPicker`, `SdkAlarmHistorianWriteBackend`, @@ -334,4 +334,4 @@ removed to avoid confusion. cancellation, and the value-type selection — and delete the stale empty `tests/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/` directory. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — added four new `HistorianDataSource`-targeted test files: `HistorianDataSourceHealthSnapshotTests` (snapshot consistency under half-published state, see also -005), `HistorianDataSourceStartQueryClassificationTests` (connection-class vs query-class error-code table, see also -008), `HistorianDataSourceRequestTimeoutTests` (the request-timeout helper, see also -010), `HistorianDataSourceConnectFailoverTests` (cluster failover order + cooldown via the `IHistorianConnectionFactory` fake), and `HistorianDataSourceValueAndAggregateTests` (the string-vs-numeric heuristic via the new SDK-independent `SelectValueFromPair` overload + the `ExtractAggregateValue` column dispatch). Stale empty `tests/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/` directory deleted. Unit count rose from 80 to 125 (+45 new tests). diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianConfiguration.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianConfiguration.cs index d2e129e..19ba358 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianConfiguration.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianConfiguration.cs @@ -3,9 +3,12 @@ using System.Collections.Generic; namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend { /// - /// Wonderware Historian SDK configuration. Populated from environment variables at Host - /// startup (see Program.cs) or from the Proxy's DriverInstance.DriverConfig - /// section passed during OpenSession. Kept OPC-UA-free — the Proxy side owns UA translation. + /// Wonderware Historian SDK configuration. Populated from environment variables at + /// sidecar startup (see Program.cs): the supervisor (lmxopcua-side + /// WonderwareHistorianClient) spawns the sidecar with these env vars; UA + /// translation lives on the client side of the named-pipe IPC, so this surface is + /// kept OPC-UA-free. The legacy v1 Galaxy.Host / Proxy host this lived in retired + /// in PR 7.2. /// public sealed class HistorianConfiguration { diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianDataSource.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianDataSource.cs index 4e0def7..38a5689 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianDataSource.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianDataSource.cs @@ -11,7 +11,10 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend /// /// Reads historical data from the Wonderware Historian via the aahClientManaged SDK. /// OPC-UA-free — emits / - /// which the Proxy maps to OPC UA DataValue on its side of the IPC. + /// which the sidecar serialises onto the named-pipe wire (PR 3.3 contracts) for the + /// .NET 10 WonderwareHistorianClient to translate into OPC UA DataValue + /// on its side of the IPC. The v1 Galaxy.Host / Proxy architecture this class + /// originally lived in retired in PR 7.2. /// public sealed class HistorianDataSource : IHistorianDataSource { @@ -50,6 +53,51 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend _picker = picker ?? new HistorianClusterEndpointPicker(config); } + // Error codes that signify the connection or server is the problem rather than the + // query itself. A query-class failure (bad tag name, unsupported aggregate, etc.) must + // not force us to tear down and re-open the (relatively expensive) historian + // connection — that would let a burst of bad-tag queries push an otherwise healthy + // cluster node into cooldown. See Driver.Historian.Wonderware-008. + private static readonly HashSet ConnectionErrorCodes = + new HashSet + { + HistorianAccessError.ErrorValue.FailedToConnect, + HistorianAccessError.ErrorValue.FailedToCreateSession, + HistorianAccessError.ErrorValue.NoReply, + HistorianAccessError.ErrorValue.NotReady, + HistorianAccessError.ErrorValue.NotInitialized, + HistorianAccessError.ErrorValue.Stopping, + HistorianAccessError.ErrorValue.Win32Exception, + HistorianAccessError.ErrorValue.InvalidResponse, + }; + + /// + /// Whether an aahClientManaged error code indicates that the + /// connection (rather than the query payload) is the problem and the + /// shared SDK connection should therefore be reset. Internal for unit testing. + /// + internal static bool IsConnectionClassError(HistorianAccessError.ErrorValue code) + => ConnectionErrorCodes.Contains(code); + + /// + /// Builds the per-read linked into the + /// caller's and pre-wired to fire after + /// if positive. The + /// read paths use the resulting token in their ThrowIfCancellationRequested + /// checks so a hung StartQuery or slow MoveNext cannot block the + /// single pipe-server connection thread indefinitely. See + /// Driver.Historian.Wonderware-010. + /// + internal static CancellationTokenSource BuildRequestCts(HistorianConfiguration cfg, CancellationToken ct) + { + var cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + if (cfg.RequestTimeoutSeconds > 0) + { + cts.CancelAfter(TimeSpan.FromSeconds(cfg.RequestTimeoutSeconds)); + } + return cts; + } + private (HistorianAccess Connection, string Node) ConnectToAnyHealthyNode(HistorianConnectionType type) { var candidates = _picker.GetHealthyNodes(); @@ -110,6 +158,13 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend foreach (var n in nodeStates) if (n.IsHealthy) healthyCount++; + // Driver.Historian.Wonderware-005: derive the connection-open booleans from the + // active-node strings, both of which live under _healthLock. _connection itself + // is published under _connectionLock — reading it here under a different lock + // could produce an internally inconsistent snapshot (open with no node, or + // closed with a non-null node) at the publish/clear boundary. Treating the + // active-node strings as the single source of truth makes the snapshot + // self-consistent by construction. lock (_healthLock) { return new HistorianHealthSnapshot @@ -121,8 +176,8 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend LastSuccessTime = _lastSuccessTime, LastFailureTime = _lastFailureTime, LastError = _lastError, - ProcessConnectionOpen = Volatile.Read(ref _connection) != null, - EventConnectionOpen = Volatile.Read(ref _eventConnection) != null, + ProcessConnectionOpen = _activeProcessNode != null, + EventConnectionOpen = _activeEventNode != null, ActiveProcessNode = _activeProcessNode, ActiveEventNode = _activeEventNode, NodeCount = nodeStates.Count, @@ -245,6 +300,59 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend } } + /// + /// Internal exception signalling that StartQuery returned an SDK error + /// whose code is query-class (bad tag name, unsupported aggregate, etc.) + /// and the shared SDK connection therefore must NOT be reset. The outer catch + /// re-throws this so the IPC frame handler surfaces Success=false without + /// touching the connection. See Driver.Historian.Wonderware-008. + /// + internal sealed class QueryClassStartQueryException : InvalidOperationException + { + public HistorianAccessError.ErrorValue Code { get; } + public QueryClassStartQueryException(string message, HistorianAccessError.ErrorValue code) + : base(message) + { + Code = code; + } + } + + /// + /// Centralised StartQuery-failure handler. Throws so the caller surfaces + /// Success=false in the IPC reply (the previous return-empty-with-success + /// behaviour made an SDK error look like "no data in range" to the client). The + /// connection is only reset when the error code is connection-class — + /// query-class failures (bad tag name, unsupported aggregate, etc.) must leave + /// the shared SDK connection intact, otherwise a burst of bad-tag queries cycles + /// the connection and pushes a healthy cluster node into cooldown. + /// See Driver.Historian.Wonderware-008. + /// + private void HandleStartQueryFailure( + string operation, HistorianAccessError error, bool isEventConnection) + { + var code = error?.ErrorCode ?? HistorianAccessError.ErrorValue.Failure; + var description = error?.ErrorDescription ?? string.Empty; + var connectionClass = IsConnectionClassError(code); + + Log.Warning( + "Historian SDK StartQuery failed: {Operation} -> {Code} ({Desc}) [{Kind}]", + operation, code, description, + connectionClass ? "connection-class" : "query-class"); + RecordFailure($"{operation}: {code}"); + + var message = $"Historian SDK StartQuery failed for {operation}: {code} ({description})"; + + if (connectionClass) + { + if (isEventConnection) HandleEventConnectionError(); + else HandleConnectionError(); + throw new InvalidOperationException(message); + } + + // Query-class — the outer catch block must NOT call HandleConnectionError on this. + throw new QueryClassStartQueryException(message, code); + } + private void HandleEventConnectionError(Exception? ex = null) { lock (_eventConnectionLock) @@ -280,6 +388,11 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend { var results = new List(); + // Driver.Historian.Wonderware-010: wire RequestTimeoutSeconds into the read path + // so a hung StartQuery / slow MoveNext can't block the connection thread forever. + using var requestCts = BuildRequestCts(_config, ct); + var token = requestCts.Token; + try { EnsureConnected(); @@ -300,10 +413,8 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend if (!query.StartQuery(args, out var error)) { - Log.Warning("Historian SDK raw query start failed for {Tag}: {Error}", tagName, error.ErrorCode); - RecordFailure($"raw StartQuery: {error.ErrorCode}"); - HandleConnectionError(); - return Task.FromResult(results); + HandleStartQueryFailure( + $"raw query for tag '{tagName}'", error, isEventConnection: false); } var count = 0; @@ -311,7 +422,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend while (query.MoveNext(out error)) { - ct.ThrowIfCancellationRequested(); + token.ThrowIfCancellationRequested(); var result = query.QueryResult; var timestamp = DateTime.SpecifyKind(result.StartDateTime, DateTimeKind.Utc); @@ -332,11 +443,20 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend } catch (OperationCanceledException) { throw; } catch (ObjectDisposedException) { throw; } + catch (QueryClassStartQueryException) + { + // Query-class StartQuery failure — HandleStartQueryFailure already logged + // and recorded. Re-throw so the IPC layer surfaces Success=false instead of + // returning an empty list (which would look like "no data in range"). The + // connection is deliberately NOT reset. See Driver.Historian.Wonderware-008. + throw; + } catch (Exception ex) { Log.Warning(ex, "HistoryRead raw failed for {Tag}", tagName); RecordFailure($"raw: {ex.Message}"); HandleConnectionError(ex); + throw; } Log.Debug("HistoryRead raw: {Tag} returned {Count} values ({Start} to {End})", @@ -352,6 +472,10 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend { var results = new List(); + // Driver.Historian.Wonderware-010: outer safety timeout — see ReadRawAsync. + using var requestCts = BuildRequestCts(_config, ct); + var token = requestCts.Token; + try { EnsureConnected(); @@ -367,10 +491,8 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend if (!query.StartQuery(args, out var error)) { - Log.Warning("Historian SDK aggregate query start failed for {Tag}: {Error}", tagName, error.ErrorCode); - RecordFailure($"aggregate StartQuery: {error.ErrorCode}"); - HandleConnectionError(); - return Task.FromResult(results); + HandleStartQueryFailure( + $"aggregate query for tag '{tagName}'", error, isEventConnection: false); } // Apply the same bucket cap as the raw-read path so a wide time range with a @@ -381,7 +503,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend while (query.MoveNext(out error)) { - ct.ThrowIfCancellationRequested(); + token.ThrowIfCancellationRequested(); var result = query.QueryResult; var timestamp = DateTime.SpecifyKind(result.StartDateTime, DateTimeKind.Utc); @@ -408,11 +530,13 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend } catch (OperationCanceledException) { throw; } catch (ObjectDisposedException) { throw; } + catch (QueryClassStartQueryException) { throw; } // see ReadRawAsync — keep connection catch (Exception ex) { Log.Warning(ex, "HistoryRead aggregate failed for {Tag}", tagName); RecordFailure($"aggregate: {ex.Message}"); HandleConnectionError(ex); + throw; } Log.Debug("HistoryRead aggregate ({Aggregate}): {Tag} returned {Count} values", @@ -430,13 +554,17 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend if (timestamps == null || timestamps.Length == 0) return Task.FromResult(results); + // Driver.Historian.Wonderware-010: outer safety timeout — see ReadRawAsync. + using var requestCts = BuildRequestCts(_config, ct); + var token = requestCts.Token; + try { EnsureConnected(); foreach (var timestamp in timestamps) { - ct.ThrowIfCancellationRequested(); + token.ThrowIfCancellationRequested(); using var query = _connection!.CreateHistoryQuery(); var args = new HistoryQueryArgs @@ -490,6 +618,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend Log.Warning(ex, "HistoryRead at-time failed for {Tag}", tagName); RecordFailure($"at-time: {ex.Message}"); HandleConnectionError(ex); + throw; } Log.Debug("HistoryRead at-time: {Tag} returned {Count} values for {Timestamps} timestamps", @@ -504,6 +633,10 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend { var results = new List(); + // Driver.Historian.Wonderware-010: outer safety timeout — see ReadRawAsync. + using var requestCts = BuildRequestCts(_config, ct); + var token = requestCts.Token; + try { EnsureEventConnected(); @@ -525,16 +658,14 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend if (!query.StartQuery(args, out var error)) { - Log.Warning("Historian SDK event query start failed: {Error}", error.ErrorCode); - RecordFailure($"events StartQuery: {error.ErrorCode}"); - HandleEventConnectionError(); - return Task.FromResult(results); + HandleStartQueryFailure( + $"event query for source '{sourceName ?? "(all)"}'", error, isEventConnection: true); } var count = 0; while (query.MoveNext(out error)) { - ct.ThrowIfCancellationRequested(); + token.ThrowIfCancellationRequested(); results.Add(ToDto(query.QueryResult)); count++; if (maxEvents > 0 && count >= maxEvents) break; @@ -545,11 +676,13 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend } catch (OperationCanceledException) { throw; } catch (ObjectDisposedException) { throw; } + catch (QueryClassStartQueryException) { throw; } // see ReadRawAsync — keep connection catch (Exception ex) { Log.Warning(ex, "HistoryRead events failed for source {Source}", sourceName ?? "(all)"); RecordFailure($"events: {ex.Message}"); HandleEventConnectionError(ex); + throw; } Log.Debug("HistoryRead events: source={Source} returned {Count} events ({Start} to {End})", @@ -593,11 +726,20 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend /// as a string; this is a known edge case of the SDK binding. /// /// - private static object? SelectValue(HistoryQueryResult result) + internal static object? SelectValue(HistoryQueryResult result) + => SelectValueFromPair(result.Value, result.StringValue); + + /// + /// SDK-independent overload of the string-vs-numeric heuristic. Exposed so unit + /// tests can pin the logic without having to instantiate the SDK + /// (whose internal property initialisers make + /// it impractical to fake). See Driver.Historian.Wonderware-012. + /// + internal static object? SelectValueFromPair(double value, string? stringValue) { - if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0) - return result.StringValue; - return result.Value; + if (!string.IsNullOrEmpty(stringValue) && value == 0) + return stringValue; + return value; } internal static double? ExtractAggregateValue(AnalogSummaryQueryResult result, string column) diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianSample.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianSample.cs index b7b55cc..f04f92f 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianSample.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/HistorianSample.cs @@ -3,10 +3,11 @@ using System; namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend { /// - /// OPC-UA-free representation of a single historical data point. The Host returns these - /// across the IPC boundary as GalaxyDataValue; the Proxy maps quality and value to - /// OPC UA DataValue. Raw MX quality byte is preserved so the Proxy can use the same - /// quality mapper it already uses for live reads. + /// OPC-UA-free representation of a single historical data point. The sidecar serialises + /// these onto the named-pipe wire (HistorianSampleDto) for the .NET 10 + /// WonderwareHistorianClient, which maps quality and value into OPC UA + /// DataValue on its side. Raw OPC DA quality byte is preserved so the client + /// can reuse the same quality mapper it already uses for live reads. /// public sealed class HistorianSample { @@ -20,7 +21,7 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend /// /// Result of . When is - /// null the aggregate is unavailable for that bucket (Proxy maps to BadNoData). + /// null the aggregate is unavailable for that bucket — the client maps to BadNoData. /// public sealed class HistorianAggregateSample { diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/IHistorianDataSource.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/IHistorianDataSource.cs index ac73cf6..7694dec 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/IHistorianDataSource.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/IHistorianDataSource.cs @@ -6,9 +6,11 @@ using System.Threading.Tasks; namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend { /// - /// OPC-UA-free surface for the Wonderware Historian subsystem inside Galaxy.Host. - /// Implementations read via the aahClient* SDK; the Proxy side maps returned samples - /// to OPC UA DataValue. + /// OPC-UA-free surface for the Wonderware Historian subsystem inside the historian + /// sidecar process. Implementations read via the aahClient* SDK; the .NET 10 + /// WonderwareHistorianClient on the other side of the named-pipe IPC maps + /// returned samples to OPC UA DataValue. The v1 Galaxy.Host / Proxy hosts + /// this lived in retired in PR 7.2. /// public interface IHistorianDataSource : IDisposable { diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/SdkAlarmHistorianWriteBackend.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/SdkAlarmHistorianWriteBackend.cs index 738a82a..a2e0081 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/SdkAlarmHistorianWriteBackend.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Backend/SdkAlarmHistorianWriteBackend.cs @@ -205,6 +205,19 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend { historianEvent.Id = id; } + else + { + // Driver.Historian.Wonderware-004: an unparseable / empty EventId previously + // left Id as Guid.Empty, which made every such alarm collide on the same id + // with no diagnostic. Synthesize a fresh Guid so each event still gets a + // unique identifier (the historian still accepts the write — outcome stays + // Ack — and the sender can correlate the synthesized id via the warning log). + var synthesized = Guid.NewGuid(); + Log.Warning( + "Alarm historian event has non-parseable EventId {EventId} for source {Source}; synthesizing Id={SynthesizedId}", + dto.EventId ?? "(null)", dto.SourceName ?? "(none)", synthesized); + historianEvent.Id = synthesized; + } #pragma warning restore CS0618 if (!string.IsNullOrEmpty(dto.AckComment)) diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs index d20fe49..fa61a1a 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs @@ -21,16 +21,33 @@ public sealed class PipeServer : IDisposable private readonly string _sharedSecret; private readonly ILogger _logger; private readonly CancellationTokenSource _cts = new(); + private readonly CallerVerifier _verifier; private NamedPipeServerStream? _current; + /// + /// Pluggable caller-verification seam. Default implementation calls + /// ; tests can substitute one that ignores the pipe ACL + /// to exercise the rejection paths. + /// + internal delegate bool CallerVerifier(NamedPipeServerStream pipe, SecurityIdentifier allowedSid, out string reason); + public PipeServer(string pipeName, SecurityIdentifier allowedSid, string sharedSecret, ILogger logger) + : this(pipeName, allowedSid, sharedSecret, logger, DefaultVerifier) { } + + internal PipeServer( + string pipeName, SecurityIdentifier allowedSid, string sharedSecret, ILogger logger, + CallerVerifier verifier) { _pipeName = pipeName ?? throw new ArgumentNullException(nameof(pipeName)); _allowedSid = allowedSid ?? throw new ArgumentNullException(nameof(allowedSid)); _sharedSecret = sharedSecret ?? throw new ArgumentNullException(nameof(sharedSecret)); _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _verifier = verifier ?? throw new ArgumentNullException(nameof(verifier)); } + private static bool DefaultVerifier(NamedPipeServerStream pipe, SecurityIdentifier allowedSid, out string reason) + => VerifyCaller(pipe, allowedSid, out reason); + /// /// Accepts one connection, performs Hello handshake, then dispatches frames to /// until EOF or cancel. Returns when the client disconnects. @@ -67,8 +84,15 @@ public sealed class PipeServer : IDisposable return; } - if (!VerifyCaller(_current, out var reason)) + if (!_verifier(_current, _allowedSid, out var reason)) { + // Driver.Historian.Wonderware-007: send a rejecting HelloAck so the client + // learns why instead of having to wait for its own read timeout. The reason + // tag "caller-sid-mismatch" is symmetric with the shared-secret-mismatch and + // major-version-mismatch acks the two other rejection paths emit below. + await writer.WriteAsync(MessageKind.HelloAck, + new HelloAck { Accepted = false, RejectReason = $"caller-sid-mismatch: {reason}" }, + linked.Token).ConfigureAwait(false); _logger.Warning("Sidecar IPC caller rejected: {Reason}", reason); _current.Disconnect(); return; @@ -172,7 +196,7 @@ public sealed class PipeServer : IDisposable } } - private bool VerifyCaller(NamedPipeServerStream pipe, out string reason) + private static bool VerifyCaller(NamedPipeServerStream pipe, SecurityIdentifier allowedSid, out string reason) { try { @@ -181,9 +205,9 @@ public sealed class PipeServer : IDisposable using var wi = WindowsIdentity.GetCurrent(); if (wi.User is null) throw new InvalidOperationException("GetCurrent().User is null — cannot verify caller"); - if (wi.User != _allowedSid) + if (wi.User != allowedSid) throw new UnauthorizedAccessException( - $"caller SID {wi.User.Value} does not match allowed {_allowedSid.Value}"); + $"caller SID {wi.User.Value} does not match allowed {allowedSid.Value}"); }); reason = string.Empty; return true; diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceConnectFailoverTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceConnectFailoverTests.cs new file mode 100644 index 0000000..139a7cf --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceConnectFailoverTests.cs @@ -0,0 +1,142 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using ArchestrA; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests.Backend; + +/// +/// Driver.Historian.Wonderware-012 coverage — pins 's +/// connect-failover / cooldown loop via a fake . +/// A live is never instantiated; the fake throws on every +/// attempt so the read path surfaces the connect failure without touching the SDK. +/// +[Trait("Category", "Unit")] +public sealed class HistorianDataSourceConnectFailoverTests +{ + [Fact] + public async Task ReadRaw_when_no_nodes_are_healthy_throws_so_IPC_surfaces_Success_false() + { + var cfg = new HistorianConfiguration + { + Enabled = true, + ServerNames = new List { "node-a" }, + FailureCooldownSeconds = 60, + // Disable the outer request timeout so the test doesn't race the connect failure + // against the timeout (we want the connect failure path, not a TimeoutException). + RequestTimeoutSeconds = 0, + }; + var ds = new HistorianDataSource(cfg, new ThrowingConnectionFactory()); + + // Read methods used to swallow the connect exception and return an empty list with + // Success=true; the fix re-throws so the IPC layer surfaces Success=false. The + // exception must therefore propagate. + await Should.ThrowAsync(() => ds.ReadRawAsync( + "Tank.Level", + new DateTime(2026, 5, 1, 0, 0, 0, DateTimeKind.Utc), + new DateTime(2026, 5, 1, 0, 1, 0, DateTimeKind.Utc), + maxValues: 100, + CancellationToken.None)); + } + + [Fact] + public async Task ReadRaw_tries_each_cluster_node_in_order_until_one_succeeds_or_all_fail() + { + var cfg = new HistorianConfiguration + { + Enabled = true, + ServerNames = new List { "node-a", "node-b", "node-c" }, + FailureCooldownSeconds = 60, + RequestTimeoutSeconds = 0, + }; + var factory = new TrackingThrowingConnectionFactory(); + var ds = new HistorianDataSource(cfg, factory); + + await Should.ThrowAsync(() => ds.ReadRawAsync( + "Tank.Level", + new DateTime(2026, 5, 1, 0, 0, 0, DateTimeKind.Utc), + new DateTime(2026, 5, 1, 0, 1, 0, DateTimeKind.Utc), + maxValues: 100, + CancellationToken.None)); + + // All three candidates must be attempted in the configured order before the + // connect-loop gives up. + factory.AttemptedNodes.ShouldBe(new[] { "node-a", "node-b", "node-c" }); + } + + [Fact] + public async Task ReadRaw_marks_failed_nodes_in_cooldown_so_a_subsequent_call_sees_no_healthy_nodes() + { + var cfg = new HistorianConfiguration + { + Enabled = true, + ServerNames = new List { "node-a", "node-b" }, + FailureCooldownSeconds = 60, + RequestTimeoutSeconds = 0, + }; + var ds = new HistorianDataSource(cfg, new ThrowingConnectionFactory()); + + await Should.ThrowAsync(() => ds.ReadRawAsync( + "Tank.Level", + DateTime.UtcNow.AddMinutes(-1), DateTime.UtcNow, + maxValues: 100, CancellationToken.None)); + + var snap = ds.GetHealthSnapshot(); + snap.NodeCount.ShouldBe(2); + snap.HealthyNodeCount.ShouldBe(0, "both nodes failed and entered cooldown after the connect attempts"); + snap.ProcessConnectionOpen.ShouldBeFalse(); + snap.ActiveProcessNode.ShouldBeNull(); + } + + [Fact] + public async Task ReadEvents_uses_a_separate_event_connection_path() + { + // ReadEventsAsync uses _eventConnection / EnsureEventConnected — a different + // codepath than ReadRawAsync. Symmetric test to pin the dual-connection design. + var cfg = new HistorianConfiguration + { + Enabled = true, + ServerNames = new List { "node-a" }, + FailureCooldownSeconds = 60, + RequestTimeoutSeconds = 0, + }; + var factory = new TrackingThrowingConnectionFactory(); + var ds = new HistorianDataSource(cfg, factory); + + await Should.ThrowAsync(() => ds.ReadEventsAsync( + sourceName: "Tank.HiHi", + DateTime.UtcNow.AddMinutes(-1), DateTime.UtcNow, + maxEvents: 100, CancellationToken.None)); + + factory.AttemptedTypes.ShouldContain(HistorianConnectionType.Event, + "event reads must open an Event-typed connection"); + factory.AttemptedNodes.ShouldBe(new[] { "node-a" }); + } + + // ── helpers ────────────────────────────────────────────────────────── + + private sealed class ThrowingConnectionFactory : IHistorianConnectionFactory + { + public HistorianAccess CreateAndConnect( + HistorianConfiguration config, HistorianConnectionType type, bool readOnly = true) + => throw new InvalidOperationException($"simulated connect failure to {config.ServerName}"); + } + + private sealed class TrackingThrowingConnectionFactory : IHistorianConnectionFactory + { + public List AttemptedNodes { get; } = new(); + public List AttemptedTypes { get; } = new(); + + public HistorianAccess CreateAndConnect( + HistorianConfiguration config, HistorianConnectionType type, bool readOnly = true) + { + AttemptedNodes.Add(config.ServerName); + AttemptedTypes.Add(type); + throw new InvalidOperationException($"simulated connect failure to {config.ServerName}"); + } + } +} diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceHealthSnapshotTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceHealthSnapshotTests.cs new file mode 100644 index 0000000..bc98f88 --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceHealthSnapshotTests.cs @@ -0,0 +1,114 @@ +using System; +using System.Reflection; +using ArchestrA; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests.Backend; + +/// +/// Driver.Historian.Wonderware-005 regression tests for . +/// The active-node strings and the connection-open booleans were published under different +/// locks, so a snapshot could observe an internally inconsistent pairing (open with no node, +/// or closed with a non-null node). The fix derives the open booleans from the same field +/// that is published under the same lock so the snapshot is self-consistent by construction. +/// +[Trait("Category", "Unit")] +public sealed class HistorianDataSourceHealthSnapshotTests +{ + /// + /// Drives the "half-published" state directly via reflection: set _connection + /// to a non-null sentinel but leave _activeProcessNode null. The snapshot must + /// report ProcessConnectionOpen = false and ActiveProcessNode = null + /// consistently — never a mismatch. + /// + [Fact] + public void Snapshot_with_connection_set_but_active_node_null_is_consistent() + { + var ds = new HistorianDataSource( + new HistorianConfiguration { Enabled = true, ServerName = "h1" }); + + SetField(ds, "_connection", new HistorianAccess()); + SetField(ds, "_activeProcessNode", (string?)null); + + var snap = ds.GetHealthSnapshot(); + (snap.ProcessConnectionOpen == (snap.ActiveProcessNode != null)).ShouldBeTrue( + "snapshot must not advertise open with no node — picks one source of truth"); + } + + /// + /// Symmetric case for the event connection. + /// + [Fact] + public void Snapshot_with_event_connection_set_but_active_node_null_is_consistent() + { + var ds = new HistorianDataSource( + new HistorianConfiguration { Enabled = true, ServerName = "h1" }); + + SetField(ds, "_eventConnection", new HistorianAccess()); + SetField(ds, "_activeEventNode", (string?)null); + + var snap = ds.GetHealthSnapshot(); + (snap.EventConnectionOpen == (snap.ActiveEventNode != null)).ShouldBeTrue( + "snapshot must not advertise event open with no node"); + } + + /// + /// The other direction: connection cleared but node still populated (the failure path + /// between the two field clears). The snapshot must still pair them consistently. + /// + [Fact] + public void Snapshot_with_connection_cleared_but_active_node_populated_is_consistent() + { + var ds = new HistorianDataSource( + new HistorianConfiguration { Enabled = true, ServerName = "h1" }); + + SetField(ds, "_connection", (HistorianAccess?)null); + SetField(ds, "_activeProcessNode", "node-stale"); + + var snap = ds.GetHealthSnapshot(); + (snap.ProcessConnectionOpen == (snap.ActiveProcessNode != null)).ShouldBeTrue( + "snapshot must not advertise closed with a node still set"); + } + + /// + /// Steady-state happy path: both fields populated — snapshot reports both consistently. + /// + [Fact] + public void Snapshot_with_both_fields_populated_reports_open_and_active_node() + { + var ds = new HistorianDataSource( + new HistorianConfiguration { Enabled = true, ServerName = "h1" }); + + SetField(ds, "_connection", new HistorianAccess()); + SetField(ds, "_activeProcessNode", "h1"); + + var snap = ds.GetHealthSnapshot(); + snap.ProcessConnectionOpen.ShouldBeTrue(); + snap.ActiveProcessNode.ShouldBe("h1"); + } + + /// + /// Steady-state default (no connect attempted): both null. + /// + [Fact] + public void Snapshot_with_default_fields_reports_closed_with_no_active_node() + { + var ds = new HistorianDataSource( + new HistorianConfiguration { Enabled = true, ServerName = "h1" }); + + var snap = ds.GetHealthSnapshot(); + snap.ProcessConnectionOpen.ShouldBeFalse(); + snap.ActiveProcessNode.ShouldBeNull(); + snap.EventConnectionOpen.ShouldBeFalse(); + snap.ActiveEventNode.ShouldBeNull(); + } + + private static void SetField(object target, string name, object? value) + { + var f = target.GetType().GetField(name, BindingFlags.Instance | BindingFlags.NonPublic); + f.ShouldNotBeNull($"private field '{name}' must exist on {target.GetType().Name}"); + f!.SetValue(target, value); + } +} diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceRequestTimeoutTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceRequestTimeoutTests.cs new file mode 100644 index 0000000..7269e5c --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceRequestTimeoutTests.cs @@ -0,0 +1,109 @@ +using System; +using System.Reflection; +using System.Threading; +using System.Threading.Tasks; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests.Backend; + +/// +/// Driver.Historian.Wonderware-010 regression. +/// was documented as the "outer safety timeout applied to sync-over-async Historian +/// operations" but was never read or enforced — a hung StartQuery or a slow +/// MoveNext could block the single pipe-server connection thread indefinitely. +/// The fix wires it into the read paths via a linked +/// so the documented safety net actually exists. +/// +/// The SDK-touching read methods cannot be unit-driven without a live AVEVA Historian. +/// This test pins the helper that derives the effective timeout from the config — the +/// read methods invoke that helper, so a regression in either the helper or the wiring +/// would break the test. +/// +[Trait("Category", "Unit")] +public sealed class HistorianDataSourceRequestTimeoutTests +{ + [Fact] + public void Default_request_timeout_is_60_seconds() + { + new HistorianConfiguration().RequestTimeoutSeconds.ShouldBe(60); + } + + [Fact] + public void Positive_request_timeout_is_used_verbatim() + { + InvokeBuildLinkedTokenSource( + new HistorianConfiguration { RequestTimeoutSeconds = 30 }, + CancellationToken.None, + out var cts); + cts.ShouldNotBeNull(); + // The helper must wire CancelAfter — easiest cross-check is to observe that the + // returned CTS is NOT already cancelled, and that disposing it is safe. + cts!.IsCancellationRequested.ShouldBeFalse(); + cts.Dispose(); + } + + [Fact] + public void Zero_or_negative_request_timeout_is_treated_as_no_timeout() + { + // A zero/negative value means "no outer timeout" — the helper must still return a + // linked CTS so callers can use one code path, but it must not auto-cancel. + InvokeBuildLinkedTokenSource( + new HistorianConfiguration { RequestTimeoutSeconds = 0 }, + CancellationToken.None, + out var cts); + cts.ShouldNotBeNull(); + cts!.IsCancellationRequested.ShouldBeFalse(); + // Give the runtime a moment — a misconfigured CancelAfter(0) would fire immediately. + Thread.Sleep(50); + cts.IsCancellationRequested.ShouldBeFalse("RequestTimeoutSeconds <= 0 must not auto-cancel"); + cts.Dispose(); + } + + [Fact] + public async Task Small_timeout_cancels_the_linked_token() + { + // 50 ms timeout — sleep 250 ms then assert the linked CTS has fired. + InvokeBuildLinkedTokenSource( + new HistorianConfiguration { RequestTimeoutSeconds = 1 }, // smallest non-zero whole-second value + CancellationToken.None, + out var cts); + cts.ShouldNotBeNull(); + + // The wall-clock cost of waiting a full second per test is acceptable — this + // pins the actual CancelAfter wiring rather than just the conditional logic. + await Task.Delay(1500); + cts!.IsCancellationRequested.ShouldBeTrue("RequestTimeoutSeconds=1 must cancel within 1.5s"); + cts.Dispose(); + } + + [Fact] + public void Inbound_cancellation_propagates_into_the_linked_token() + { + using var outer = new CancellationTokenSource(); + InvokeBuildLinkedTokenSource( + new HistorianConfiguration { RequestTimeoutSeconds = 60 }, + outer.Token, + out var cts); + cts.ShouldNotBeNull(); + cts!.IsCancellationRequested.ShouldBeFalse(); + + outer.Cancel(); + cts.IsCancellationRequested.ShouldBeTrue("cancelling the caller's CT must cancel the linked CTS"); + cts.Dispose(); + } + + private static void InvokeBuildLinkedTokenSource( + HistorianConfiguration cfg, CancellationToken ct, out CancellationTokenSource? cts) + { + // The helper is internal so the InternalsVisibleTo on the data-source project lets + // us bind to it directly. Reflection keeps the test resilient if the method name is + // ever shortened. + var method = typeof(HistorianDataSource) + .GetMethod("BuildRequestCts", BindingFlags.Static | BindingFlags.NonPublic); + method.ShouldNotBeNull( + "HistorianDataSource.BuildRequestCts must exist — wires RequestTimeoutSeconds into the read paths"); + cts = (CancellationTokenSource?)method!.Invoke(null, new object[] { cfg, ct }); + } +} diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceStartQueryClassificationTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceStartQueryClassificationTests.cs new file mode 100644 index 0000000..bef731f --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceStartQueryClassificationTests.cs @@ -0,0 +1,50 @@ +using ArchestrA; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests.Backend; + +/// +/// Driver.Historian.Wonderware-008 regression. The previous implementation unconditionally +/// called HandleConnectionError() whenever StartQuery returned false, +/// which tore down the (relatively expensive) shared SDK connection on a query-class error +/// such as a bad tag name. A burst of bad-tag queries could therefore push an otherwise +/// healthy cluster node into cooldown via the picker's MarkFailed. The fix +/// classifies the SDK error code: connection-class codes drop the connection; query-class +/// codes leave it intact. +/// +[Trait("Category", "Unit")] +public sealed class HistorianDataSourceStartQueryClassificationTests +{ + // ── Connection-class codes — the connection should be reset ─────────── + + [Theory] + [InlineData(HistorianAccessError.ErrorValue.FailedToConnect)] + [InlineData(HistorianAccessError.ErrorValue.FailedToCreateSession)] + [InlineData(HistorianAccessError.ErrorValue.NoReply)] + [InlineData(HistorianAccessError.ErrorValue.NotReady)] + [InlineData(HistorianAccessError.ErrorValue.NotInitialized)] + [InlineData(HistorianAccessError.ErrorValue.Stopping)] + [InlineData(HistorianAccessError.ErrorValue.Win32Exception)] + [InlineData(HistorianAccessError.ErrorValue.InvalidResponse)] + public void Connection_class_codes_are_classified_as_connection_errors(HistorianAccessError.ErrorValue code) + { + HistorianDataSource.IsConnectionClassError(code).ShouldBeTrue( + $"{code} is a connection/server failure — the SDK connection should be reset"); + } + + // ── Query-class codes — the connection should NOT be reset ──────────── + + [Theory] + [InlineData(HistorianAccessError.ErrorValue.InvalidArgument)] // bad tag name, etc. + [InlineData(HistorianAccessError.ErrorValue.ValidationFailed)] // bad query args + [InlineData(HistorianAccessError.ErrorValue.NotApplicable)] // wrong tag kind for query + [InlineData(HistorianAccessError.ErrorValue.NotImplemented)] // unsupported aggregate + [InlineData(HistorianAccessError.ErrorValue.NoData)] // empty range + public void Query_class_codes_are_NOT_classified_as_connection_errors(HistorianAccessError.ErrorValue code) + { + HistorianDataSource.IsConnectionClassError(code).ShouldBeFalse( + $"{code} is a query payload problem — must NOT tear down the SDK connection"); + } +} diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceValueAndAggregateTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceValueAndAggregateTests.cs new file mode 100644 index 0000000..f2e06c4 --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/HistorianDataSourceValueAndAggregateTests.cs @@ -0,0 +1,122 @@ +using System.Runtime.Serialization; +using ArchestrA; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests.Backend; + +/// +/// Driver.Historian.Wonderware-012 coverage — pins the two static helpers on +/// that previously had no direct tests: +/// (the string-vs-numeric heuristic +/// for the raw + at-time read paths) and +/// (the aggregate-column dispatch). The SDK HistoryQueryResult initialises internal +/// state lazily on first property access, which makes it impractical to fake via +/// ; the heuristic was therefore +/// refactored into an SDK-independent overload that the tests drive directly. +/// +[Trait("Category", "Unit")] +public sealed class HistorianDataSourceValueAndAggregateTests +{ + // ── SelectValueFromPair ─────────────────────────────────────────────── + + [Fact] + public void SelectValueFromPair_returns_numeric_value_when_StringValue_is_empty() + { + HistorianDataSource.SelectValueFromPair(42.5, string.Empty).ShouldBe(42.5); + } + + [Fact] + public void SelectValueFromPair_returns_numeric_value_when_Value_is_non_zero_even_with_StringValue_populated() + { + // Tag is numeric and sampled non-zero; the SDK may still populate a formatted + // StringValue but the value path wins. + HistorianDataSource.SelectValueFromPair(3.14, "3.14").ShouldBe(3.14); + } + + [Fact] + public void SelectValueFromPair_returns_StringValue_when_Value_is_zero_and_StringValue_non_empty() + { + // String tags in the SDK always project Value=0 — that's the documented heuristic. + HistorianDataSource.SelectValueFromPair(0.0, "Ready").ShouldBe("Ready"); + } + + [Fact] + public void SelectValueFromPair_returns_numeric_zero_when_Value_is_zero_and_StringValue_empty() + { + // Numeric tag legitimately samples zero, no formatted text — must remain numeric. + HistorianDataSource.SelectValueFromPair(0.0, string.Empty).ShouldBe(0.0); + } + + [Fact] + public void SelectValueFromPair_null_StringValue_falls_back_to_numeric() + { + HistorianDataSource.SelectValueFromPair(7.7, null).ShouldBe(7.7); + } + + [Fact] + public void SelectValueFromPair_documented_edge_case_numeric_zero_with_formatted_string_returns_string() + { + // The doc comment on SelectValue calls this out as a known SDK-binding edge case: + // "A numeric tag at exactly zero with a non-empty formatted StringValue (e.g. '0.00') + // would be mis-reported as a string". This test pins that documented behaviour so + // a future SDK upgrade that surfaces a real data-type field can replace the + // heuristic deliberately rather than by accident. + HistorianDataSource.SelectValueFromPair(0.0, "0.00").ShouldBe("0.00"); + } + + // ── ExtractAggregateValue ───────────────────────────────────────────── + + [Theory] + [InlineData("Average", 10.0)] + [InlineData("Minimum", 1.0)] + [InlineData("Maximum", 20.0)] + [InlineData("First", 2.0)] + [InlineData("Last", 8.0)] + [InlineData("StdDev", 1.5)] + public void ExtractAggregateValue_dispatches_known_columns(string column, double expected) + { + var result = NewAggregateResult(); + result.Average = 10.0; + result.Minimum = 1.0; + result.Maximum = 20.0; + result.ValueCount = 5; + result.First = 2.0; + result.Last = 8.0; + result.StdDev = 1.5; + + HistorianDataSource.ExtractAggregateValue(result, column).ShouldBe(expected); + } + + [Fact] + public void ExtractAggregateValue_ValueCount_dispatches_to_uint_field() + { + var result = NewAggregateResult(); + result.ValueCount = 42; + HistorianDataSource.ExtractAggregateValue(result, "ValueCount").ShouldBe(42.0); + } + + [Fact] + public void ExtractAggregateValue_unknown_column_returns_null() + { + // Unknown column → null → IPC sample carries no value → client maps to BadNoData. + HistorianDataSource.ExtractAggregateValue(NewAggregateResult(), "NotAColumn").ShouldBeNull(); + } + + [Fact] + public void ExtractAggregateValue_case_sensitive_dispatch() + { + // The switch is case-sensitive — "average" (lowercase) does NOT dispatch. Pinned so + // the canonical column-name casing is preserved across refactors. + var result = NewAggregateResult(); + result.Average = 99.0; + HistorianDataSource.ExtractAggregateValue(result, "average").ShouldBeNull(); + HistorianDataSource.ExtractAggregateValue(result, "Average").ShouldBe(99.0); + } + + private static AnalogSummaryQueryResult NewAggregateResult() + { + return (AnalogSummaryQueryResult)FormatterServices.GetUninitializedObject(typeof(AnalogSummaryQueryResult)); + } +} diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/SdkAlarmHistorianWriteBackendTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/SdkAlarmHistorianWriteBackendTests.cs index 2be0944..5d373db 100644 --- a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/SdkAlarmHistorianWriteBackendTests.cs +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Backend/SdkAlarmHistorianWriteBackendTests.cs @@ -103,6 +103,58 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Tests SdkAlarmHistorianWriteBackend.ClassifyOutcome(code).ShouldBe(expected); } + // ── ToHistorianEvent — EventId handling ─────────────────────────────── + + [Fact] + public void ToHistorianEvent_parseable_event_id_is_used_verbatim() + { + // Sanity case: a real GUID round-trips into HistorianEvent.Id. + var id = Guid.Parse("12345678-1234-1234-1234-123456789abc"); + var dto = new AlarmHistorianEventDto + { + EventId = id.ToString(), + SourceName = "Tank01", + AlarmType = "AnalogLimitAlarm.HiHi", + EventTimeUtcTicks = DateTime.UtcNow.Ticks, + }; + +#pragma warning disable CS0618 + SdkAlarmHistorianWriteBackend.ToHistorianEvent(dto).Id.ShouldBe(id); +#pragma warning restore CS0618 + } + + [Fact] + public void ToHistorianEvent_unparseable_event_id_synthesizes_unique_non_empty_Guid() + { + // Driver.Historian.Wonderware-004 regression: when EventId is not a parseable + // GUID (or is empty) the previous implementation silently left HistorianEvent.Id + // as Guid.Empty, so multiple alarms collided on the same id with no warning. + // The fix synthesizes a fresh Guid so every event still gets a unique identifier. + var dtoA = new AlarmHistorianEventDto + { + EventId = "not-a-guid", + SourceName = "Tank01", + AlarmType = "Active", + EventTimeUtcTicks = DateTime.UtcNow.Ticks, + }; + var dtoB = new AlarmHistorianEventDto + { + EventId = string.Empty, + SourceName = "Tank01", + AlarmType = "Active", + EventTimeUtcTicks = DateTime.UtcNow.Ticks, + }; + +#pragma warning disable CS0618 + var idA = SdkAlarmHistorianWriteBackend.ToHistorianEvent(dtoA).Id; + var idB = SdkAlarmHistorianWriteBackend.ToHistorianEvent(dtoB).Id; +#pragma warning restore CS0618 + + idA.ShouldNotBe(Guid.Empty, "unparseable EventId must not collapse to Guid.Empty"); + idB.ShouldNotBe(Guid.Empty, "empty EventId must not collapse to Guid.Empty"); + idA.ShouldNotBe(idB, "every event needs a unique synthesized id"); + } + [Fact] public void ClassifyOutcome_WriteToReadOnlyFile_is_RetryPlease_not_PermanentFail() { diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Ipc/PipeServerSidRejectTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Ipc/PipeServerSidRejectTests.cs new file mode 100644 index 0000000..a62c40e --- /dev/null +++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests/Ipc/PipeServerSidRejectTests.cs @@ -0,0 +1,83 @@ +using System; +using System.IO.Pipes; +using System.Security.Principal; +using System.Threading; +using System.Threading.Tasks; +using MessagePack; +using Serilog; +using Serilog.Core; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Ipc; + +namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Tests.Ipc; + +/// +/// Driver.Historian.Wonderware-007 regression. The two other rejection paths +/// (shared-secret-mismatch and major-version-mismatch) both write a +/// with Accepted=false before disconnecting; the caller-SID-mismatch path used to +/// just disconnect abruptly, leaving the client to time out instead of learning why. +/// The fix sends a symmetric caller-sid-mismatch ack before disconnecting. +/// +/// The test uses the internal test-seam constructor so the verifier rejects without +/// needing to actually relax the pipe ACL (which would block the test client itself). +/// +public sealed class PipeServerSidRejectTests +{ + private static readonly ILogger Quiet = Logger.None; + + [Fact] + public async Task Caller_SID_mismatch_sends_HelloAck_with_reject_reason_before_disconnect() + { + // The pipe ACL must allow the current process to connect — so wire up the pipe + // with the current user's SID. Then have the verifier seam simulate the SID + // mismatch by returning false. This isolates the "what does the server do on a + // rejected caller" question from the (separate) "is the ACL correct" question. + var current = WindowsIdentity.GetCurrent().User + ?? throw new InvalidOperationException("WindowsIdentity.GetCurrent().User was null — cannot run test"); + + var pipeName = $"otopcua-hist-sidreject-test-{Guid.NewGuid():N}"; + + PipeServer.CallerVerifier rejecting = (NamedPipeServerStream _, SecurityIdentifier _, out string reason) => + { + reason = "synthetic-mismatch"; + return false; + }; + using var server = new PipeServer(pipeName, current, "secret", Quiet, rejecting); + + var serverTask = Task.Run(() => server.RunOneConnectionAsync(new NoopHandler(), CancellationToken.None)); + + using var client = new NamedPipeClientStream(".", pipeName, PipeDirection.InOut, PipeOptions.Asynchronous); + await client.ConnectAsync(5_000); + + using var writer = new FrameWriter(client, leaveOpen: true); + using var reader = new FrameReader(client, leaveOpen: true); + + var hello = new Hello { ProtocolMajor = Hello.CurrentMajor, PeerName = "test", SharedSecret = "secret" }; + await writer.WriteAsync(MessageKind.Hello, hello, CancellationToken.None); + + // Read the rejecting HelloAck the server is expected to send before disconnecting. + var frame = await reader.ReadFrameAsync(CancellationToken.None); + frame.ShouldNotBeNull("server must send a HelloAck on caller-SID rejection, not just disconnect"); + frame!.Value.Kind.ShouldBe(MessageKind.HelloAck); + + var ack = MessagePackSerializer.Deserialize(frame.Value.Body); + ack.Accepted.ShouldBeFalse(); + ack.RejectReason.ShouldNotBeNullOrEmpty(); + ack.RejectReason!.ShouldContain("caller-sid-mismatch", + Case.Insensitive, + "reject reason must match the documented caller-sid-mismatch tag so clients can diagnose"); + + await serverTask; + } + + /// Handler that asserts it is never called — the connection must be rejected at Hello. + private sealed class NoopHandler : IFrameHandler + { + public Task HandleAsync(MessageKind kind, byte[] body, FrameWriter writer, CancellationToken ct) + { + throw new InvalidOperationException( + $"Handler must not be reached on a rejected caller; got frame {kind}"); + } + } +}