Files
lmxopcua/tests/ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests/HistorianClusterFailoverTests.cs
Joseph Doherty 3b2defd94f Phase 0 — mechanical rename ZB.MOM.WW.LmxOpcUa.* → ZB.MOM.WW.OtOpcUa.*
Renames all 11 projects (5 src + 6 tests), the .slnx solution file, all source-file namespaces, all axaml namespace references, and all v1 documentation references in CLAUDE.md and docs/*.md (excluding docs/v2/ which is already in OtOpcUa form). Also updates the TopShelf service registration name from "LmxOpcUa" to "OtOpcUa" per Phase 0 Task 0.6.

Preserves runtime identifiers per Phase 0 Out-of-Scope rules to avoid breaking v1/v2 client trust during coexistence: OPC UA `ApplicationUri` defaults (`urn:{GalaxyName}:LmxOpcUa`), server `EndpointPath` (`/LmxOpcUa`), `ServerName` default (feeds cert subject CN), `MxAccessConfiguration.ClientName` default (defensive — stays "LmxOpcUa" for MxAccess audit-trail consistency), client OPC UA identifiers (`ApplicationName = "LmxOpcUaClient"`, `ApplicationUri = "urn:localhost:LmxOpcUaClient"`, cert directory `%LocalAppData%\LmxOpcUaClient\pki\`), and the `LmxOpcUaServer` class name (class rename out of Phase 0 scope per Task 0.5 sed pattern; happens in Phase 1 alongside `LmxNodeManager → GenericDriverNodeManager` Core extraction). 23 LmxOpcUa references retained, all enumerated and justified in `docs/v2/implementation/exit-gate-phase-0.md`.

Build clean: 0 errors, 30 warnings (lower than baseline 167). Tests at strict improvement over baseline: 821 passing / 1 failing vs baseline 820 / 2 (one flaky pre-existing failure passed this run; the other still fails — both pre-existing and unrelated to the rename). `Client.UI.Tests`, `Historian.Aveva.Tests`, `Client.Shared.Tests`, `IntegrationTests` all match baseline exactly. Exit gate compliance results recorded in `docs/v2/implementation/exit-gate-phase-0.md` with all 7 checks PASS or DEFERRED-to-PR-review (#7 service install verification needs Windows service permissions on the reviewer's box).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 13:57:47 -04:00

167 lines
7.9 KiB
C#

using System;
using System.Linq;
using System.Threading.Tasks;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Host.Configuration;
namespace ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests
{
/// <summary>
/// End-to-end behavior of the cluster endpoint picker wired into
/// <see cref="HistorianDataSource"/>. Verifies that a failing node is skipped on the next
/// attempt, that the picker state is shared across process + event silos, and that the
/// health snapshot surfaces the winning node.
/// </summary>
public class HistorianClusterFailoverTests
{
    /// <summary>
    /// Builds a multi-node cluster configuration. The 60s cooldown is deliberately longer
    /// than any single test, so a node that fails once stays in cooldown for the rest of
    /// that test — several tests below rely on this.
    /// </summary>
    private static HistorianConfiguration ClusterConfig(params string[] nodes) => new()
    {
        Enabled = true,
        ServerNames = nodes.ToList(),
        Port = 32568,
        IntegratedSecurity = true,
        CommandTimeoutSeconds = 5,
        FailureCooldownSeconds = 60
    };

    [Fact]
    public async Task Connect_FirstNodeFails_PicksSecond()
    {
        // host-a fails during connect; host-b connects successfully. The fake returns an
        // unconnected HistorianAccess on success, so the query phase will subsequently trip
        // HandleConnectionError on host-b — that's expected. The observable signal is that
        // the picker tried host-a first, skipped to host-b, and host-a's failure was recorded.
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        var config = ClusterConfig("host-a", "host-b");
        using var ds = new HistorianDataSource(config, factory);

        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });
        var snap = ds.GetHealthSnapshot();
        snap.NodeCount.ShouldBe(2);
        var hostA = snap.Nodes.Single(n => n.Name == "host-a");
        hostA.IsHealthy.ShouldBeFalse();
        hostA.FailureCount.ShouldBe(1);
        hostA.LastError.ShouldContain("A down");
    }

    [Fact]
    public async Task Connect_AllNodesFail_ReturnsEmptyResults_AndAllInCooldown()
    {
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
        var config = ClusterConfig("host-a", "host-b");
        using var ds = new HistorianDataSource(config, factory);

        var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        results.Count.ShouldBe(0);
        factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });
        var snap = ds.GetHealthSnapshot();
        snap.ActiveProcessNode.ShouldBeNull();
        snap.HealthyNodeCount.ShouldBe(0);
        snap.TotalFailures.ShouldBe(1); // one read call failed (after all cluster tries)
        snap.LastError.ShouldContain("All 2 healthy historian candidate(s) failed");
        snap.LastError.ShouldContain("B down"); // last inner exception preserved
    }

    [Fact]
    public async Task Connect_SecondCall_SkipsCooledDownNode()
    {
        // After first call: host-a is in cooldown (60s), host-b is also marked failed via
        // HandleConnectionError since the fake connection doesn't support real queries.
        // Second call: both are in cooldown and the picker returns empty → the read method
        // catches the "all nodes failed" exception and returns empty without retrying connect.
        // We verify this by checking that the second call adds NOTHING to the connect history.
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        var config = ClusterConfig("host-a", "host-b"); // 60s cooldown
        using var ds = new HistorianDataSource(config, factory);
        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
        factory.ConnectHistory.Clear();

        var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        // Both nodes are in cooldown → picker returns empty → factory is not called at all.
        results.Count.ShouldBe(0);
        factory.ConnectHistory.ShouldBeEmpty();
    }

    [Fact]
    public async Task Connect_SingleNodeConfig_BehavesLikeLegacy()
    {
        var factory = new FakeHistorianConnectionFactory();
        var config = new HistorianConfiguration
        {
            Enabled = true,
            ServerName = "legacy-host",
            Port = 32568,
            FailureCooldownSeconds = 0
        };
        using var ds = new HistorianDataSource(config, factory);

        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        factory.ConnectHistory.ShouldBe(new[] { "legacy-host" });
        var snap = ds.GetHealthSnapshot();
        snap.NodeCount.ShouldBe(1);
        snap.Nodes.Single().Name.ShouldBe("legacy-host");
    }

    [Fact]
    public async Task Connect_PickerOrderRespected()
    {
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
        factory.ServerBehaviors["host-c"] = new InvalidOperationException("C down");
        var config = ClusterConfig("host-a", "host-b", "host-c");
        using var ds = new HistorianDataSource(config, factory);

        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        // Candidates are tried in configuration order.
        factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b", "host-c" });
    }

    [Fact]
    public async Task Connect_SharedPickerAcrossProcessAndEventSilos()
    {
        // Process path tries host-a, fails, then tries host-b. host-a is in cooldown. When
        // the event path subsequently starts with a 0s cooldown, the picker state is shared:
        // host-a is still marked failed (via its cooldown window) at the moment the event
        // silo asks. The event path therefore must not retry host-a.
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        var config = ClusterConfig("host-a", "host-b");
        using var ds = new HistorianDataSource(config, factory);
        // Process path: host-a fails → host-b reached (then torn down mid-query via the fake).
        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
        // At this point host-a and host-b are both in cooldown. ReadEvents will hit the
        // picker's empty-healthy-list path and return empty without calling the factory.
        factory.ConnectHistory.Clear();

        var events = await ds.ReadEventsAsync(null, DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        events.Count.ShouldBe(0);
        factory.ConnectHistory.ShouldBeEmpty();
        // Critical assertion: host-a was NOT retried by the event silo — it's in the
        // shared cooldown from the process path's failure.
        factory.ConnectHistory.ShouldNotContain("host-a");
    }
}
}