Renames all 11 projects (5 src + 6 tests), the .slnx solution file, all source-file namespaces, all .axaml namespace references, and all v1 documentation references in CLAUDE.md and docs/*.md (excluding docs/v2/, which is already in OtOpcUa form). Also updates the TopShelf service registration name from "LmxOpcUa" to "OtOpcUa" per Phase 0 Task 0.6, as sketched below.
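For the service-registration piece, a minimal TopShelf sketch under the new name might look like the following; the service wrapper class, display name, and description are placeholders, and only the "OtOpcUa" service name comes from this change:

```csharp
using Topshelf;

internal static class Program
{
    private static void Main()
    {
        HostFactory.Run(host =>
        {
            // Hypothetical service wrapper; the real host class lives in the renamed Host project.
            host.Service<OpcUaServerService>(svc =>
            {
                svc.ConstructUsing(_ => new OpcUaServerService());
                svc.WhenStarted(s => s.Start());
                svc.WhenStopped(s => s.Stop());
            });

            // Phase 0 Task 0.6: the Windows service now registers as "OtOpcUa" (was "LmxOpcUa").
            host.SetServiceName("OtOpcUa");
            host.SetDisplayName("OtOpcUa");                       // placeholder display name
            host.SetDescription("Generic OPC UA driver service"); // placeholder description
        });
    }
}

// Placeholder stand-in for the real service class.
internal sealed class OpcUaServerService
{
    public void Start() { /* start the OPC UA server host */ }
    public void Stop() { /* stop it */ }
}
```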
Preserves runtime identifiers per Phase 0 Out-of-Scope rules to avoid breaking v1/v2 client trust during coexistence:
- OPC UA `ApplicationUri` defaults (`urn:{GalaxyName}:LmxOpcUa`)
- server `EndpointPath` (`/LmxOpcUa`)
- `ServerName` default (feeds the certificate subject CN)
- `MxAccessConfiguration.ClientName` default (defensive: stays "LmxOpcUa" for MxAccess audit-trail consistency)
- client OPC UA identifiers (`ApplicationName = "LmxOpcUaClient"`, `ApplicationUri = "urn:localhost:LmxOpcUaClient"`, cert directory `%LocalAppData%\LmxOpcUaClient\pki\`)
- the `LmxOpcUaServer` class name (the class rename is out of Phase 0 scope per the Task 0.5 sed pattern; it happens in Phase 1 alongside the `LmxNodeManager → GenericDriverNodeManager` Core extraction)

In total, 23 LmxOpcUa references are retained, all enumerated and justified in `docs/v2/implementation/exit-gate-phase-0.md`.
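To make the split concrete, here is a minimal sketch of renamed vs. preserved strings; the holder class and its member names are hypothetical, and only the literal values come from this change:

```csharp
// Illustrative sketch only: shows which strings Phase 0 renames and which it preserves.
// The namespace is renamed; the defaults keep their v1 values so existing client trust
// (certificates, endpoint URLs, MxAccess audit entries) keeps working during coexistence.
namespace ZB.MOM.WW.OtOpcUa.Host.Configuration   // renamed from the LmxOpcUa namespace
{
    public static class PreservedV1Defaults       // hypothetical holder for the defaults
    {
        // Server side: ApplicationUri template and endpoint path stay on the v1 names.
        public const string ApplicationUriTemplate = "urn:{GalaxyName}:LmxOpcUa";
        public const string EndpointPath = "/LmxOpcUa";

        // MxAccess client name stays "LmxOpcUa" for audit-trail consistency.
        public const string MxAccessClientName = "LmxOpcUa";

        // Client side: OPC UA identity and certificate store keep the v1 values.
        public const string ClientApplicationName = "LmxOpcUaClient";
        public const string ClientApplicationUri = "urn:localhost:LmxOpcUaClient";
        public const string ClientPkiDirectory = @"%LocalAppData%\LmxOpcUaClient\pki\";
    }
}
```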
Build is clean: 0 errors, 30 warnings (down from the 167-warning baseline). Tests are a strict improvement over baseline: 821 passing / 1 failing vs. 820 / 2 at baseline (one flaky pre-existing failure happened to pass this run; the other still fails; both are pre-existing and unrelated to the rename). `Client.UI.Tests`, `Historian.Aveva.Tests`, `Client.Shared.Tests`, and `IntegrationTests` all match baseline exactly. Exit-gate compliance results are recorded in `docs/v2/implementation/exit-gate-phase-0.md`, with all 7 checks PASS or DEFERRED-to-PR-review (check #7, service install verification, needs Windows service permissions on the reviewer's box).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
using System;
using System.Linq;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Host.Configuration;

namespace ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests
{
    /// <summary>
    /// End-to-end behavior of the cluster endpoint picker wired into
    /// <see cref="HistorianDataSource"/>. Verifies that a failing node is skipped on the next
    /// attempt, that the picker state is shared across process + event silos, and that the
    /// health snapshot surfaces the winning node.
    /// </summary>
    public class HistorianClusterFailoverTests
    {
        private static HistorianConfiguration ClusterConfig(params string[] nodes) => new()
        {
            Enabled = true,
            ServerNames = nodes.ToList(),
            Port = 32568,
            IntegratedSecurity = true,
            CommandTimeoutSeconds = 5,
            FailureCooldownSeconds = 60
        };

        [Fact]
        public void Connect_FirstNodeFails_PicksSecond()
        {
            // host-a fails during connect; host-b connects successfully. The fake returns an
            // unconnected HistorianAccess on success, so the query phase will subsequently trip
            // HandleConnectionError on host-b — that's expected. The observable signal is that
            // the picker tried host-a first, skipped to host-b, and host-a's failure was recorded.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });
            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(2);
            snap.Nodes.Single(n => n.Name == "host-a").IsHealthy.ShouldBeFalse();
            snap.Nodes.Single(n => n.Name == "host-a").FailureCount.ShouldBe(1);
            snap.Nodes.Single(n => n.Name == "host-a").LastError.ShouldContain("A down");
        }

        [Fact]
        public void Connect_AllNodesFail_ReturnsEmptyResults_AndAllInCooldown()
        {
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            var results = ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });

            var snap = ds.GetHealthSnapshot();
            snap.ActiveProcessNode.ShouldBeNull();
            snap.HealthyNodeCount.ShouldBe(0);
            snap.TotalFailures.ShouldBe(1); // one read call failed (after all cluster tries)
            snap.LastError.ShouldContain("All 2 healthy historian candidate(s) failed");
            snap.LastError.ShouldContain("B down"); // last inner exception preserved
        }

        [Fact]
        public void Connect_SecondCall_SkipsCooledDownNode()
        {
            // After first call: host-a is in cooldown (60s), host-b is also marked failed via
            // HandleConnectionError since the fake connection doesn't support real queries.
            // Second call: both are in cooldown and the picker returns empty → the read method
            // catches the "all nodes failed" exception and returns empty without retrying connect.
            // We verify this by checking that the second call adds NOTHING to the connect history.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b"); // 60s cooldown
            using var ds = new HistorianDataSource(config, factory);

            ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            factory.ConnectHistory.Clear();
            var results = ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            // Both nodes are in cooldown → picker returns empty → factory is not called at all.
            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBeEmpty();
        }

        [Fact]
        public void Connect_SingleNodeConfig_BehavesLikeLegacy()
        {
            var factory = new FakeHistorianConnectionFactory();
            var config = new HistorianConfiguration
            {
                Enabled = true,
                ServerName = "legacy-host",
                Port = 32568,
                FailureCooldownSeconds = 0
            };
            using var ds = new HistorianDataSource(config, factory);

            ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            factory.ConnectHistory.ShouldBe(new[] { "legacy-host" });
            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(1);
            snap.Nodes.Single().Name.ShouldBe("legacy-host");
        }

        [Fact]
        public void Connect_PickerOrderRespected()
        {
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            factory.ServerBehaviors["host-c"] = new InvalidOperationException("C down");
            var config = ClusterConfig("host-a", "host-b", "host-c");
            using var ds = new HistorianDataSource(config, factory);

            ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            // Candidates are tried in configuration order.
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b", "host-c" });
        }

        [Fact]
        public void Connect_SharedPickerAcrossProcessAndEventSilos()
        {
            // Process path tries host-a, fails, then tries host-b. host-a is in cooldown. When
            // the event path subsequently starts with a 0s cooldown, the picker state is shared:
            // host-a is still marked failed (via its cooldown window) at the moment the event
            // silo asks. The event path therefore must not retry host-a.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            // Process path: host-a fails → host-b reached (then torn down mid-query via the fake).
            ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            // At this point host-a and host-b are both in cooldown. ReadEvents will hit the
            // picker's empty-healthy-list path and return empty without calling the factory.
            factory.ConnectHistory.Clear();
            var events = ds.ReadEventsAsync(null, DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10)
                .GetAwaiter().GetResult();

            events.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBeEmpty();
            // Critical assertion: host-a was NOT retried by the event silo — it's in the
            // shared cooldown from the process path's failure.
            factory.ConnectHistory.ShouldNotContain("host-a");
        }
    }
}