using System;
using System.Linq;
using System.Threading.Tasks;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Host.Configuration;

namespace ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests
{
    /// <summary>
    /// End-to-end behavior of the cluster endpoint picker wired into
    /// <c>HistorianDataSource</c>. Verifies that a failing node is skipped on the next
    /// attempt, that the picker state is shared across process + event silos, and that the
    /// health snapshot surfaces the winning node.
    /// </summary>
    public class HistorianClusterFailoverTests
    {
        /// <summary>
        /// Builds an enabled cluster configuration over <paramref name="nodes"/> with a
        /// 60-second failure cooldown, matching the production defaults the tests exercise.
        /// </summary>
        private static HistorianConfiguration ClusterConfig(params string[] nodes) => new()
        {
            Enabled = true,
            ServerNames = nodes.ToList(),
            Port = 32568,
            IntegratedSecurity = true,
            CommandTimeoutSeconds = 5,
            FailureCooldownSeconds = 60
        };

        [Fact]
        public async Task Connect_FirstNodeFails_PicksSecond()
        {
            // host-a fails during connect; host-b connects successfully. The fake returns an
            // unconnected HistorianAccess on success, so the query phase will subsequently trip
            // HandleConnectionError on host-b — that's expected. The observable signal is that
            // the picker tried host-a first, skipped to host-b, and host-a's failure was recorded.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });

            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(2);

            // Hoist the node lookup once instead of re-enumerating per assertion.
            var hostA = snap.Nodes.Single(n => n.Name == "host-a");
            hostA.IsHealthy.ShouldBeFalse();
            hostA.FailureCount.ShouldBe(1);
            hostA.LastError.ShouldContain("A down");
        }

        [Fact]
        public async Task Connect_AllNodesFail_ReturnsEmptyResults_AndAllInCooldown()
        {
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });

            var snap = ds.GetHealthSnapshot();
            snap.ActiveProcessNode.ShouldBeNull();
            snap.HealthyNodeCount.ShouldBe(0);
            snap.TotalFailures.ShouldBe(1); // one read call failed (after all cluster tries)
            snap.LastError.ShouldContain("All 2 healthy historian candidate(s) failed");
            snap.LastError.ShouldContain("B down"); // last inner exception preserved
        }

        [Fact]
        public async Task Connect_SecondCall_SkipsCooledDownNode()
        {
            // After first call: host-a is in cooldown (60s), host-b is also marked failed via
            // HandleConnectionError since the fake connection doesn't support real queries.
            // Second call: both are in cooldown and the picker returns empty → the read method
            // catches the "all nodes failed" exception and returns empty without retrying connect.
            // We verify this by checking that the second call adds NOTHING to the connect history.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b"); // 60s cooldown
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
            factory.ConnectHistory.Clear();

            var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // Both nodes are in cooldown → picker returns empty → factory is not called at all.
            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBeEmpty();
        }

        [Fact]
        public async Task Connect_SingleNodeConfig_BehavesLikeLegacy()
        {
            var factory = new FakeHistorianConnectionFactory();
            var config = new HistorianConfiguration
            {
                Enabled = true,
                ServerName = "legacy-host",
                Port = 32568,
                FailureCooldownSeconds = 0
            };
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            factory.ConnectHistory.ShouldBe(new[] { "legacy-host" });

            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(1);
            snap.Nodes.Single().Name.ShouldBe("legacy-host");
        }

        [Fact]
        public async Task Connect_PickerOrderRespected()
        {
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            factory.ServerBehaviors["host-c"] = new InvalidOperationException("C down");
            var config = ClusterConfig("host-a", "host-b", "host-c");
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // Candidates are tried in configuration order.
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b", "host-c" });
        }

        [Fact]
        public async Task Connect_SharedPickerAcrossProcessAndEventSilos()
        {
            // Process path tries host-a, fails, then tries host-b. host-a is in cooldown. When
            // the event path subsequently starts with a 0s cooldown, the picker state is shared:
            // host-a is still marked failed (via its cooldown window) at the moment the event
            // silo asks. The event path therefore must not retry host-a.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            // Process path: host-a fails → host-b reached (then torn down mid-query via the fake).
            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // At this point host-a and host-b are both in cooldown. ReadEvents will hit the
            // picker's empty-healthy-list path and return empty without calling the factory.
            factory.ConnectHistory.Clear();

            var events = await ds.ReadEventsAsync(null, DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            events.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBeEmpty();

            // Critical assertion: host-a was NOT retried by the event silo — it's in the
            // shared cooldown from the process path's failure.
            factory.ConnectHistory.ShouldNotContain("host-a");
        }
    }
}