Files
lmxopcua/tests/ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests/HistorianClusterFailoverTests.cs
Joseph Doherty 3b2defd94f Phase 0 — mechanical rename ZB.MOM.WW.LmxOpcUa.* → ZB.MOM.WW.OtOpcUa.*
Renames all 11 projects (5 src + 6 tests), the .slnx solution file, all source-file namespaces, all axaml namespace references, and all v1 documentation references in CLAUDE.md and docs/*.md (excluding docs/v2/ which is already in OtOpcUa form). Also updates the TopShelf service registration name from "LmxOpcUa" to "OtOpcUa" per Phase 0 Task 0.6.

Preserves runtime identifiers per Phase 0 Out-of-Scope rules to avoid breaking v1/v2 client trust during coexistence: OPC UA `ApplicationUri` defaults (`urn:{GalaxyName}:LmxOpcUa`), server `EndpointPath` (`/LmxOpcUa`), `ServerName` default (feeds cert subject CN), `MxAccessConfiguration.ClientName` default (defensive — stays "LmxOpcUa" for MxAccess audit-trail consistency), client OPC UA identifiers (`ApplicationName = "LmxOpcUaClient"`, `ApplicationUri = "urn:localhost:LmxOpcUaClient"`, cert directory `%LocalAppData%\LmxOpcUaClient\pki\`), and the `LmxOpcUaServer` class name (class rename out of Phase 0 scope per Task 0.5 sed pattern; happens in Phase 1 alongside `LmxNodeManager → GenericDriverNodeManager` Core extraction). 23 LmxOpcUa references retained, all enumerated and justified in `docs/v2/implementation/exit-gate-phase-0.md`.

Build clean: 0 errors, 30 warnings (lower than baseline 167). Tests at strict improvement over baseline: 821 passing / 1 failing vs baseline 820 / 2 (one flaky pre-existing failure passed this run; the other still fails — both pre-existing and unrelated to the rename). `Client.UI.Tests`, `Historian.Aveva.Tests`, `Client.Shared.Tests`, `IntegrationTests` all match baseline exactly. Exit gate compliance results recorded in `docs/v2/implementation/exit-gate-phase-0.md` with all 7 checks PASS or DEFERRED-to-PR-review (#7 service install verification needs Windows service permissions on the reviewer's box).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 13:57:47 -04:00

167 lines
7.9 KiB
C#

using System;
using System.Linq;
using System.Threading.Tasks;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Host.Configuration;
namespace ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests
{
/// <summary>
/// End-to-end behavior of the cluster endpoint picker wired into
/// <see cref="HistorianDataSource"/>. Verifies that a failing node is skipped on the next
/// attempt, that the picker state is shared across process + event silos, and that the
/// health snapshot surfaces the winning node.
/// </summary>
public class HistorianClusterFailoverTests
{
    /// <summary>
    /// Builds a multi-node cluster configuration. The 60s cooldown is deliberately longer
    /// than any single test, so a node that fails once stays in cooldown for the rest of
    /// that test — several tests below rely on this.
    /// </summary>
    private static HistorianConfiguration ClusterConfig(params string[] nodes) => new()
    {
        Enabled = true,
        ServerNames = nodes.ToList(),
        Port = 32568,
        IntegratedSecurity = true,
        CommandTimeoutSeconds = 5,
        FailureCooldownSeconds = 60
    };

    [Fact]
    public async Task Connect_FirstNodeFails_PicksSecond()
    {
        // host-a fails during connect; host-b connects successfully. The fake returns an
        // unconnected HistorianAccess on success, so the query phase will subsequently trip
        // HandleConnectionError on host-b — that's expected. The observable signal is that
        // the picker tried host-a first, skipped to host-b, and host-a's failure was recorded.
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        var config = ClusterConfig("host-a", "host-b");
        using var ds = new HistorianDataSource(config, factory);

        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });
        var snap = ds.GetHealthSnapshot();
        snap.NodeCount.ShouldBe(2);
        var hostA = snap.Nodes.Single(n => n.Name == "host-a");
        hostA.IsHealthy.ShouldBeFalse();
        hostA.FailureCount.ShouldBe(1);
        hostA.LastError.ShouldContain("A down");
    }

    [Fact]
    public async Task Connect_AllNodesFail_ReturnsEmptyResults_AndAllInCooldown()
    {
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
        var config = ClusterConfig("host-a", "host-b");
        using var ds = new HistorianDataSource(config, factory);

        var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        results.Count.ShouldBe(0);
        factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });
        var snap = ds.GetHealthSnapshot();
        snap.ActiveProcessNode.ShouldBeNull();
        snap.HealthyNodeCount.ShouldBe(0);
        snap.TotalFailures.ShouldBe(1); // one read call failed (after all cluster tries)
        snap.LastError.ShouldContain("All 2 healthy historian candidate(s) failed");
        snap.LastError.ShouldContain("B down"); // last inner exception preserved
    }

    [Fact]
    public async Task Connect_SecondCall_SkipsCooledDownNode()
    {
        // After first call: host-a is in cooldown (60s), host-b is also marked failed via
        // HandleConnectionError since the fake connection doesn't support real queries.
        // Second call: both are in cooldown and the picker returns empty → the read method
        // catches the "all nodes failed" exception and returns empty without retrying connect.
        // We verify this by checking that the second call adds NOTHING to the connect history.
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        var config = ClusterConfig("host-a", "host-b"); // 60s cooldown
        using var ds = new HistorianDataSource(config, factory);
        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
        factory.ConnectHistory.Clear();

        var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        // Both nodes are in cooldown → picker returns empty → factory is not called at all.
        results.Count.ShouldBe(0);
        factory.ConnectHistory.ShouldBeEmpty();
    }

    [Fact]
    public async Task Connect_SingleNodeConfig_BehavesLikeLegacy()
    {
        var factory = new FakeHistorianConnectionFactory();
        var config = new HistorianConfiguration
        {
            Enabled = true,
            ServerName = "legacy-host",
            Port = 32568,
            FailureCooldownSeconds = 0
        };
        using var ds = new HistorianDataSource(config, factory);

        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        factory.ConnectHistory.ShouldBe(new[] { "legacy-host" });
        var snap = ds.GetHealthSnapshot();
        snap.NodeCount.ShouldBe(1);
        snap.Nodes.Single().Name.ShouldBe("legacy-host");
    }

    [Fact]
    public async Task Connect_PickerOrderRespected()
    {
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
        factory.ServerBehaviors["host-c"] = new InvalidOperationException("C down");
        var config = ClusterConfig("host-a", "host-b", "host-c");
        using var ds = new HistorianDataSource(config, factory);

        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        // Candidates are tried in configuration order.
        factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b", "host-c" });
    }

    [Fact]
    public async Task Connect_SharedPickerAcrossProcessAndEventSilos()
    {
        // Process path tries host-a, fails, then tries host-b. host-a is in cooldown. When
        // the event path subsequently starts with a 0s cooldown, the picker state is shared:
        // host-a is still marked failed (via its cooldown window) at the moment the event
        // silo asks. The event path therefore must not retry host-a.
        var factory = new FakeHistorianConnectionFactory();
        factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
        var config = ClusterConfig("host-a", "host-b");
        using var ds = new HistorianDataSource(config, factory);
        // Process path: host-a fails → host-b reached (then torn down mid-query via the fake).
        await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
        // At this point host-a and host-b are both in cooldown. ReadEvents will hit the
        // picker's empty-healthy-list path and return empty without calling the factory.
        factory.ConnectHistory.Clear();

        var events = await ds.ReadEventsAsync(null, DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

        events.Count.ShouldBe(0);
        factory.ConnectHistory.ShouldBeEmpty();
        // Critical assertion: host-a was NOT retried by the event silo — it's in the
        // shared cooldown from the process path's failure.
        factory.ConnectHistory.ShouldNotContain("host-a");
    }
}
}