Files
lmxopcua/tests/ZB.MOM.WW.LmxOpcUa.Historian.Aveva.Tests/HistorianClusterFailoverTests.cs

167 lines
7.9 KiB
C#

using System;
using System.Linq;
using System.Threading.Tasks;
using Shouldly;
using Xunit;
using ZB.MOM.WW.LmxOpcUa.Host.Configuration;
namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva.Tests
{
    /// <summary>
    /// End-to-end behavior of the cluster endpoint picker wired into
    /// <see cref="HistorianDataSource"/>. Verifies that a failing node is skipped on the next
    /// attempt, that the picker state is shared across process + event silos, and that the
    /// health snapshot reflects per-node failures.
    /// </summary>
    /// <remarks>
    /// Every test is <c>async Task</c> and awaits the data source instead of blocking with
    /// <c>GetAwaiter().GetResult()</c>: blocking on tasks inside xUnit test methods can starve
    /// or deadlock the runner (xUnit analyzer rule xUnit1031).
    /// </remarks>
    public class HistorianClusterFailoverTests
    {
        /// <summary>
        /// Builds an enabled, integrated-security historian configuration listing
        /// <paramref name="nodes"/> as the cluster members, with a 5 second command timeout
        /// and a 60 second failure cooldown.
        /// </summary>
        /// <param name="nodes">Server names, in the order they should be configured.</param>
        private static HistorianConfiguration ClusterConfig(params string[] nodes) => new()
        {
            Enabled = true,
            ServerNames = nodes.ToList(),
            Port = 32568,
            IntegratedSecurity = true,
            CommandTimeoutSeconds = 5,
            FailureCooldownSeconds = 60
        };

        /// <summary>
        /// A connect failure on the first node makes the picker move on to the second node,
        /// and the failure is recorded against the first node in the health snapshot.
        /// </summary>
        [Fact]
        public async Task Connect_FirstNodeFails_PicksSecond()
        {
            // host-a fails during connect; host-b connects successfully. The fake returns an
            // unconnected HistorianAccess on success, so the query phase will subsequently trip
            // HandleConnectionError on host-b — that's expected. The observable signal is that
            // the picker tried host-a first, skipped to host-b, and host-a's failure was recorded.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });

            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(2);

            // Look the node up once; Single() still fails the test if host-a is missing or duplicated.
            var hostA = snap.Nodes.Single(n => n.Name == "host-a");
            hostA.IsHealthy.ShouldBeFalse();
            hostA.FailureCount.ShouldBe(1);
            hostA.LastError.ShouldContain("A down");
        }

        /// <summary>
        /// When every node fails to connect, the read returns no results, each node was tried
        /// exactly once in order, and the snapshot reports no healthy or active node.
        /// </summary>
        [Fact]
        public async Task Connect_AllNodesFail_ReturnsEmptyResults_AndAllInCooldown()
        {
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });

            var snap = ds.GetHealthSnapshot();
            snap.ActiveProcessNode.ShouldBeNull();
            snap.HealthyNodeCount.ShouldBe(0);
            snap.TotalFailures.ShouldBe(1); // one read call failed (after all cluster tries)
            snap.LastError.ShouldContain("All 2 healthy historian candidate(s) failed");
            snap.LastError.ShouldContain("B down"); // last inner exception preserved
        }

        /// <summary>
        /// A second read issued while all nodes are in cooldown returns empty without asking
        /// the connection factory for any new connection.
        /// </summary>
        [Fact]
        public async Task Connect_SecondCall_SkipsCooledDownNode()
        {
            // After first call: host-a is in cooldown (60s), host-b is also marked failed via
            // HandleConnectionError since the fake connection doesn't support real queries.
            // Second call: both are in cooldown and the picker returns empty → the read method
            // catches the "all nodes failed" exception and returns empty without retrying connect.
            // We verify this by checking that the second call adds NOTHING to the connect history.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b"); // 60s cooldown
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
            factory.ConnectHistory.Clear();

            var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // Both nodes are in cooldown → picker returns empty → factory is not called at all.
            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBeEmpty();
        }

        /// <summary>
        /// A configuration that only sets the single <c>ServerName</c> property results in a
        /// one-node cluster that connects to that server.
        /// </summary>
        [Fact]
        public async Task Connect_SingleNodeConfig_BehavesLikeLegacy()
        {
            var factory = new FakeHistorianConnectionFactory();
            var config = new HistorianConfiguration
            {
                Enabled = true,
                ServerName = "legacy-host",
                Port = 32568,
                FailureCooldownSeconds = 0
            };
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            factory.ConnectHistory.ShouldBe(new[] { "legacy-host" });

            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(1);
            snap.Nodes.Single().Name.ShouldBe("legacy-host");
        }

        /// <summary>
        /// With every node failing, connection attempts are made in configuration order.
        /// </summary>
        [Fact]
        public async Task Connect_PickerOrderRespected()
        {
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            factory.ServerBehaviors["host-c"] = new InvalidOperationException("C down");
            var config = ClusterConfig("host-a", "host-b", "host-c");
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // Candidates are tried in configuration order.
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b", "host-c" });
        }

        /// <summary>
        /// Failures recorded by the process (raw read) path are honored by the event path:
        /// an event read issued during the cooldown makes no connection attempts.
        /// </summary>
        [Fact]
        public async Task Connect_SharedPickerAcrossProcessAndEventSilos()
        {
            // Process path tries host-a, fails, then tries host-b. host-a is in cooldown (the
            // 60s window set by ClusterConfig). The picker state is shared, so when the event
            // path subsequently asks for a node, host-a is still marked failed. The event path
            // therefore must not retry host-a.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            // Process path: host-a fails → host-b reached (then torn down mid-query via the fake).
            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // At this point host-a and host-b are both in cooldown. ReadEvents will hit the
            // picker's empty-healthy-list path and return empty without calling the factory.
            factory.ConnectHistory.Clear();

            var events = await ds.ReadEventsAsync(null, DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            events.Count.ShouldBe(0);

            // Critical assertion: an empty history means no node — host-a in particular — was
            // retried by the event silo; it's in the shared cooldown from the process path's
            // failure. (A separate "does not contain host-a" check would be vacuous here.)
            factory.ConnectHistory.ShouldBeEmpty();
        }
    }
}