using System;
using System.Linq;
using System.Threading.Tasks;

using Shouldly;

using Xunit;

using ZB.MOM.WW.LmxOpcUa.Host.Configuration;
namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva.Tests
{
    /// <summary>
    /// End-to-end behavior of the cluster endpoint picker wired into
    /// <see cref="HistorianDataSource"/>. Verifies that a failing node is skipped on the next
    /// attempt, that the picker state is shared across process + event silos, and that the
    /// health snapshot surfaces the winning node.
    /// </summary>
    public class HistorianClusterFailoverTests
    {
        /// <summary>
        /// Builds an enabled multi-node configuration with the integrated-security,
        /// 5 s command-timeout and 60 s failure-cooldown settings shared by the cluster
        /// scenarios below. Nodes are tried in the order given.
        /// </summary>
        private static HistorianConfiguration ClusterConfig(params string[] nodes) => new()
        {
            Enabled = true,
            ServerNames = nodes.ToList(),
            Port = 32568,
            IntegratedSecurity = true,
            CommandTimeoutSeconds = 5,
            FailureCooldownSeconds = 60
        };

        [Fact]
        public async Task Connect_FirstNodeFails_PicksSecond()
        {
            // host-a fails during connect; host-b connects successfully. The fake returns an
            // unconnected HistorianAccess on success, so the query phase will subsequently trip
            // HandleConnectionError on host-b — that's expected. The observable signal is that
            // the picker tried host-a first, skipped to host-b, and host-a's failure was recorded.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });

            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(2);

            // Resolve host-a once instead of repeating the Single(...) scan per assertion.
            var hostA = snap.Nodes.Single(n => n.Name == "host-a");
            hostA.IsHealthy.ShouldBeFalse();
            hostA.FailureCount.ShouldBe(1);
            hostA.LastError.ShouldContain("A down");
        }

        [Fact]
        public async Task Connect_AllNodesFail_ReturnsEmptyResults_AndAllInCooldown()
        {
            // Both nodes throw on connect: the read degrades to an empty result set and
            // every node lands in cooldown, with the aggregate error surfaced in the snapshot.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b" });

            var snap = ds.GetHealthSnapshot();
            snap.ActiveProcessNode.ShouldBeNull();
            snap.HealthyNodeCount.ShouldBe(0);
            snap.TotalFailures.ShouldBe(1); // one read call failed (after all cluster tries)
            snap.LastError.ShouldContain("All 2 healthy historian candidate(s) failed");
            snap.LastError.ShouldContain("B down"); // last inner exception preserved
        }

        [Fact]
        public async Task Connect_SecondCall_SkipsCooledDownNode()
        {
            // After the first call: host-a is in cooldown (60s), host-b is also marked failed via
            // HandleConnectionError since the fake connection doesn't support real queries.
            // Second call: both are in cooldown and the picker returns empty → the read method
            // catches the "all nodes failed" exception and returns empty without retrying connect.
            // We verify this by checking that the second call adds NOTHING to the connect history.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b"); // 60s cooldown
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            factory.ConnectHistory.Clear();
            var results = await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // Both nodes are in cooldown → picker returns empty → factory is not called at all.
            results.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBeEmpty();
        }

        [Fact]
        public async Task Connect_SingleNodeConfig_BehavesLikeLegacy()
        {
            // A lone ServerName (no ServerNames list) must behave exactly like the
            // pre-cluster code path: one node, connected directly, visible in the snapshot.
            var factory = new FakeHistorianConnectionFactory();
            var config = new HistorianConfiguration
            {
                Enabled = true,
                ServerName = "legacy-host",
                Port = 32568,
                FailureCooldownSeconds = 0
            };
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            factory.ConnectHistory.ShouldBe(new[] { "legacy-host" });
            var snap = ds.GetHealthSnapshot();
            snap.NodeCount.ShouldBe(1);
            snap.Nodes.Single().Name.ShouldBe("legacy-host");
        }

        [Fact]
        public async Task Connect_PickerOrderRespected()
        {
            // All three nodes fail, forcing the picker to walk the full candidate list.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            factory.ServerBehaviors["host-b"] = new InvalidOperationException("B down");
            factory.ServerBehaviors["host-c"] = new InvalidOperationException("C down");
            var config = ClusterConfig("host-a", "host-b", "host-c");
            using var ds = new HistorianDataSource(config, factory);

            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // Candidates are tried in configuration order.
            factory.ConnectHistory.ShouldBe(new[] { "host-a", "host-b", "host-c" });
        }

        [Fact]
        public async Task Connect_SharedPickerAcrossProcessAndEventSilos()
        {
            // Process path tries host-a, fails, then tries host-b; both end up in cooldown
            // (host-b via HandleConnectionError during the query phase, see the first test).
            // Because picker state is shared between the process and event silos, the event
            // path that runs next must see both nodes still cooling down and, critically,
            // must not retry host-a.
            var factory = new FakeHistorianConnectionFactory();
            factory.ServerBehaviors["host-a"] = new InvalidOperationException("A down");
            var config = ClusterConfig("host-a", "host-b");
            using var ds = new HistorianDataSource(config, factory);

            // Process path: host-a fails → host-b reached (then torn down mid-query via the fake).
            await ds.ReadRawAsync("Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            // At this point host-a and host-b are both in cooldown. ReadEvents will hit the
            // picker's empty-healthy-list path and return empty without calling the factory.
            factory.ConnectHistory.Clear();
            var events = await ds.ReadEventsAsync(null, DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);

            events.Count.ShouldBe(0);
            factory.ConnectHistory.ShouldBeEmpty();
            // Critical assertion: host-a was NOT retried by the event silo — it's in the
            // shared cooldown from the process path's failure.
            factory.ConnectHistory.ShouldNotContain("host-a");
        }
    }
}